1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/i18n/csrmbcs.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,529 @@ 1.4 +/* 1.5 + ********************************************************************** 1.6 + * Copyright (C) 2005-2012, International Business Machines 1.7 + * Corporation and others. All Rights Reserved. 1.8 + ********************************************************************** 1.9 + */ 1.10 + 1.11 +#include "unicode/utypes.h" 1.12 + 1.13 +#if !UCONFIG_NO_CONVERSION 1.14 + 1.15 +#include "csmatch.h" 1.16 +#include "csrmbcs.h" 1.17 + 1.18 +#include <math.h> 1.19 + 1.20 +U_NAMESPACE_BEGIN 1.21 + 1.22 +#define ARRAY_SIZE(array) (sizeof array / sizeof array[0]) 1.23 + 1.24 +#define min(x,y) (((x)<(y))?(x):(y)) 1.25 + 1.26 +static const uint16_t commonChars_sjis [] = { 1.27 +// TODO: This set of data comes from the character frequency- 1.28 +// of-occurence analysis tool. The data needs to be moved 1.29 +// into a resource and loaded from there. 1.30 +0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0, 1.31 +0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5, 1.32 +0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc, 1.33 +0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341, 1.34 +0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389, 1.35 +0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa}; 1.36 + 1.37 +static const uint16_t commonChars_euc_jp[] = { 1.38 +// TODO: This set of data comes from the character frequency- 1.39 +// of-occurence analysis tool. The data needs to be moved 1.40 +// into a resource and loaded from there. 1.41 +0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2, 1.42 +0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3, 1.43 +0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4, 1.44 +0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de, 1.45 +0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef, 1.46 +0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af, 1.47 +0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7, 1.48 +0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1, 1.49 +0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee, 1.50 +0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1}; 1.51 + 1.52 +static const uint16_t commonChars_euc_kr[] = { 1.53 +// TODO: This set of data comes from the character frequency- 1.54 +// of-occurence analysis tool. The data needs to be moved 1.55 +// into a resource and loaded from there. 1.56 +0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc, 1.57 +0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9, 1.58 +0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce, 1.59 +0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce, 1.60 +0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba, 1.61 +0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee, 1.62 +0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7, 1.63 +0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6, 1.64 +0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6, 1.65 +0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad}; 1.66 + 1.67 +static const uint16_t commonChars_big5[] = { 1.68 +// TODO: This set of data comes from the character frequency- 1.69 +// of-occurence analysis tool. The data needs to be moved 1.70 +// into a resource and loaded from there. 1.71 +0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446, 1.72 +0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3, 1.73 +0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548, 1.74 +0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8, 1.75 +0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da, 1.76 +0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3, 1.77 +0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59, 1.78 +0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c, 1.79 +0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44, 1.80 +0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f}; 1.81 + 1.82 +static const uint16_t commonChars_gb_18030[] = { 1.83 +// TODO: This set of data comes from the character frequency- 1.84 +// of-occurence analysis tool. The data needs to be moved 1.85 +// into a resource and loaded from there. 1.86 +0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac, 1.87 +0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4, 1.88 +0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4, 1.89 +0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6, 1.90 +0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6, 1.91 +0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7, 1.92 +0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7, 1.93 +0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5, 1.94 +0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2, 1.95 +0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0}; 1.96 + 1.97 +static int32_t binarySearch(const uint16_t *array, int32_t len, uint16_t value) 1.98 +{ 1.99 + int32_t start = 0, end = len-1; 1.100 + int32_t mid = (start+end)/2; 1.101 + 1.102 + while(start <= end) { 1.103 + if(array[mid] == value) { 1.104 + return mid; 1.105 + } 1.106 + 1.107 + if(array[mid] < value){ 1.108 + start = mid+1; 1.109 + } else { 1.110 + end = mid-1; 1.111 + } 1.112 + 1.113 + mid = (start+end)/2; 1.114 + } 1.115 + 1.116 + return -1; 1.117 +} 1.118 + 1.119 +IteratedChar::IteratedChar() : 1.120 +charValue(0), index(-1), nextIndex(0), error(FALSE), done(FALSE) 1.121 +{ 1.122 + // nothing else to do. 1.123 +} 1.124 + 1.125 +/*void IteratedChar::reset() 1.126 +{ 1.127 + charValue = 0; 1.128 + index = -1; 1.129 + nextIndex = 0; 1.130 + error = FALSE; 1.131 + done = FALSE; 1.132 +}*/ 1.133 + 1.134 +int32_t IteratedChar::nextByte(InputText *det) 1.135 +{ 1.136 + if (nextIndex >= det->fRawLength) { 1.137 + done = TRUE; 1.138 + 1.139 + return -1; 1.140 + } 1.141 + 1.142 + return det->fRawInput[nextIndex++]; 1.143 +} 1.144 + 1.145 +CharsetRecog_mbcs::~CharsetRecog_mbcs() 1.146 +{ 1.147 + // nothing to do. 1.148 +} 1.149 + 1.150 +int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const uint16_t commonChars[], int32_t commonCharsLen) const { 1.151 + int32_t singleByteCharCount = 0; 1.152 + int32_t doubleByteCharCount = 0; 1.153 + int32_t commonCharCount = 0; 1.154 + int32_t badCharCount = 0; 1.155 + int32_t totalCharCount = 0; 1.156 + int32_t confidence = 0; 1.157 + IteratedChar iter; 1.158 + 1.159 + while (nextChar(&iter, det)) { 1.160 + totalCharCount++; 1.161 + 1.162 + if (iter.error) { 1.163 + badCharCount++; 1.164 + } else { 1.165 + if (iter.charValue <= 0xFF) { 1.166 + singleByteCharCount++; 1.167 + } else { 1.168 + doubleByteCharCount++; 1.169 + 1.170 + if (commonChars != 0) { 1.171 + if (binarySearch(commonChars, commonCharsLen, iter.charValue) >= 0){ 1.172 + commonCharCount += 1; 1.173 + } 1.174 + } 1.175 + } 1.176 + } 1.177 + 1.178 + 1.179 + if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) { 1.180 + // Bail out early if the byte data is not matching the encoding scheme. 1.181 + // break detectBlock; 1.182 + return confidence; 1.183 + } 1.184 + } 1.185 + 1.186 + if (doubleByteCharCount <= 10 && badCharCount == 0) { 1.187 + // Not many multi-byte chars. 1.188 + if (doubleByteCharCount == 0 && totalCharCount < 10) { 1.189 + // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes. 1.190 + // We don't have enough data to have any confidence. 1.191 + // Statistical analysis of single byte non-ASCII charcters would probably help here. 1.192 + confidence = 0; 1.193 + } 1.194 + else { 1.195 + // ASCII or ISO file? It's probably not our encoding, 1.196 + // but is not incompatible with our encoding, so don't give it a zero. 1.197 + confidence = 10; 1.198 + } 1.199 + 1.200 + return confidence; 1.201 + } 1.202 + 1.203 + // 1.204 + // No match if there are too many characters that don't fit the encoding scheme. 1.205 + // (should we have zero tolerance for these?) 1.206 + // 1.207 + if (doubleByteCharCount < 20*badCharCount) { 1.208 + confidence = 0; 1.209 + 1.210 + return confidence; 1.211 + } 1.212 + 1.213 + if (commonChars == 0) { 1.214 + // We have no statistics on frequently occuring characters. 1.215 + // Assess confidence purely on having a reasonable number of 1.216 + // multi-byte characters (the more the better) 1.217 + confidence = 30 + doubleByteCharCount - 20*badCharCount; 1.218 + 1.219 + if (confidence > 100) { 1.220 + confidence = 100; 1.221 + } 1.222 + } else { 1.223 + // 1.224 + // Frequency of occurence statistics exist. 1.225 + // 1.226 + 1.227 + double maxVal = log((double)doubleByteCharCount / 4); /*(float)?*/ 1.228 + double scaleFactor = 90.0 / maxVal; 1.229 + confidence = (int32_t)(log((double)commonCharCount+1) * scaleFactor + 10.0); 1.230 + 1.231 + confidence = min(confidence, 100); 1.232 + } 1.233 + 1.234 + if (confidence < 0) { 1.235 + confidence = 0; 1.236 + } 1.237 + 1.238 + return confidence; 1.239 +} 1.240 + 1.241 +CharsetRecog_sjis::~CharsetRecog_sjis() 1.242 +{ 1.243 + // nothing to do 1.244 +} 1.245 + 1.246 +UBool CharsetRecog_sjis::nextChar(IteratedChar* it, InputText* det) const { 1.247 + it->index = it->nextIndex; 1.248 + it->error = FALSE; 1.249 + 1.250 + int32_t firstByte = it->charValue = it->nextByte(det); 1.251 + 1.252 + if (firstByte < 0) { 1.253 + return FALSE; 1.254 + } 1.255 + 1.256 + if (firstByte <= 0x7F || (firstByte > 0xA0 && firstByte <= 0xDF)) { 1.257 + return TRUE; 1.258 + } 1.259 + 1.260 + int32_t secondByte = it->nextByte(det); 1.261 + if (secondByte >= 0) { 1.262 + it->charValue = (firstByte << 8) | secondByte; 1.263 + } 1.264 + // else we'll handle the error later. 1.265 + 1.266 + if (! ((secondByte >= 0x40 && secondByte <= 0x7F) || (secondByte >= 0x80 && secondByte <= 0xFE))) { 1.267 + // Illegal second byte value. 1.268 + it->error = TRUE; 1.269 + } 1.270 + 1.271 + return TRUE; 1.272 +} 1.273 + 1.274 +UBool CharsetRecog_sjis::match(InputText* det, CharsetMatch *results) const { 1.275 + int32_t confidence = match_mbcs(det, commonChars_sjis, ARRAY_SIZE(commonChars_sjis)); 1.276 + results->set(det, this, confidence); 1.277 + return (confidence > 0); 1.278 +} 1.279 + 1.280 +const char *CharsetRecog_sjis::getName() const 1.281 +{ 1.282 + return "Shift_JIS"; 1.283 +} 1.284 + 1.285 +const char *CharsetRecog_sjis::getLanguage() const 1.286 +{ 1.287 + return "ja"; 1.288 +} 1.289 + 1.290 +CharsetRecog_euc::~CharsetRecog_euc() 1.291 +{ 1.292 + // nothing to do 1.293 +} 1.294 + 1.295 +UBool CharsetRecog_euc::nextChar(IteratedChar* it, InputText* det) const { 1.296 + int32_t firstByte = 0; 1.297 + int32_t secondByte = 0; 1.298 + int32_t thirdByte = 0; 1.299 + 1.300 + it->index = it->nextIndex; 1.301 + it->error = FALSE; 1.302 + firstByte = it->charValue = it->nextByte(det); 1.303 + 1.304 + if (firstByte < 0) { 1.305 + // Ran off the end of the input data 1.306 + return FALSE; 1.307 + } 1.308 + 1.309 + if (firstByte <= 0x8D) { 1.310 + // single byte char 1.311 + return TRUE; 1.312 + } 1.313 + 1.314 + secondByte = it->nextByte(det); 1.315 + if (secondByte >= 0) { 1.316 + it->charValue = (it->charValue << 8) | secondByte; 1.317 + } 1.318 + // else we'll handle the error later. 1.319 + 1.320 + if (firstByte >= 0xA1 && firstByte <= 0xFE) { 1.321 + // Two byte Char 1.322 + if (secondByte < 0xA1) { 1.323 + it->error = TRUE; 1.324 + } 1.325 + 1.326 + return TRUE; 1.327 + } 1.328 + 1.329 + if (firstByte == 0x8E) { 1.330 + // Code Set 2. 1.331 + // In EUC-JP, total char size is 2 bytes, only one byte of actual char value. 1.332 + // In EUC-TW, total char size is 4 bytes, three bytes contribute to char value. 1.333 + // We don't know which we've got. 1.334 + // Treat it like EUC-JP. If the data really was EUC-TW, the following two 1.335 + // bytes will look like a well formed 2 byte char. 1.336 + if (secondByte < 0xA1) { 1.337 + it->error = TRUE; 1.338 + } 1.339 + 1.340 + return TRUE; 1.341 + } 1.342 + 1.343 + if (firstByte == 0x8F) { 1.344 + // Code set 3. 1.345 + // Three byte total char size, two bytes of actual char value. 1.346 + thirdByte = it->nextByte(det); 1.347 + it->charValue = (it->charValue << 8) | thirdByte; 1.348 + 1.349 + if (thirdByte < 0xa1) { 1.350 + // Bad second byte or ran off the end of the input data with a non-ASCII first byte. 1.351 + it->error = TRUE; 1.352 + } 1.353 + } 1.354 + 1.355 + return TRUE; 1.356 + 1.357 +} 1.358 + 1.359 +CharsetRecog_euc_jp::~CharsetRecog_euc_jp() 1.360 +{ 1.361 + // nothing to do 1.362 +} 1.363 + 1.364 +const char *CharsetRecog_euc_jp::getName() const 1.365 +{ 1.366 + return "EUC-JP"; 1.367 +} 1.368 + 1.369 +const char *CharsetRecog_euc_jp::getLanguage() const 1.370 +{ 1.371 + return "ja"; 1.372 +} 1.373 + 1.374 +UBool CharsetRecog_euc_jp::match(InputText *det, CharsetMatch *results) const 1.375 +{ 1.376 + int32_t confidence = match_mbcs(det, commonChars_euc_jp, ARRAY_SIZE(commonChars_euc_jp)); 1.377 + results->set(det, this, confidence); 1.378 + return (confidence > 0); 1.379 +} 1.380 + 1.381 +CharsetRecog_euc_kr::~CharsetRecog_euc_kr() 1.382 +{ 1.383 + // nothing to do 1.384 +} 1.385 + 1.386 +const char *CharsetRecog_euc_kr::getName() const 1.387 +{ 1.388 + return "EUC-KR"; 1.389 +} 1.390 + 1.391 +const char *CharsetRecog_euc_kr::getLanguage() const 1.392 +{ 1.393 + return "ko"; 1.394 +} 1.395 + 1.396 +UBool CharsetRecog_euc_kr::match(InputText *det, CharsetMatch *results) const 1.397 +{ 1.398 + int32_t confidence = match_mbcs(det, commonChars_euc_kr, ARRAY_SIZE(commonChars_euc_kr)); 1.399 + results->set(det, this, confidence); 1.400 + return (confidence > 0); 1.401 +} 1.402 + 1.403 +CharsetRecog_big5::~CharsetRecog_big5() 1.404 +{ 1.405 + // nothing to do 1.406 +} 1.407 + 1.408 +UBool CharsetRecog_big5::nextChar(IteratedChar* it, InputText* det) const 1.409 +{ 1.410 + int32_t firstByte; 1.411 + 1.412 + it->index = it->nextIndex; 1.413 + it->error = FALSE; 1.414 + firstByte = it->charValue = it->nextByte(det); 1.415 + 1.416 + if (firstByte < 0) { 1.417 + return FALSE; 1.418 + } 1.419 + 1.420 + if (firstByte <= 0x7F || firstByte == 0xFF) { 1.421 + // single byte character. 1.422 + return TRUE; 1.423 + } 1.424 + 1.425 + int32_t secondByte = it->nextByte(det); 1.426 + if (secondByte >= 0) { 1.427 + it->charValue = (it->charValue << 8) | secondByte; 1.428 + } 1.429 + // else we'll handle the error later. 1.430 + 1.431 + if (secondByte < 0x40 || secondByte == 0x7F || secondByte == 0xFF) { 1.432 + it->error = TRUE; 1.433 + } 1.434 + 1.435 + return TRUE; 1.436 +} 1.437 + 1.438 +const char *CharsetRecog_big5::getName() const 1.439 +{ 1.440 + return "Big5"; 1.441 +} 1.442 + 1.443 +const char *CharsetRecog_big5::getLanguage() const 1.444 +{ 1.445 + return "zh"; 1.446 +} 1.447 + 1.448 +UBool CharsetRecog_big5::match(InputText *det, CharsetMatch *results) const 1.449 +{ 1.450 + int32_t confidence = match_mbcs(det, commonChars_big5, ARRAY_SIZE(commonChars_big5)); 1.451 + results->set(det, this, confidence); 1.452 + return (confidence > 0); 1.453 +} 1.454 + 1.455 +CharsetRecog_gb_18030::~CharsetRecog_gb_18030() 1.456 +{ 1.457 + // nothing to do 1.458 +} 1.459 + 1.460 +UBool CharsetRecog_gb_18030::nextChar(IteratedChar* it, InputText* det) const { 1.461 + int32_t firstByte = 0; 1.462 + int32_t secondByte = 0; 1.463 + int32_t thirdByte = 0; 1.464 + int32_t fourthByte = 0; 1.465 + 1.466 + it->index = it->nextIndex; 1.467 + it->error = FALSE; 1.468 + firstByte = it->charValue = it->nextByte(det); 1.469 + 1.470 + if (firstByte < 0) { 1.471 + // Ran off the end of the input data 1.472 + return FALSE; 1.473 + } 1.474 + 1.475 + if (firstByte <= 0x80) { 1.476 + // single byte char 1.477 + return TRUE; 1.478 + } 1.479 + 1.480 + secondByte = it->nextByte(det); 1.481 + if (secondByte >= 0) { 1.482 + it->charValue = (it->charValue << 8) | secondByte; 1.483 + } 1.484 + // else we'll handle the error later. 1.485 + 1.486 + if (firstByte >= 0x81 && firstByte <= 0xFE) { 1.487 + // Two byte Char 1.488 + if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >=80 && secondByte <= 0xFE)) { 1.489 + return TRUE; 1.490 + } 1.491 + 1.492 + // Four byte char 1.493 + if (secondByte >= 0x30 && secondByte <= 0x39) { 1.494 + thirdByte = it->nextByte(det); 1.495 + 1.496 + if (thirdByte >= 0x81 && thirdByte <= 0xFE) { 1.497 + fourthByte = it->nextByte(det); 1.498 + 1.499 + if (fourthByte >= 0x30 && fourthByte <= 0x39) { 1.500 + it->charValue = (it->charValue << 16) | (thirdByte << 8) | fourthByte; 1.501 + 1.502 + return TRUE; 1.503 + } 1.504 + } 1.505 + } 1.506 + 1.507 + // Something wasn't valid, or we ran out of data (-1). 1.508 + it->error = TRUE; 1.509 + } 1.510 + 1.511 + return TRUE; 1.512 +} 1.513 + 1.514 +const char *CharsetRecog_gb_18030::getName() const 1.515 +{ 1.516 + return "GB18030"; 1.517 +} 1.518 + 1.519 +const char *CharsetRecog_gb_18030::getLanguage() const 1.520 +{ 1.521 + return "zh"; 1.522 +} 1.523 + 1.524 +UBool CharsetRecog_gb_18030::match(InputText *det, CharsetMatch *results) const 1.525 +{ 1.526 + int32_t confidence = match_mbcs(det, commonChars_gb_18030, ARRAY_SIZE(commonChars_gb_18030)); 1.527 + results->set(det, this, confidence); 1.528 + return (confidence > 0); 1.529 +} 1.530 + 1.531 +U_NAMESPACE_END 1.532 +#endif