intl/icu/source/i18n/csrmbcs.cpp

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

michael@0 1 /*
michael@0 2 **********************************************************************
michael@0 3 * Copyright (C) 2005-2012, International Business Machines
michael@0 4 * Corporation and others. All Rights Reserved.
michael@0 5 **********************************************************************
michael@0 6 */
michael@0 7
michael@0 8 #include "unicode/utypes.h"
michael@0 9
michael@0 10 #if !UCONFIG_NO_CONVERSION
michael@0 11
michael@0 12 #include "csmatch.h"
michael@0 13 #include "csrmbcs.h"
michael@0 14
michael@0 15 #include <math.h>
michael@0 16
michael@0 17 U_NAMESPACE_BEGIN
michael@0 18
// Element count of a true array (never pass a pointer). The argument is
// parenthesized so any array-valued expression is accepted, and the result is
// converted to int32_t because every use in this file supplies an int32_t
// length parameter.
#define ARRAY_SIZE(array) ((int32_t)(sizeof(array) / sizeof((array)[0])))

// Smaller of two values. This is a function-like macro, so each argument may
// be evaluated twice: do not pass expressions with side effects.
#define min(x,y) (((x)<(y))?(x):(y))
michael@0 22
// Double-byte values that occur frequently in Shift_JIS text. The entries are
// kept in ascending order because they are looked up with binarySearch().
// TODO: This data comes from the character frequency-of-occurrence analysis
//       tool. It needs to be moved into a resource and loaded from there.
static const uint16_t commonChars_sjis[] = {
    0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
    0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
    0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
    0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
    0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
    0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa
};
michael@0 33
// Double-byte values that occur frequently in EUC-JP text. The entries are
// kept in ascending order because they are looked up with binarySearch().
// TODO: This data comes from the character frequency-of-occurrence analysis
//       tool. It needs to be moved into a resource and loaded from there.
static const uint16_t commonChars_euc_jp[] = {
    0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
    0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
    0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
    0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
    0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
    0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
    0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
    0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
    0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
    0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1
};
michael@0 48
// Double-byte values that occur frequently in EUC-KR text. The entries are
// kept in ascending order because they are looked up with binarySearch().
// TODO: This data comes from the character frequency-of-occurrence analysis
//       tool. It needs to be moved into a resource and loaded from there.
static const uint16_t commonChars_euc_kr[] = {
    0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
    0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
    0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
    0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
    0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
    0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
    0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
    0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
    0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
    0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad
};
michael@0 63
// Double-byte values that occur frequently in Big5 text. The entries are
// kept in ascending order because they are looked up with binarySearch().
// TODO: This data comes from the character frequency-of-occurrence analysis
//       tool. It needs to be moved into a resource and loaded from there.
static const uint16_t commonChars_big5[] = {
    0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
    0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
    0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
    0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
    0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
    0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
    0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
    0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
    0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
    0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f
};
michael@0 78
// Double-byte values that occur frequently in GB18030 text. The entries are
// kept in ascending order because they are looked up with binarySearch().
// TODO: This data comes from the character frequency-of-occurrence analysis
//       tool. It needs to be moved into a resource and loaded from there.
static const uint16_t commonChars_gb_18030[] = {
    0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
    0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
    0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
    0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
    0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
    0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
    0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
    0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
    0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
    0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0
};
michael@0 93
// Looks up `value` among the first `len` entries of `array`, which must be
// sorted in ascending order.
// Returns the index of a matching entry, or -1 when there is none.
static int32_t binarySearch(const uint16_t *array, int32_t len, uint16_t value)
{
    int32_t lo = 0;
    int32_t hi = len - 1;

    // Invariant: if `value` is present, it lies within [lo, hi].
    while (lo <= hi) {
        const int32_t probe = (lo + hi) / 2;
        const uint16_t candidate = array[probe];

        if (candidate == value) {
            return probe;
        }

        if (candidate < value) {
            lo = probe + 1;
        } else {
            hi = probe - 1;
        }
    }

    return -1;
}
michael@0 115
michael@0 116 IteratedChar::IteratedChar() :
michael@0 117 charValue(0), index(-1), nextIndex(0), error(FALSE), done(FALSE)
michael@0 118 {
michael@0 119 // nothing else to do.
michael@0 120 }
michael@0 121
michael@0 122 /*void IteratedChar::reset()
michael@0 123 {
michael@0 124 charValue = 0;
michael@0 125 index = -1;
michael@0 126 nextIndex = 0;
michael@0 127 error = FALSE;
michael@0 128 done = FALSE;
michael@0 129 }*/
michael@0 130
michael@0 131 int32_t IteratedChar::nextByte(InputText *det)
michael@0 132 {
michael@0 133 if (nextIndex >= det->fRawLength) {
michael@0 134 done = TRUE;
michael@0 135
michael@0 136 return -1;
michael@0 137 }
michael@0 138
michael@0 139 return det->fRawInput[nextIndex++];
michael@0 140 }
michael@0 141
michael@0 142 CharsetRecog_mbcs::~CharsetRecog_mbcs()
michael@0 143 {
michael@0 144 // nothing to do.
michael@0 145 }
michael@0 146
michael@0 147 int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const uint16_t commonChars[], int32_t commonCharsLen) const {
michael@0 148 int32_t singleByteCharCount = 0;
michael@0 149 int32_t doubleByteCharCount = 0;
michael@0 150 int32_t commonCharCount = 0;
michael@0 151 int32_t badCharCount = 0;
michael@0 152 int32_t totalCharCount = 0;
michael@0 153 int32_t confidence = 0;
michael@0 154 IteratedChar iter;
michael@0 155
michael@0 156 while (nextChar(&iter, det)) {
michael@0 157 totalCharCount++;
michael@0 158
michael@0 159 if (iter.error) {
michael@0 160 badCharCount++;
michael@0 161 } else {
michael@0 162 if (iter.charValue <= 0xFF) {
michael@0 163 singleByteCharCount++;
michael@0 164 } else {
michael@0 165 doubleByteCharCount++;
michael@0 166
michael@0 167 if (commonChars != 0) {
michael@0 168 if (binarySearch(commonChars, commonCharsLen, iter.charValue) >= 0){
michael@0 169 commonCharCount += 1;
michael@0 170 }
michael@0 171 }
michael@0 172 }
michael@0 173 }
michael@0 174
michael@0 175
michael@0 176 if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) {
michael@0 177 // Bail out early if the byte data is not matching the encoding scheme.
michael@0 178 // break detectBlock;
michael@0 179 return confidence;
michael@0 180 }
michael@0 181 }
michael@0 182
michael@0 183 if (doubleByteCharCount <= 10 && badCharCount == 0) {
michael@0 184 // Not many multi-byte chars.
michael@0 185 if (doubleByteCharCount == 0 && totalCharCount < 10) {
michael@0 186 // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.
michael@0 187 // We don't have enough data to have any confidence.
michael@0 188 // Statistical analysis of single byte non-ASCII charcters would probably help here.
michael@0 189 confidence = 0;
michael@0 190 }
michael@0 191 else {
michael@0 192 // ASCII or ISO file? It's probably not our encoding,
michael@0 193 // but is not incompatible with our encoding, so don't give it a zero.
michael@0 194 confidence = 10;
michael@0 195 }
michael@0 196
michael@0 197 return confidence;
michael@0 198 }
michael@0 199
michael@0 200 //
michael@0 201 // No match if there are too many characters that don't fit the encoding scheme.
michael@0 202 // (should we have zero tolerance for these?)
michael@0 203 //
michael@0 204 if (doubleByteCharCount < 20*badCharCount) {
michael@0 205 confidence = 0;
michael@0 206
michael@0 207 return confidence;
michael@0 208 }
michael@0 209
michael@0 210 if (commonChars == 0) {
michael@0 211 // We have no statistics on frequently occuring characters.
michael@0 212 // Assess confidence purely on having a reasonable number of
michael@0 213 // multi-byte characters (the more the better)
michael@0 214 confidence = 30 + doubleByteCharCount - 20*badCharCount;
michael@0 215
michael@0 216 if (confidence > 100) {
michael@0 217 confidence = 100;
michael@0 218 }
michael@0 219 } else {
michael@0 220 //
michael@0 221 // Frequency of occurence statistics exist.
michael@0 222 //
michael@0 223
michael@0 224 double maxVal = log((double)doubleByteCharCount / 4); /*(float)?*/
michael@0 225 double scaleFactor = 90.0 / maxVal;
michael@0 226 confidence = (int32_t)(log((double)commonCharCount+1) * scaleFactor + 10.0);
michael@0 227
michael@0 228 confidence = min(confidence, 100);
michael@0 229 }
michael@0 230
michael@0 231 if (confidence < 0) {
michael@0 232 confidence = 0;
michael@0 233 }
michael@0 234
michael@0 235 return confidence;
michael@0 236 }
michael@0 237
michael@0 238 CharsetRecog_sjis::~CharsetRecog_sjis()
michael@0 239 {
michael@0 240 // nothing to do
michael@0 241 }
michael@0 242
michael@0 243 UBool CharsetRecog_sjis::nextChar(IteratedChar* it, InputText* det) const {
michael@0 244 it->index = it->nextIndex;
michael@0 245 it->error = FALSE;
michael@0 246
michael@0 247 int32_t firstByte = it->charValue = it->nextByte(det);
michael@0 248
michael@0 249 if (firstByte < 0) {
michael@0 250 return FALSE;
michael@0 251 }
michael@0 252
michael@0 253 if (firstByte <= 0x7F || (firstByte > 0xA0 && firstByte <= 0xDF)) {
michael@0 254 return TRUE;
michael@0 255 }
michael@0 256
michael@0 257 int32_t secondByte = it->nextByte(det);
michael@0 258 if (secondByte >= 0) {
michael@0 259 it->charValue = (firstByte << 8) | secondByte;
michael@0 260 }
michael@0 261 // else we'll handle the error later.
michael@0 262
michael@0 263 if (! ((secondByte >= 0x40 && secondByte <= 0x7F) || (secondByte >= 0x80 && secondByte <= 0xFE))) {
michael@0 264 // Illegal second byte value.
michael@0 265 it->error = TRUE;
michael@0 266 }
michael@0 267
michael@0 268 return TRUE;
michael@0 269 }
michael@0 270
michael@0 271 UBool CharsetRecog_sjis::match(InputText* det, CharsetMatch *results) const {
michael@0 272 int32_t confidence = match_mbcs(det, commonChars_sjis, ARRAY_SIZE(commonChars_sjis));
michael@0 273 results->set(det, this, confidence);
michael@0 274 return (confidence > 0);
michael@0 275 }
michael@0 276
michael@0 277 const char *CharsetRecog_sjis::getName() const
michael@0 278 {
michael@0 279 return "Shift_JIS";
michael@0 280 }
michael@0 281
michael@0 282 const char *CharsetRecog_sjis::getLanguage() const
michael@0 283 {
michael@0 284 return "ja";
michael@0 285 }
michael@0 286
michael@0 287 CharsetRecog_euc::~CharsetRecog_euc()
michael@0 288 {
michael@0 289 // nothing to do
michael@0 290 }
michael@0 291
michael@0 292 UBool CharsetRecog_euc::nextChar(IteratedChar* it, InputText* det) const {
michael@0 293 int32_t firstByte = 0;
michael@0 294 int32_t secondByte = 0;
michael@0 295 int32_t thirdByte = 0;
michael@0 296
michael@0 297 it->index = it->nextIndex;
michael@0 298 it->error = FALSE;
michael@0 299 firstByte = it->charValue = it->nextByte(det);
michael@0 300
michael@0 301 if (firstByte < 0) {
michael@0 302 // Ran off the end of the input data
michael@0 303 return FALSE;
michael@0 304 }
michael@0 305
michael@0 306 if (firstByte <= 0x8D) {
michael@0 307 // single byte char
michael@0 308 return TRUE;
michael@0 309 }
michael@0 310
michael@0 311 secondByte = it->nextByte(det);
michael@0 312 if (secondByte >= 0) {
michael@0 313 it->charValue = (it->charValue << 8) | secondByte;
michael@0 314 }
michael@0 315 // else we'll handle the error later.
michael@0 316
michael@0 317 if (firstByte >= 0xA1 && firstByte <= 0xFE) {
michael@0 318 // Two byte Char
michael@0 319 if (secondByte < 0xA1) {
michael@0 320 it->error = TRUE;
michael@0 321 }
michael@0 322
michael@0 323 return TRUE;
michael@0 324 }
michael@0 325
michael@0 326 if (firstByte == 0x8E) {
michael@0 327 // Code Set 2.
michael@0 328 // In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
michael@0 329 // In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
michael@0 330 // We don't know which we've got.
michael@0 331 // Treat it like EUC-JP. If the data really was EUC-TW, the following two
michael@0 332 // bytes will look like a well formed 2 byte char.
michael@0 333 if (secondByte < 0xA1) {
michael@0 334 it->error = TRUE;
michael@0 335 }
michael@0 336
michael@0 337 return TRUE;
michael@0 338 }
michael@0 339
michael@0 340 if (firstByte == 0x8F) {
michael@0 341 // Code set 3.
michael@0 342 // Three byte total char size, two bytes of actual char value.
michael@0 343 thirdByte = it->nextByte(det);
michael@0 344 it->charValue = (it->charValue << 8) | thirdByte;
michael@0 345
michael@0 346 if (thirdByte < 0xa1) {
michael@0 347 // Bad second byte or ran off the end of the input data with a non-ASCII first byte.
michael@0 348 it->error = TRUE;
michael@0 349 }
michael@0 350 }
michael@0 351
michael@0 352 return TRUE;
michael@0 353
michael@0 354 }
michael@0 355
michael@0 356 CharsetRecog_euc_jp::~CharsetRecog_euc_jp()
michael@0 357 {
michael@0 358 // nothing to do
michael@0 359 }
michael@0 360
michael@0 361 const char *CharsetRecog_euc_jp::getName() const
michael@0 362 {
michael@0 363 return "EUC-JP";
michael@0 364 }
michael@0 365
michael@0 366 const char *CharsetRecog_euc_jp::getLanguage() const
michael@0 367 {
michael@0 368 return "ja";
michael@0 369 }
michael@0 370
michael@0 371 UBool CharsetRecog_euc_jp::match(InputText *det, CharsetMatch *results) const
michael@0 372 {
michael@0 373 int32_t confidence = match_mbcs(det, commonChars_euc_jp, ARRAY_SIZE(commonChars_euc_jp));
michael@0 374 results->set(det, this, confidence);
michael@0 375 return (confidence > 0);
michael@0 376 }
michael@0 377
michael@0 378 CharsetRecog_euc_kr::~CharsetRecog_euc_kr()
michael@0 379 {
michael@0 380 // nothing to do
michael@0 381 }
michael@0 382
michael@0 383 const char *CharsetRecog_euc_kr::getName() const
michael@0 384 {
michael@0 385 return "EUC-KR";
michael@0 386 }
michael@0 387
michael@0 388 const char *CharsetRecog_euc_kr::getLanguage() const
michael@0 389 {
michael@0 390 return "ko";
michael@0 391 }
michael@0 392
michael@0 393 UBool CharsetRecog_euc_kr::match(InputText *det, CharsetMatch *results) const
michael@0 394 {
michael@0 395 int32_t confidence = match_mbcs(det, commonChars_euc_kr, ARRAY_SIZE(commonChars_euc_kr));
michael@0 396 results->set(det, this, confidence);
michael@0 397 return (confidence > 0);
michael@0 398 }
michael@0 399
michael@0 400 CharsetRecog_big5::~CharsetRecog_big5()
michael@0 401 {
michael@0 402 // nothing to do
michael@0 403 }
michael@0 404
michael@0 405 UBool CharsetRecog_big5::nextChar(IteratedChar* it, InputText* det) const
michael@0 406 {
michael@0 407 int32_t firstByte;
michael@0 408
michael@0 409 it->index = it->nextIndex;
michael@0 410 it->error = FALSE;
michael@0 411 firstByte = it->charValue = it->nextByte(det);
michael@0 412
michael@0 413 if (firstByte < 0) {
michael@0 414 return FALSE;
michael@0 415 }
michael@0 416
michael@0 417 if (firstByte <= 0x7F || firstByte == 0xFF) {
michael@0 418 // single byte character.
michael@0 419 return TRUE;
michael@0 420 }
michael@0 421
michael@0 422 int32_t secondByte = it->nextByte(det);
michael@0 423 if (secondByte >= 0) {
michael@0 424 it->charValue = (it->charValue << 8) | secondByte;
michael@0 425 }
michael@0 426 // else we'll handle the error later.
michael@0 427
michael@0 428 if (secondByte < 0x40 || secondByte == 0x7F || secondByte == 0xFF) {
michael@0 429 it->error = TRUE;
michael@0 430 }
michael@0 431
michael@0 432 return TRUE;
michael@0 433 }
michael@0 434
michael@0 435 const char *CharsetRecog_big5::getName() const
michael@0 436 {
michael@0 437 return "Big5";
michael@0 438 }
michael@0 439
michael@0 440 const char *CharsetRecog_big5::getLanguage() const
michael@0 441 {
michael@0 442 return "zh";
michael@0 443 }
michael@0 444
michael@0 445 UBool CharsetRecog_big5::match(InputText *det, CharsetMatch *results) const
michael@0 446 {
michael@0 447 int32_t confidence = match_mbcs(det, commonChars_big5, ARRAY_SIZE(commonChars_big5));
michael@0 448 results->set(det, this, confidence);
michael@0 449 return (confidence > 0);
michael@0 450 }
michael@0 451
michael@0 452 CharsetRecog_gb_18030::~CharsetRecog_gb_18030()
michael@0 453 {
michael@0 454 // nothing to do
michael@0 455 }
michael@0 456
michael@0 457 UBool CharsetRecog_gb_18030::nextChar(IteratedChar* it, InputText* det) const {
michael@0 458 int32_t firstByte = 0;
michael@0 459 int32_t secondByte = 0;
michael@0 460 int32_t thirdByte = 0;
michael@0 461 int32_t fourthByte = 0;
michael@0 462
michael@0 463 it->index = it->nextIndex;
michael@0 464 it->error = FALSE;
michael@0 465 firstByte = it->charValue = it->nextByte(det);
michael@0 466
michael@0 467 if (firstByte < 0) {
michael@0 468 // Ran off the end of the input data
michael@0 469 return FALSE;
michael@0 470 }
michael@0 471
michael@0 472 if (firstByte <= 0x80) {
michael@0 473 // single byte char
michael@0 474 return TRUE;
michael@0 475 }
michael@0 476
michael@0 477 secondByte = it->nextByte(det);
michael@0 478 if (secondByte >= 0) {
michael@0 479 it->charValue = (it->charValue << 8) | secondByte;
michael@0 480 }
michael@0 481 // else we'll handle the error later.
michael@0 482
michael@0 483 if (firstByte >= 0x81 && firstByte <= 0xFE) {
michael@0 484 // Two byte Char
michael@0 485 if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >=80 && secondByte <= 0xFE)) {
michael@0 486 return TRUE;
michael@0 487 }
michael@0 488
michael@0 489 // Four byte char
michael@0 490 if (secondByte >= 0x30 && secondByte <= 0x39) {
michael@0 491 thirdByte = it->nextByte(det);
michael@0 492
michael@0 493 if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
michael@0 494 fourthByte = it->nextByte(det);
michael@0 495
michael@0 496 if (fourthByte >= 0x30 && fourthByte <= 0x39) {
michael@0 497 it->charValue = (it->charValue << 16) | (thirdByte << 8) | fourthByte;
michael@0 498
michael@0 499 return TRUE;
michael@0 500 }
michael@0 501 }
michael@0 502 }
michael@0 503
michael@0 504 // Something wasn't valid, or we ran out of data (-1).
michael@0 505 it->error = TRUE;
michael@0 506 }
michael@0 507
michael@0 508 return TRUE;
michael@0 509 }
michael@0 510
michael@0 511 const char *CharsetRecog_gb_18030::getName() const
michael@0 512 {
michael@0 513 return "GB18030";
michael@0 514 }
michael@0 515
michael@0 516 const char *CharsetRecog_gb_18030::getLanguage() const
michael@0 517 {
michael@0 518 return "zh";
michael@0 519 }
michael@0 520
michael@0 521 UBool CharsetRecog_gb_18030::match(InputText *det, CharsetMatch *results) const
michael@0 522 {
michael@0 523 int32_t confidence = match_mbcs(det, commonChars_gb_18030, ARRAY_SIZE(commonChars_gb_18030));
michael@0 524 results->set(det, this, confidence);
michael@0 525 return (confidence > 0);
michael@0 526 }
michael@0 527
michael@0 528 U_NAMESPACE_END
michael@0 529 #endif

mercurial