intl/icu/source/i18n/csrmbcs.cpp

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

     1 /*
     2  **********************************************************************
     3  *   Copyright (C) 2005-2012, International Business Machines
     4  *   Corporation and others.  All Rights Reserved.
     5  **********************************************************************
     6  */
     8 #include "unicode/utypes.h"
    10 #if !UCONFIG_NO_CONVERSION
    12 #include "csmatch.h"
    13 #include "csrmbcs.h"
    15 #include <math.h>
    17 U_NAMESPACE_BEGIN
    // Element count of a true array (not a pointer). The argument is fully
    // parenthesized so that any array-valued expression can be passed safely.
    #define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))

    // Smaller of two values. This is a function-like macro, so the selected
    // argument is evaluated twice; avoid arguments with side effects.
    #define min(x,y) (((x)<(y))?(x):(y))
    23 static const uint16_t commonChars_sjis [] = {
    24 // TODO:  This set of data comes from the character frequency-
    25 //        of-occurrence analysis tool.  The data needs to be moved
    26 //        into a resource and loaded from there.
       // NOTE:  The values are listed in ascending order.  They are passed to
       //        match_mbcs(), which looks them up with binarySearch(), so any
       //        new entry must keep that order.
    27 0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
    28 0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
    29 0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
    30 0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
    31 0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
    32 0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa};
    34 static const uint16_t commonChars_euc_jp[] = {
    35 // TODO:  This set of data comes from the character frequency-
    36 //        of-occurrence analysis tool.  The data needs to be moved
    37 //        into a resource and loaded from there.
       // NOTE:  The values are listed in ascending order.  They are passed to
       //        match_mbcs(), which looks them up with binarySearch(), so any
       //        new entry must keep that order.
    38 0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
    39 0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
    40 0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
    41 0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
    42 0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
    43 0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
    44 0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
    45 0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
    46 0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
    47 0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1};
    49 static const uint16_t commonChars_euc_kr[] = {
    50 // TODO:  This set of data comes from the character frequency-
    51 //        of-occurrence analysis tool.  The data needs to be moved
    52 //        into a resource and loaded from there.
       // NOTE:  The values are listed in ascending order.  They are passed to
       //        match_mbcs(), which looks them up with binarySearch(), so any
       //        new entry must keep that order.
    53 0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
    54 0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
    55 0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
    56 0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
    57 0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
    58 0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
    59 0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
    60 0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
    61 0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
    62 0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad};
    64 static const uint16_t commonChars_big5[] = {
    65 // TODO:  This set of data comes from the character frequency-
    66 //        of-occurrence analysis tool.  The data needs to be moved
    67 //        into a resource and loaded from there.
       // NOTE:  The values are listed in ascending order.  They are passed to
       //        match_mbcs(), which looks them up with binarySearch(), so any
       //        new entry must keep that order.
    68 0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
    69 0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
    70 0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
    71 0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
    72 0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
    73 0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
    74 0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
    75 0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
    76 0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
    77 0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f};
    79 static const uint16_t commonChars_gb_18030[] = {
    80 // TODO:  This set of data comes from the character frequency-
    81 //        of-occurrence analysis tool.  The data needs to be moved
    82 //        into a resource and loaded from there.
       // NOTE:  The values are listed in ascending order.  They are passed to
       //        match_mbcs(), which looks them up with binarySearch(), so any
       //        new entry must keep that order.
    83 0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
    84 0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
    85 0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
    86 0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
    87 0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
    88 0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
    89 0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
    90 0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
    91 0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
    92 0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0};
    // Binary search for `value` in `array`, which must hold `len` entries in
    // ascending order.
    //
    // Returns the index of a matching entry, or -1 when `value` is absent
    // (including when `len` is 0).
    static int32_t binarySearch(const uint16_t *array, int32_t len, uint16_t value)
    {
        int32_t start = 0;
        int32_t end = len - 1;

        while (start <= end) {
            // Written as start + (end - start) / 2 rather than (start + end) / 2
            // so the midpoint cannot overflow int32_t for very large tables.
            const int32_t mid = start + (end - start) / 2;
            const uint16_t probe = array[mid];

            if (probe == value) {
                return mid;
            }

            if (probe < value) {
                start = mid + 1;
            } else {
                end = mid - 1;
            }
        }

        return -1;
    }
   116 IteratedChar::IteratedChar() : 
   117 charValue(0), index(-1), nextIndex(0), error(FALSE), done(FALSE)
   118 {
   119     // nothing else to do.
   120 }
   122 /*void IteratedChar::reset()
   123 {
   124     charValue = 0;
   125     index     = -1;
   126     nextIndex = 0;
   127     error     = FALSE;
   128     done      = FALSE;
   129 }*/
   131 int32_t IteratedChar::nextByte(InputText *det)
   132 {
   133     if (nextIndex >= det->fRawLength) {
   134         done = TRUE;
   136         return -1;
   137     }
   139     return det->fRawInput[nextIndex++];
   140 }
   142 CharsetRecog_mbcs::~CharsetRecog_mbcs()
   143 {
   144     // nothing to do.
   145 }
   147 int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const uint16_t commonChars[], int32_t commonCharsLen) const {
   148     int32_t singleByteCharCount = 0;
   149     int32_t doubleByteCharCount = 0;
   150     int32_t commonCharCount     = 0;
   151     int32_t badCharCount        = 0;
   152     int32_t totalCharCount      = 0;
   153     int32_t confidence          = 0;
   154     IteratedChar iter;
   156     while (nextChar(&iter, det)) {
   157         totalCharCount++;
   159         if (iter.error) {
   160             badCharCount++;
   161         } else {
   162             if (iter.charValue <= 0xFF) {
   163                 singleByteCharCount++;
   164             } else {
   165                 doubleByteCharCount++;
   167                 if (commonChars != 0) {
   168                     if (binarySearch(commonChars, commonCharsLen, iter.charValue) >= 0){
   169                         commonCharCount += 1;
   170                     }
   171                 }
   172             }
   173         }
   176         if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) {
   177             // Bail out early if the byte data is not matching the encoding scheme.
   178             // break detectBlock;
   179             return confidence;
   180         }
   181     }
   183     if (doubleByteCharCount <= 10 && badCharCount == 0) {
   184         // Not many multi-byte chars.
   185         if (doubleByteCharCount == 0 && totalCharCount < 10) {
   186             // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.
   187             // We don't have enough data to have any confidence.
   188             // Statistical analysis of single byte non-ASCII charcters would probably help here.
   189             confidence = 0;
   190         }
   191         else {
   192             //   ASCII or ISO file?  It's probably not our encoding,
   193             //   but is not incompatible with our encoding, so don't give it a zero.
   194             confidence = 10;
   195         }
   197         return confidence;
   198     }
   200     //
   201     //  No match if there are too many characters that don't fit the encoding scheme.
   202     //    (should we have zero tolerance for these?)
   203     //
   204     if (doubleByteCharCount < 20*badCharCount) {
   205         confidence = 0;
   207         return confidence;
   208     }
   210     if (commonChars == 0) {
   211         // We have no statistics on frequently occuring characters.
   212         //  Assess confidence purely on having a reasonable number of
   213         //  multi-byte characters (the more the better)
   214         confidence = 30 + doubleByteCharCount - 20*badCharCount;
   216         if (confidence > 100) {
   217             confidence = 100;
   218         }
   219     } else {
   220         //
   221         // Frequency of occurence statistics exist.
   222         //
   224         double maxVal = log((double)doubleByteCharCount / 4); /*(float)?*/
   225         double scaleFactor = 90.0 / maxVal;
   226         confidence = (int32_t)(log((double)commonCharCount+1) * scaleFactor + 10.0);
   228         confidence = min(confidence, 100);
   229     }
   231     if (confidence < 0) {
   232         confidence = 0;
   233     }
   235     return confidence;
   236 }
   238 CharsetRecog_sjis::~CharsetRecog_sjis()
   239 {
   240     // nothing to do
   241 }
   243 UBool CharsetRecog_sjis::nextChar(IteratedChar* it, InputText* det) const {
   244     it->index = it->nextIndex;
   245     it->error = FALSE;
   247     int32_t firstByte = it->charValue = it->nextByte(det);
   249     if (firstByte < 0) {
   250         return FALSE;
   251     }
   253     if (firstByte <= 0x7F || (firstByte > 0xA0 && firstByte <= 0xDF)) {
   254         return TRUE;
   255     }
   257     int32_t secondByte = it->nextByte(det);
   258     if (secondByte >= 0) {
   259         it->charValue = (firstByte << 8) | secondByte;
   260     }
   261     // else we'll handle the error later.
   263     if (! ((secondByte >= 0x40 && secondByte <= 0x7F) || (secondByte >= 0x80 && secondByte <= 0xFE))) {
   264         // Illegal second byte value.
   265         it->error = TRUE;
   266     }
   268     return TRUE;
   269 }
   271 UBool CharsetRecog_sjis::match(InputText* det, CharsetMatch *results) const {
   272     int32_t confidence = match_mbcs(det, commonChars_sjis, ARRAY_SIZE(commonChars_sjis));
   273     results->set(det, this, confidence);
   274     return (confidence > 0);
   275 }
   277 const char *CharsetRecog_sjis::getName() const
   278 {
   279     return "Shift_JIS";
   280 }
   282 const char *CharsetRecog_sjis::getLanguage() const
   283 {
   284     return "ja";
   285 }
   287 CharsetRecog_euc::~CharsetRecog_euc()
   288 {
   289     // nothing to do
   290 }
   292 UBool CharsetRecog_euc::nextChar(IteratedChar* it, InputText* det) const {
   293     int32_t firstByte  = 0;
   294     int32_t secondByte = 0;
   295     int32_t thirdByte  = 0;
   297     it->index = it->nextIndex;
   298     it->error = FALSE;
   299     firstByte = it->charValue = it->nextByte(det);
   301     if (firstByte < 0) {
   302         // Ran off the end of the input data
   303         return FALSE;
   304     }
   306     if (firstByte <= 0x8D) {
   307         // single byte char
   308         return TRUE;
   309     }
   311     secondByte = it->nextByte(det);
   312     if (secondByte >= 0) {
   313         it->charValue = (it->charValue << 8) | secondByte;
   314     }
   315     // else we'll handle the error later.
   317     if (firstByte >= 0xA1 && firstByte <= 0xFE) {
   318         // Two byte Char
   319         if (secondByte < 0xA1) {
   320             it->error = TRUE;
   321         }
   323         return TRUE;
   324     }
   326     if (firstByte == 0x8E) {
   327         // Code Set 2.
   328         //   In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
   329         //   In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
   330         // We don't know which we've got.
   331         // Treat it like EUC-JP.  If the data really was EUC-TW, the following two
   332         //   bytes will look like a well formed 2 byte char.
   333         if (secondByte < 0xA1) {
   334             it->error = TRUE;
   335         }
   337         return TRUE;
   338     }
   340     if (firstByte == 0x8F) {
   341         // Code set 3.
   342         // Three byte total char size, two bytes of actual char value.
   343         thirdByte    = it->nextByte(det);
   344         it->charValue = (it->charValue << 8) | thirdByte;
   346         if (thirdByte < 0xa1) {
   347             // Bad second byte or ran off the end of the input data with a non-ASCII first byte.
   348             it->error = TRUE;
   349         }
   350     }
   352     return TRUE;
   354 }
   356 CharsetRecog_euc_jp::~CharsetRecog_euc_jp()
   357 {
   358     // nothing to do
   359 }
   361 const char *CharsetRecog_euc_jp::getName() const
   362 {
   363     return "EUC-JP";
   364 }
   366 const char *CharsetRecog_euc_jp::getLanguage() const
   367 {
   368     return "ja";
   369 }
   371 UBool CharsetRecog_euc_jp::match(InputText *det, CharsetMatch *results) const
   372 {
   373     int32_t confidence = match_mbcs(det, commonChars_euc_jp, ARRAY_SIZE(commonChars_euc_jp));
   374     results->set(det, this, confidence);
   375     return (confidence > 0);
   376 }
   378 CharsetRecog_euc_kr::~CharsetRecog_euc_kr()
   379 {
   380     // nothing to do
   381 }
   383 const char *CharsetRecog_euc_kr::getName() const
   384 {
   385     return "EUC-KR";
   386 }
   388 const char *CharsetRecog_euc_kr::getLanguage() const
   389 {
   390     return "ko";
   391 }
   393 UBool CharsetRecog_euc_kr::match(InputText *det, CharsetMatch *results) const
   394 {
   395     int32_t confidence =  match_mbcs(det, commonChars_euc_kr, ARRAY_SIZE(commonChars_euc_kr));
   396     results->set(det, this, confidence);
   397     return (confidence > 0);
   398 }
   400 CharsetRecog_big5::~CharsetRecog_big5()
   401 {
   402     // nothing to do
   403 }
   405 UBool CharsetRecog_big5::nextChar(IteratedChar* it, InputText* det) const
   406 {
   407     int32_t firstByte;
   409     it->index = it->nextIndex;
   410     it->error = FALSE;
   411     firstByte = it->charValue = it->nextByte(det);
   413     if (firstByte < 0) {
   414         return FALSE;
   415     }
   417     if (firstByte <= 0x7F || firstByte == 0xFF) {
   418         // single byte character.
   419         return TRUE;
   420     }
   422     int32_t secondByte = it->nextByte(det);
   423     if (secondByte >= 0)  {
   424         it->charValue = (it->charValue << 8) | secondByte;
   425     }
   426     // else we'll handle the error later.
   428     if (secondByte < 0x40 || secondByte == 0x7F || secondByte == 0xFF) {
   429         it->error = TRUE;
   430     }
   432     return TRUE;
   433 }
   435 const char *CharsetRecog_big5::getName() const
   436 {
   437     return "Big5";
   438 }
   440 const char *CharsetRecog_big5::getLanguage() const
   441 {
   442     return "zh";
   443 }
   445 UBool CharsetRecog_big5::match(InputText *det, CharsetMatch *results) const
   446 {
   447     int32_t confidence = match_mbcs(det, commonChars_big5, ARRAY_SIZE(commonChars_big5));
   448     results->set(det, this, confidence);
   449     return (confidence > 0);
   450 }
   452 CharsetRecog_gb_18030::~CharsetRecog_gb_18030()
   453 {
   454     // nothing to do
   455 }
   457 UBool CharsetRecog_gb_18030::nextChar(IteratedChar* it, InputText* det) const {
   458     int32_t firstByte  = 0;
   459     int32_t secondByte = 0;
   460     int32_t thirdByte  = 0;
   461     int32_t fourthByte = 0;
   463     it->index = it->nextIndex;
   464     it->error = FALSE;
   465     firstByte = it->charValue = it->nextByte(det);
   467     if (firstByte < 0) {
   468         // Ran off the end of the input data
   469         return FALSE;
   470     }
   472     if (firstByte <= 0x80) {
   473         // single byte char
   474         return TRUE;
   475     }
   477     secondByte = it->nextByte(det);
   478     if (secondByte >= 0) {
   479         it->charValue = (it->charValue << 8) | secondByte;
   480     }
   481     // else we'll handle the error later.
   483     if (firstByte >= 0x81 && firstByte <= 0xFE) {
   484         // Two byte Char
   485         if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >=80 && secondByte <= 0xFE)) {
   486             return TRUE;
   487         }
   489         // Four byte char
   490         if (secondByte >= 0x30 && secondByte <= 0x39) {
   491             thirdByte = it->nextByte(det);
   493             if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
   494                 fourthByte = it->nextByte(det);
   496                 if (fourthByte >= 0x30 && fourthByte <= 0x39) {
   497                     it->charValue = (it->charValue << 16) | (thirdByte << 8) | fourthByte;
   499                     return TRUE;
   500                 }
   501             }
   502         }
   504         // Something wasn't valid, or we ran out of data (-1).
   505         it->error = TRUE;
   506     }
   508     return TRUE;
   509 }
   511 const char *CharsetRecog_gb_18030::getName() const
   512 {
   513     return "GB18030";
   514 }
   516 const char *CharsetRecog_gb_18030::getLanguage() const
   517 {
   518     return "zh";
   519 }
   521 UBool CharsetRecog_gb_18030::match(InputText *det, CharsetMatch *results) const
   522 {
   523     int32_t confidence = match_mbcs(det, commonChars_gb_18030, ARRAY_SIZE(commonChars_gb_18030));
   524     results->set(det, this, confidence);
   525     return (confidence > 0);
   526 }
   528 U_NAMESPACE_END
   529 #endif

mercurial