intl/icu/source/i18n/csdetect.cpp

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

     1 /*
     2  **********************************************************************
     3  *   Copyright (C) 2005-2013, International Business Machines
     4  *   Corporation and others.  All Rights Reserved.
     5  **********************************************************************
     6  */
     8 #include "unicode/utypes.h"
    10 #if !UCONFIG_NO_CONVERSION
    12 #include "unicode/ucsdet.h"
    14 #include "csdetect.h"
    15 #include "csmatch.h"
    16 #include "uenumimp.h"
    18 #include "cmemory.h"
    19 #include "cstring.h"
    20 #include "umutex.h"
    21 #include "ucln_in.h"
    22 #include "uarrsort.h"
    23 #include "inputext.h"
    24 #include "csrsbcs.h"
    25 #include "csrmbcs.h"
    26 #include "csrutf8.h"
    27 #include "csrucode.h"
    28 #include "csr2022.h"
    30 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
    32 #define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type))
    33 #define DELETE_ARRAY(array) uprv_free((void *) (array))
    35 U_NAMESPACE_BEGIN
    37 struct CSRecognizerInfo : public UMemory {
    38     CSRecognizerInfo(CharsetRecognizer *recognizer, UBool isDefaultEnabled)
    39         : recognizer(recognizer), isDefaultEnabled(isDefaultEnabled) {};
    41     ~CSRecognizerInfo() {delete recognizer;};
    43     CharsetRecognizer *recognizer;
    44     UBool isDefaultEnabled;
    45 };
    47 U_NAMESPACE_END
    49 static icu::CSRecognizerInfo **fCSRecognizers = NULL;
    50 static icu::UInitOnce gCSRecognizersInitOnce;
    51 static int32_t fCSRecognizers_size = 0;
    53 U_CDECL_BEGIN
    54 static UBool U_CALLCONV csdet_cleanup(void)
    55 {
    56     U_NAMESPACE_USE
    57     if (fCSRecognizers != NULL) {
    58         for(int32_t r = 0; r < fCSRecognizers_size; r += 1) {
    59             delete fCSRecognizers[r];
    60             fCSRecognizers[r] = NULL;
    61         }
    63         DELETE_ARRAY(fCSRecognizers);
    64         fCSRecognizers = NULL;
    65         fCSRecognizers_size = 0;
    66     }
    67     gCSRecognizersInitOnce.reset();
    69     return TRUE;
    70 }
    72 static int32_t U_CALLCONV
    73 charsetMatchComparator(const void * /*context*/, const void *left, const void *right)
    74 {
    75     U_NAMESPACE_USE
    77     const CharsetMatch **csm_l = (const CharsetMatch **) left;
    78     const CharsetMatch **csm_r = (const CharsetMatch **) right;
    80     // NOTE: compare is backwards to sort from highest to lowest.
    81     return (*csm_r)->getConfidence() - (*csm_l)->getConfidence();
    82 }
    84 static void U_CALLCONV initRecognizers(UErrorCode &status) {
    85     U_NAMESPACE_USE
    86     ucln_i18n_registerCleanup(UCLN_I18N_CSDET, csdet_cleanup);
    87     CSRecognizerInfo *tempArray[] = {
    88         new CSRecognizerInfo(new CharsetRecog_UTF8(), TRUE),
    90         new CSRecognizerInfo(new CharsetRecog_UTF_16_BE(), TRUE),
    91         new CSRecognizerInfo(new CharsetRecog_UTF_16_LE(), TRUE),
    92         new CSRecognizerInfo(new CharsetRecog_UTF_32_BE(), TRUE),
    93         new CSRecognizerInfo(new CharsetRecog_UTF_32_LE(), TRUE),
    95         new CSRecognizerInfo(new CharsetRecog_8859_1(), TRUE),
    96         new CSRecognizerInfo(new CharsetRecog_8859_2(), TRUE),
    97         new CSRecognizerInfo(new CharsetRecog_8859_5_ru(), TRUE),
    98         new CSRecognizerInfo(new CharsetRecog_8859_6_ar(), TRUE),
    99         new CSRecognizerInfo(new CharsetRecog_8859_7_el(), TRUE),
   100         new CSRecognizerInfo(new CharsetRecog_8859_8_I_he(), TRUE),
   101         new CSRecognizerInfo(new CharsetRecog_8859_8_he(), TRUE),
   102         new CSRecognizerInfo(new CharsetRecog_windows_1251(), TRUE),
   103         new CSRecognizerInfo(new CharsetRecog_windows_1256(), TRUE),
   104         new CSRecognizerInfo(new CharsetRecog_KOI8_R(), TRUE),
   105         new CSRecognizerInfo(new CharsetRecog_8859_9_tr(), TRUE),
   106         new CSRecognizerInfo(new CharsetRecog_sjis(), TRUE),
   107         new CSRecognizerInfo(new CharsetRecog_gb_18030(), TRUE),
   108         new CSRecognizerInfo(new CharsetRecog_euc_jp(), TRUE),
   109         new CSRecognizerInfo(new CharsetRecog_euc_kr(), TRUE),
   110         new CSRecognizerInfo(new CharsetRecog_big5(), TRUE),
   112         new CSRecognizerInfo(new CharsetRecog_2022JP(), TRUE),
   113         new CSRecognizerInfo(new CharsetRecog_2022KR(), TRUE),
   114         new CSRecognizerInfo(new CharsetRecog_2022CN(), TRUE),
   116         new CSRecognizerInfo(new CharsetRecog_IBM424_he_rtl(), FALSE),
   117         new CSRecognizerInfo(new CharsetRecog_IBM424_he_ltr(), FALSE),
   118         new CSRecognizerInfo(new CharsetRecog_IBM420_ar_rtl(), FALSE),
   119         new CSRecognizerInfo(new CharsetRecog_IBM420_ar_ltr(), FALSE)
   120     };
   121     int32_t rCount = ARRAY_SIZE(tempArray);
   123     fCSRecognizers = NEW_ARRAY(CSRecognizerInfo *, rCount);
   125     if (fCSRecognizers == NULL) {
   126         status = U_MEMORY_ALLOCATION_ERROR;
   127     } 
   128     else {
   129         fCSRecognizers_size = rCount;
   130         for (int32_t r = 0; r < rCount; r += 1) {
   131             fCSRecognizers[r] = tempArray[r];
   132             if (fCSRecognizers[r] == NULL) {
   133                 status = U_MEMORY_ALLOCATION_ERROR;
   134             }
   135         }
   136     }
   137 }
   139 U_CDECL_END
   141 U_NAMESPACE_BEGIN
   143 void CharsetDetector::setRecognizers(UErrorCode &status)
   144 {
   145     umtx_initOnce(gCSRecognizersInitOnce, &initRecognizers, status);
   146 }
   148 CharsetDetector::CharsetDetector(UErrorCode &status)
   149   : textIn(new InputText(status)), resultArray(NULL),
   150     resultCount(0), fStripTags(FALSE), fFreshTextSet(FALSE),
   151     fEnabledRecognizers(NULL)
   152 {
   153     if (U_FAILURE(status)) {
   154         return;
   155     }
   157     setRecognizers(status);
   159     if (U_FAILURE(status)) {
   160         return;
   161     }
   163     resultArray = (CharsetMatch **)uprv_malloc(sizeof(CharsetMatch *)*fCSRecognizers_size);
   165     if (resultArray == NULL) {
   166         status = U_MEMORY_ALLOCATION_ERROR;
   167         return;
   168     }
   170     for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
   171         resultArray[i] = new CharsetMatch();
   173         if (resultArray[i] == NULL) {
   174             status = U_MEMORY_ALLOCATION_ERROR;
   175             break;
   176         }
   177     }
   178 }
   180 CharsetDetector::~CharsetDetector()
   181 {
   182     delete textIn;
   184     for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
   185         delete resultArray[i];
   186     }
   188     uprv_free(resultArray);
   190     if (fEnabledRecognizers) {
   191         uprv_free(fEnabledRecognizers);
   192     }
   193 }
   195 void CharsetDetector::setText(const char *in, int32_t len)
   196 {
   197     textIn->setText(in, len);
   198     fFreshTextSet = TRUE;
   199 }
   201 UBool CharsetDetector::setStripTagsFlag(UBool flag)
   202 {
   203     UBool temp = fStripTags;
   204     fStripTags = flag;
   205     fFreshTextSet = TRUE;
   206     return temp;
   207 }
   209 UBool CharsetDetector::getStripTagsFlag() const
   210 {
   211     return fStripTags;
   212 }
   214 void CharsetDetector::setDeclaredEncoding(const char *encoding, int32_t len) const
   215 {
   216     textIn->setDeclaredEncoding(encoding,len);
   217 }
   219 int32_t CharsetDetector::getDetectableCount()
   220 {
   221     UErrorCode status = U_ZERO_ERROR;
   223     setRecognizers(status);
   225     return fCSRecognizers_size; 
   226 }
   228 const CharsetMatch *CharsetDetector::detect(UErrorCode &status)
   229 {
   230     int32_t maxMatchesFound = 0;
   232     detectAll(maxMatchesFound, status);
   234     if(maxMatchesFound > 0) {
   235         return resultArray[0];
   236     } else {
   237         return NULL;
   238     }
   239 }
   241 const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound, UErrorCode &status)
   242 {
   243     if(!textIn->isSet()) {
   244         status = U_MISSING_RESOURCE_ERROR;// TODO:  Need to set proper status code for input text not set
   246         return NULL;
   247     } else if (fFreshTextSet) {
   248         CharsetRecognizer *csr;
   249         int32_t            i;
   251         textIn->MungeInput(fStripTags);
   253         // Iterate over all possible charsets, remember all that
   254         // give a match quality > 0.
   255         resultCount = 0;
   256         for (i = 0; i < fCSRecognizers_size; i += 1) {
   257             csr = fCSRecognizers[i]->recognizer;
   258             if (csr->match(textIn, resultArray[resultCount])) {
   259                 resultCount++;
   260             }
   261         }
   263         if (resultCount > 1) {
   264             uprv_sortArray(resultArray, resultCount, sizeof resultArray[0], charsetMatchComparator, NULL, TRUE, &status);
   265         }
   266         fFreshTextSet = FALSE;
   267     }
   269     maxMatchesFound = resultCount;
   271     return resultArray;
   272 }
   274 void CharsetDetector::setDetectableCharset(const char *encoding, UBool enabled, UErrorCode &status)
   275 {
   276     if (U_FAILURE(status)) {
   277         return;
   278     }
   280     int32_t modIdx = -1;
   281     UBool isDefaultVal = FALSE;
   282     for (int32_t i = 0; i < fCSRecognizers_size; i++) {
   283         CSRecognizerInfo *csrinfo = fCSRecognizers[i];
   284         if (uprv_strcmp(csrinfo->recognizer->getName(), encoding) == 0) {
   285             modIdx = i;
   286             isDefaultVal = (csrinfo->isDefaultEnabled == enabled);
   287             break;
   288         }
   289     }
   290     if (modIdx < 0) {
   291         // No matching encoding found
   292         status = U_ILLEGAL_ARGUMENT_ERROR;
   293         return;
   294     }
   296     if (fEnabledRecognizers == NULL && !isDefaultVal) {
   297         // Create an array storing the non default setting
   298         fEnabledRecognizers = NEW_ARRAY(UBool, fCSRecognizers_size);
   299         if (fEnabledRecognizers == NULL) {
   300             status = U_MEMORY_ALLOCATION_ERROR;
   301             return;
   302         }
   303         // Initialize the array with default info
   304         for (int32_t i = 0; i < fCSRecognizers_size; i++) {
   305             fEnabledRecognizers[i] = fCSRecognizers[i]->isDefaultEnabled;
   306         }
   307     }
   309     if (fEnabledRecognizers != NULL) {
   310         fEnabledRecognizers[modIdx] = enabled;
   311     }
   312 }
   314 /*const char *CharsetDetector::getCharsetName(int32_t index, UErrorCode &status) const
   315 {
   316     if( index > fCSRecognizers_size-1 || index < 0) {
   317         status = U_INDEX_OUTOFBOUNDS_ERROR;
   319         return 0;
   320     } else {
   321         return fCSRecognizers[index]->getName();
   322     }
   323 }*/
   325 U_NAMESPACE_END
   327 U_CDECL_BEGIN
   328 typedef struct {
   329     int32_t currIndex;
   330     UBool all;
   331     UBool *enabledRecognizers;
   332 } Context;
   336 static void U_CALLCONV
   337 enumClose(UEnumeration *en) {
   338     if(en->context != NULL) {
   339         DELETE_ARRAY(en->context);
   340     }
   342     DELETE_ARRAY(en);
   343 }
   345 static int32_t U_CALLCONV
   346 enumCount(UEnumeration *en, UErrorCode *) {
   347     if (((Context *)en->context)->all) {
   348         // ucsdet_getAllDetectableCharsets, all charset detector names
   349         return fCSRecognizers_size;
   350     }
   352     // Otherwise, ucsdet_getDetectableCharsets - only enabled ones
   353     int32_t count = 0;
   354     UBool *enabledArray = ((Context *)en->context)->enabledRecognizers;
   355     if (enabledArray != NULL) {
   356         // custom set
   357         for (int32_t i = 0; i < fCSRecognizers_size; i++) {
   358             if (enabledArray[i]) {
   359                 count++;
   360             }
   361         }
   362     } else {
   363         // default set
   364         for (int32_t i = 0; i < fCSRecognizers_size; i++) {
   365             if (fCSRecognizers[i]->isDefaultEnabled) {
   366                 count++;
   367             }
   368         }
   369     }
   370     return count;
   371 }
   373 static const char* U_CALLCONV
   374 enumNext(UEnumeration *en, int32_t *resultLength, UErrorCode * /*status*/) {
   375     const char *currName = NULL;
   377     if (((Context *)en->context)->currIndex < fCSRecognizers_size) {
   378         if (((Context *)en->context)->all) {
   379             // ucsdet_getAllDetectableCharsets, all charset detector names
   380             currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName();
   381             ((Context *)en->context)->currIndex++;
   382         } else {
   383             // ucsdet_getDetectableCharsets
   384             UBool *enabledArray = ((Context *)en->context)->enabledRecognizers;
   385             if (enabledArray != NULL) {
   386                 // custome set
   387                 while (currName == NULL && ((Context *)en->context)->currIndex < fCSRecognizers_size) {
   388                     if (enabledArray[((Context *)en->context)->currIndex]) {
   389                         currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName();
   390                     }
   391                     ((Context *)en->context)->currIndex++;
   392                 }
   393             } else {
   394                 // default set
   395                 while (currName == NULL && ((Context *)en->context)->currIndex < fCSRecognizers_size) {
   396                     if (fCSRecognizers[((Context *)en->context)->currIndex]->isDefaultEnabled) {
   397                         currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName();
   398                     }
   399                     ((Context *)en->context)->currIndex++;
   400                 }
   401             }
   402         }
   403     }
   405     if(resultLength != NULL) {
   406         *resultLength = currName == NULL ? 0 : (int32_t)uprv_strlen(currName);
   407     }
   409     return currName;
   410 }
   413 static void U_CALLCONV
   414 enumReset(UEnumeration *en, UErrorCode *) {
   415     ((Context *)en->context)->currIndex = 0;
   416 }
   418 static const UEnumeration gCSDetEnumeration = {
   419     NULL,
   420     NULL,
   421     enumClose,
   422     enumCount,
   423     uenum_unextDefault,
   424     enumNext,
   425     enumReset
   426 };
   428 U_CDECL_END
   430 U_NAMESPACE_BEGIN
   432 UEnumeration * CharsetDetector::getAllDetectableCharsets(UErrorCode &status)
   433 {
   435     /* Initialize recognized charsets. */
   436     setRecognizers(status);
   438     if(U_FAILURE(status)) {
   439         return 0;
   440     }
   442     UEnumeration *en = NEW_ARRAY(UEnumeration, 1);
   443     if (en == NULL) {
   444         status = U_MEMORY_ALLOCATION_ERROR;
   445         return 0;
   446     }
   447     memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration));
   448     en->context = (void*)NEW_ARRAY(Context, 1);
   449     if (en->context == NULL) {
   450         status = U_MEMORY_ALLOCATION_ERROR;
   451         DELETE_ARRAY(en);
   452         return 0;
   453     }
   454     uprv_memset(en->context, 0, sizeof(Context));
   455     ((Context*)en->context)->all = TRUE;
   456     return en;
   457 }
   459 UEnumeration * CharsetDetector::getDetectableCharsets(UErrorCode &status) const
   460 {
   461     if(U_FAILURE(status)) {
   462         return 0;
   463     }
   465     UEnumeration *en = NEW_ARRAY(UEnumeration, 1);
   466     if (en == NULL) {
   467         status = U_MEMORY_ALLOCATION_ERROR;
   468         return 0;
   469     }
   470     memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration));
   471     en->context = (void*)NEW_ARRAY(Context, 1);
   472     if (en->context == NULL) {
   473         status = U_MEMORY_ALLOCATION_ERROR;
   474         DELETE_ARRAY(en);
   475         return 0;
   476     }
   477     uprv_memset(en->context, 0, sizeof(Context));
   478     ((Context*)en->context)->all = FALSE;
   479     ((Context*)en->context)->enabledRecognizers = fEnabledRecognizers;
   480     return en;
   481 }
   483 U_NAMESPACE_END
   485 #endif

mercurial