intl/icu/source/i18n/csdetect.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/i18n/csdetect.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,485 @@
     1.4 +/*
     1.5 + **********************************************************************
     1.6 + *   Copyright (C) 2005-2013, International Business Machines
     1.7 + *   Corporation and others.  All Rights Reserved.
     1.8 + **********************************************************************
     1.9 + */
    1.10 +
    1.11 +#include "unicode/utypes.h"
    1.12 +
    1.13 +#if !UCONFIG_NO_CONVERSION
    1.14 +
    1.15 +#include "unicode/ucsdet.h"
    1.16 +
    1.17 +#include "csdetect.h"
    1.18 +#include "csmatch.h"
    1.19 +#include "uenumimp.h"
    1.20 +
    1.21 +#include "cmemory.h"
    1.22 +#include "cstring.h"
    1.23 +#include "umutex.h"
    1.24 +#include "ucln_in.h"
    1.25 +#include "uarrsort.h"
    1.26 +#include "inputext.h"
    1.27 +#include "csrsbcs.h"
    1.28 +#include "csrmbcs.h"
    1.29 +#include "csrutf8.h"
    1.30 +#include "csrucode.h"
    1.31 +#include "csr2022.h"
    1.32 +
// Element count of a true array (sizeof of the whole array divided by the
// sizeof of its first element). Must not be used on a pointer.
#define ARRAY_SIZE(array) (sizeof array / sizeof array[0])

// Storage for `count` objects of `type`, obtained from uprv_malloc and cast
// to `type *`; the macro itself runs no constructors. DELETE_ARRAY hands the
// storage back to uprv_free.
#define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type))
#define DELETE_ARRAY(array) uprv_free((void *) (array))
    1.37 +
    1.38 +U_NAMESPACE_BEGIN
    1.39 +
    1.40 +struct CSRecognizerInfo : public UMemory {
    1.41 +    CSRecognizerInfo(CharsetRecognizer *recognizer, UBool isDefaultEnabled)
    1.42 +        : recognizer(recognizer), isDefaultEnabled(isDefaultEnabled) {};
    1.43 +
    1.44 +    ~CSRecognizerInfo() {delete recognizer;};
    1.45 +
    1.46 +    CharsetRecognizer *recognizer;
    1.47 +    UBool isDefaultEnabled;
    1.48 +};
    1.49 +
    1.50 +U_NAMESPACE_END
    1.51 +
// Table of recognizer entries shared by every CharsetDetector in this file.
// It is filled in by initRecognizers() and released by csdet_cleanup().
static icu::CSRecognizerInfo **fCSRecognizers = NULL;
// Guard passed to umtx_initOnce() so that initRecognizers() runs once.
static icu::UInitOnce gCSRecognizersInitOnce;
// Number of entries in fCSRecognizers (0 until the table has been built).
static int32_t fCSRecognizers_size = 0;
    1.55 +
    1.56 +U_CDECL_BEGIN
    1.57 +static UBool U_CALLCONV csdet_cleanup(void)
    1.58 +{
    1.59 +    U_NAMESPACE_USE
    1.60 +    if (fCSRecognizers != NULL) {
    1.61 +        for(int32_t r = 0; r < fCSRecognizers_size; r += 1) {
    1.62 +            delete fCSRecognizers[r];
    1.63 +            fCSRecognizers[r] = NULL;
    1.64 +        }
    1.65 +
    1.66 +        DELETE_ARRAY(fCSRecognizers);
    1.67 +        fCSRecognizers = NULL;
    1.68 +        fCSRecognizers_size = 0;
    1.69 +    }
    1.70 +    gCSRecognizersInitOnce.reset();
    1.71 +
    1.72 +    return TRUE;
    1.73 +}
    1.74 +
    1.75 +static int32_t U_CALLCONV
    1.76 +charsetMatchComparator(const void * /*context*/, const void *left, const void *right)
    1.77 +{
    1.78 +    U_NAMESPACE_USE
    1.79 +
    1.80 +    const CharsetMatch **csm_l = (const CharsetMatch **) left;
    1.81 +    const CharsetMatch **csm_r = (const CharsetMatch **) right;
    1.82 +
    1.83 +    // NOTE: compare is backwards to sort from highest to lowest.
    1.84 +    return (*csm_r)->getConfidence() - (*csm_l)->getConfidence();
    1.85 +}
    1.86 +
    1.87 +static void U_CALLCONV initRecognizers(UErrorCode &status) {
    1.88 +    U_NAMESPACE_USE
    1.89 +    ucln_i18n_registerCleanup(UCLN_I18N_CSDET, csdet_cleanup);
    1.90 +    CSRecognizerInfo *tempArray[] = {
    1.91 +        new CSRecognizerInfo(new CharsetRecog_UTF8(), TRUE),
    1.92 +
    1.93 +        new CSRecognizerInfo(new CharsetRecog_UTF_16_BE(), TRUE),
    1.94 +        new CSRecognizerInfo(new CharsetRecog_UTF_16_LE(), TRUE),
    1.95 +        new CSRecognizerInfo(new CharsetRecog_UTF_32_BE(), TRUE),
    1.96 +        new CSRecognizerInfo(new CharsetRecog_UTF_32_LE(), TRUE),
    1.97 +
    1.98 +        new CSRecognizerInfo(new CharsetRecog_8859_1(), TRUE),
    1.99 +        new CSRecognizerInfo(new CharsetRecog_8859_2(), TRUE),
   1.100 +        new CSRecognizerInfo(new CharsetRecog_8859_5_ru(), TRUE),
   1.101 +        new CSRecognizerInfo(new CharsetRecog_8859_6_ar(), TRUE),
   1.102 +        new CSRecognizerInfo(new CharsetRecog_8859_7_el(), TRUE),
   1.103 +        new CSRecognizerInfo(new CharsetRecog_8859_8_I_he(), TRUE),
   1.104 +        new CSRecognizerInfo(new CharsetRecog_8859_8_he(), TRUE),
   1.105 +        new CSRecognizerInfo(new CharsetRecog_windows_1251(), TRUE),
   1.106 +        new CSRecognizerInfo(new CharsetRecog_windows_1256(), TRUE),
   1.107 +        new CSRecognizerInfo(new CharsetRecog_KOI8_R(), TRUE),
   1.108 +        new CSRecognizerInfo(new CharsetRecog_8859_9_tr(), TRUE),
   1.109 +        new CSRecognizerInfo(new CharsetRecog_sjis(), TRUE),
   1.110 +        new CSRecognizerInfo(new CharsetRecog_gb_18030(), TRUE),
   1.111 +        new CSRecognizerInfo(new CharsetRecog_euc_jp(), TRUE),
   1.112 +        new CSRecognizerInfo(new CharsetRecog_euc_kr(), TRUE),
   1.113 +        new CSRecognizerInfo(new CharsetRecog_big5(), TRUE),
   1.114 +
   1.115 +        new CSRecognizerInfo(new CharsetRecog_2022JP(), TRUE),
   1.116 +        new CSRecognizerInfo(new CharsetRecog_2022KR(), TRUE),
   1.117 +        new CSRecognizerInfo(new CharsetRecog_2022CN(), TRUE),
   1.118 +
   1.119 +        new CSRecognizerInfo(new CharsetRecog_IBM424_he_rtl(), FALSE),
   1.120 +        new CSRecognizerInfo(new CharsetRecog_IBM424_he_ltr(), FALSE),
   1.121 +        new CSRecognizerInfo(new CharsetRecog_IBM420_ar_rtl(), FALSE),
   1.122 +        new CSRecognizerInfo(new CharsetRecog_IBM420_ar_ltr(), FALSE)
   1.123 +    };
   1.124 +    int32_t rCount = ARRAY_SIZE(tempArray);
   1.125 +
   1.126 +    fCSRecognizers = NEW_ARRAY(CSRecognizerInfo *, rCount);
   1.127 +
   1.128 +    if (fCSRecognizers == NULL) {
   1.129 +        status = U_MEMORY_ALLOCATION_ERROR;
   1.130 +    } 
   1.131 +    else {
   1.132 +        fCSRecognizers_size = rCount;
   1.133 +        for (int32_t r = 0; r < rCount; r += 1) {
   1.134 +            fCSRecognizers[r] = tempArray[r];
   1.135 +            if (fCSRecognizers[r] == NULL) {
   1.136 +                status = U_MEMORY_ALLOCATION_ERROR;
   1.137 +            }
   1.138 +        }
   1.139 +    }
   1.140 +}
   1.141 +
   1.142 +U_CDECL_END
   1.143 +
   1.144 +U_NAMESPACE_BEGIN
   1.145 +
   1.146 +void CharsetDetector::setRecognizers(UErrorCode &status)
   1.147 +{
   1.148 +    umtx_initOnce(gCSRecognizersInitOnce, &initRecognizers, status);
   1.149 +}
   1.150 +
   1.151 +CharsetDetector::CharsetDetector(UErrorCode &status)
   1.152 +  : textIn(new InputText(status)), resultArray(NULL),
   1.153 +    resultCount(0), fStripTags(FALSE), fFreshTextSet(FALSE),
   1.154 +    fEnabledRecognizers(NULL)
   1.155 +{
   1.156 +    if (U_FAILURE(status)) {
   1.157 +        return;
   1.158 +    }
   1.159 +
   1.160 +    setRecognizers(status);
   1.161 +
   1.162 +    if (U_FAILURE(status)) {
   1.163 +        return;
   1.164 +    }
   1.165 +
   1.166 +    resultArray = (CharsetMatch **)uprv_malloc(sizeof(CharsetMatch *)*fCSRecognizers_size);
   1.167 +
   1.168 +    if (resultArray == NULL) {
   1.169 +        status = U_MEMORY_ALLOCATION_ERROR;
   1.170 +        return;
   1.171 +    }
   1.172 +
   1.173 +    for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
   1.174 +        resultArray[i] = new CharsetMatch();
   1.175 +
   1.176 +        if (resultArray[i] == NULL) {
   1.177 +            status = U_MEMORY_ALLOCATION_ERROR;
   1.178 +            break;
   1.179 +        }
   1.180 +    }
   1.181 +}
   1.182 +
   1.183 +CharsetDetector::~CharsetDetector()
   1.184 +{
   1.185 +    delete textIn;
   1.186 +
   1.187 +    for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
   1.188 +        delete resultArray[i];
   1.189 +    }
   1.190 +
   1.191 +    uprv_free(resultArray);
   1.192 +
   1.193 +    if (fEnabledRecognizers) {
   1.194 +        uprv_free(fEnabledRecognizers);
   1.195 +    }
   1.196 +}
   1.197 +
   1.198 +void CharsetDetector::setText(const char *in, int32_t len)
   1.199 +{
   1.200 +    textIn->setText(in, len);
   1.201 +    fFreshTextSet = TRUE;
   1.202 +}
   1.203 +
   1.204 +UBool CharsetDetector::setStripTagsFlag(UBool flag)
   1.205 +{
   1.206 +    UBool temp = fStripTags;
   1.207 +    fStripTags = flag;
   1.208 +    fFreshTextSet = TRUE;
   1.209 +    return temp;
   1.210 +}
   1.211 +
   1.212 +UBool CharsetDetector::getStripTagsFlag() const
   1.213 +{
   1.214 +    return fStripTags;
   1.215 +}
   1.216 +
   1.217 +void CharsetDetector::setDeclaredEncoding(const char *encoding, int32_t len) const
   1.218 +{
   1.219 +    textIn->setDeclaredEncoding(encoding,len);
   1.220 +}
   1.221 +
   1.222 +int32_t CharsetDetector::getDetectableCount()
   1.223 +{
   1.224 +    UErrorCode status = U_ZERO_ERROR;
   1.225 +
   1.226 +    setRecognizers(status);
   1.227 +
   1.228 +    return fCSRecognizers_size; 
   1.229 +}
   1.230 +
   1.231 +const CharsetMatch *CharsetDetector::detect(UErrorCode &status)
   1.232 +{
   1.233 +    int32_t maxMatchesFound = 0;
   1.234 +
   1.235 +    detectAll(maxMatchesFound, status);
   1.236 +
   1.237 +    if(maxMatchesFound > 0) {
   1.238 +        return resultArray[0];
   1.239 +    } else {
   1.240 +        return NULL;
   1.241 +    }
   1.242 +}
   1.243 +
   1.244 +const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound, UErrorCode &status)
   1.245 +{
   1.246 +    if(!textIn->isSet()) {
   1.247 +        status = U_MISSING_RESOURCE_ERROR;// TODO:  Need to set proper status code for input text not set
   1.248 +
   1.249 +        return NULL;
   1.250 +    } else if (fFreshTextSet) {
   1.251 +        CharsetRecognizer *csr;
   1.252 +        int32_t            i;
   1.253 +
   1.254 +        textIn->MungeInput(fStripTags);
   1.255 +
   1.256 +        // Iterate over all possible charsets, remember all that
   1.257 +        // give a match quality > 0.
   1.258 +        resultCount = 0;
   1.259 +        for (i = 0; i < fCSRecognizers_size; i += 1) {
   1.260 +            csr = fCSRecognizers[i]->recognizer;
   1.261 +            if (csr->match(textIn, resultArray[resultCount])) {
   1.262 +                resultCount++;
   1.263 +            }
   1.264 +        }
   1.265 +
   1.266 +        if (resultCount > 1) {
   1.267 +            uprv_sortArray(resultArray, resultCount, sizeof resultArray[0], charsetMatchComparator, NULL, TRUE, &status);
   1.268 +        }
   1.269 +        fFreshTextSet = FALSE;
   1.270 +    }
   1.271 +
   1.272 +    maxMatchesFound = resultCount;
   1.273 +
   1.274 +    return resultArray;
   1.275 +}
   1.276 +
   1.277 +void CharsetDetector::setDetectableCharset(const char *encoding, UBool enabled, UErrorCode &status)
   1.278 +{
   1.279 +    if (U_FAILURE(status)) {
   1.280 +        return;
   1.281 +    }
   1.282 +
   1.283 +    int32_t modIdx = -1;
   1.284 +    UBool isDefaultVal = FALSE;
   1.285 +    for (int32_t i = 0; i < fCSRecognizers_size; i++) {
   1.286 +        CSRecognizerInfo *csrinfo = fCSRecognizers[i];
   1.287 +        if (uprv_strcmp(csrinfo->recognizer->getName(), encoding) == 0) {
   1.288 +            modIdx = i;
   1.289 +            isDefaultVal = (csrinfo->isDefaultEnabled == enabled);
   1.290 +            break;
   1.291 +        }
   1.292 +    }
   1.293 +    if (modIdx < 0) {
   1.294 +        // No matching encoding found
   1.295 +        status = U_ILLEGAL_ARGUMENT_ERROR;
   1.296 +        return;
   1.297 +    }
   1.298 +
   1.299 +    if (fEnabledRecognizers == NULL && !isDefaultVal) {
   1.300 +        // Create an array storing the non default setting
   1.301 +        fEnabledRecognizers = NEW_ARRAY(UBool, fCSRecognizers_size);
   1.302 +        if (fEnabledRecognizers == NULL) {
   1.303 +            status = U_MEMORY_ALLOCATION_ERROR;
   1.304 +            return;
   1.305 +        }
   1.306 +        // Initialize the array with default info
   1.307 +        for (int32_t i = 0; i < fCSRecognizers_size; i++) {
   1.308 +            fEnabledRecognizers[i] = fCSRecognizers[i]->isDefaultEnabled;
   1.309 +        }
   1.310 +    }
   1.311 +
   1.312 +    if (fEnabledRecognizers != NULL) {
   1.313 +        fEnabledRecognizers[modIdx] = enabled;
   1.314 +    }
   1.315 +}
   1.316 +
   1.317 +/*const char *CharsetDetector::getCharsetName(int32_t index, UErrorCode &status) const
   1.318 +{
   1.319 +    if( index > fCSRecognizers_size-1 || index < 0) {
   1.320 +        status = U_INDEX_OUTOFBOUNDS_ERROR;
   1.321 +
   1.322 +        return 0;
   1.323 +    } else {
   1.324 +        return fCSRecognizers[index]->getName();
   1.325 +    }
   1.326 +}*/
   1.327 +
   1.328 +U_NAMESPACE_END
   1.329 +
   1.330 +U_CDECL_BEGIN
   1.331 +typedef struct {
   1.332 +    int32_t currIndex;
   1.333 +    UBool all;
   1.334 +    UBool *enabledRecognizers;
   1.335 +} Context;
   1.336 +
   1.337 +
   1.338 +
   1.339 +static void U_CALLCONV
   1.340 +enumClose(UEnumeration *en) {
   1.341 +    if(en->context != NULL) {
   1.342 +        DELETE_ARRAY(en->context);
   1.343 +    }
   1.344 +
   1.345 +    DELETE_ARRAY(en);
   1.346 +}
   1.347 +
   1.348 +static int32_t U_CALLCONV
   1.349 +enumCount(UEnumeration *en, UErrorCode *) {
   1.350 +    if (((Context *)en->context)->all) {
   1.351 +        // ucsdet_getAllDetectableCharsets, all charset detector names
   1.352 +        return fCSRecognizers_size;
   1.353 +    }
   1.354 +
   1.355 +    // Otherwise, ucsdet_getDetectableCharsets - only enabled ones
   1.356 +    int32_t count = 0;
   1.357 +    UBool *enabledArray = ((Context *)en->context)->enabledRecognizers;
   1.358 +    if (enabledArray != NULL) {
   1.359 +        // custom set
   1.360 +        for (int32_t i = 0; i < fCSRecognizers_size; i++) {
   1.361 +            if (enabledArray[i]) {
   1.362 +                count++;
   1.363 +            }
   1.364 +        }
   1.365 +    } else {
   1.366 +        // default set
   1.367 +        for (int32_t i = 0; i < fCSRecognizers_size; i++) {
   1.368 +            if (fCSRecognizers[i]->isDefaultEnabled) {
   1.369 +                count++;
   1.370 +            }
   1.371 +        }
   1.372 +    }
   1.373 +    return count;
   1.374 +}
   1.375 +
   1.376 +static const char* U_CALLCONV
   1.377 +enumNext(UEnumeration *en, int32_t *resultLength, UErrorCode * /*status*/) {
   1.378 +    const char *currName = NULL;
   1.379 +
   1.380 +    if (((Context *)en->context)->currIndex < fCSRecognizers_size) {
   1.381 +        if (((Context *)en->context)->all) {
   1.382 +            // ucsdet_getAllDetectableCharsets, all charset detector names
   1.383 +            currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName();
   1.384 +            ((Context *)en->context)->currIndex++;
   1.385 +        } else {
   1.386 +            // ucsdet_getDetectableCharsets
   1.387 +            UBool *enabledArray = ((Context *)en->context)->enabledRecognizers;
   1.388 +            if (enabledArray != NULL) {
   1.389 +                // custome set
   1.390 +                while (currName == NULL && ((Context *)en->context)->currIndex < fCSRecognizers_size) {
   1.391 +                    if (enabledArray[((Context *)en->context)->currIndex]) {
   1.392 +                        currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName();
   1.393 +                    }
   1.394 +                    ((Context *)en->context)->currIndex++;
   1.395 +                }
   1.396 +            } else {
   1.397 +                // default set
   1.398 +                while (currName == NULL && ((Context *)en->context)->currIndex < fCSRecognizers_size) {
   1.399 +                    if (fCSRecognizers[((Context *)en->context)->currIndex]->isDefaultEnabled) {
   1.400 +                        currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName();
   1.401 +                    }
   1.402 +                    ((Context *)en->context)->currIndex++;
   1.403 +                }
   1.404 +            }
   1.405 +        }
   1.406 +    }
   1.407 +
   1.408 +    if(resultLength != NULL) {
   1.409 +        *resultLength = currName == NULL ? 0 : (int32_t)uprv_strlen(currName);
   1.410 +    }
   1.411 +
   1.412 +    return currName;
   1.413 +}
   1.414 +
   1.415 +
   1.416 +static void U_CALLCONV
   1.417 +enumReset(UEnumeration *en, UErrorCode *) {
   1.418 +    ((Context *)en->context)->currIndex = 0;
   1.419 +}
   1.420 +
// Template that getAllDetectableCharsets() and getDetectableCharsets() copy
// into each enumeration they create, before attaching a freshly allocated
// context. The initializers are positional, so their order must follow the
// UEnumeration declaration (presumably in uenumimp.h -- verify before
// reordering).
static const UEnumeration gCSDetEnumeration = {
    NULL,
    NULL,
    enumClose,
    enumCount,
    uenum_unextDefault,
    enumNext,
    enumReset
};
   1.430 +
   1.431 +U_CDECL_END
   1.432 +
   1.433 +U_NAMESPACE_BEGIN
   1.434 +
   1.435 +UEnumeration * CharsetDetector::getAllDetectableCharsets(UErrorCode &status)
   1.436 +{
   1.437 +
   1.438 +    /* Initialize recognized charsets. */
   1.439 +    setRecognizers(status);
   1.440 +
   1.441 +    if(U_FAILURE(status)) {
   1.442 +        return 0;
   1.443 +    }
   1.444 +
   1.445 +    UEnumeration *en = NEW_ARRAY(UEnumeration, 1);
   1.446 +    if (en == NULL) {
   1.447 +        status = U_MEMORY_ALLOCATION_ERROR;
   1.448 +        return 0;
   1.449 +    }
   1.450 +    memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration));
   1.451 +    en->context = (void*)NEW_ARRAY(Context, 1);
   1.452 +    if (en->context == NULL) {
   1.453 +        status = U_MEMORY_ALLOCATION_ERROR;
   1.454 +        DELETE_ARRAY(en);
   1.455 +        return 0;
   1.456 +    }
   1.457 +    uprv_memset(en->context, 0, sizeof(Context));
   1.458 +    ((Context*)en->context)->all = TRUE;
   1.459 +    return en;
   1.460 +}
   1.461 +
   1.462 +UEnumeration * CharsetDetector::getDetectableCharsets(UErrorCode &status) const
   1.463 +{
   1.464 +    if(U_FAILURE(status)) {
   1.465 +        return 0;
   1.466 +    }
   1.467 +
   1.468 +    UEnumeration *en = NEW_ARRAY(UEnumeration, 1);
   1.469 +    if (en == NULL) {
   1.470 +        status = U_MEMORY_ALLOCATION_ERROR;
   1.471 +        return 0;
   1.472 +    }
   1.473 +    memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration));
   1.474 +    en->context = (void*)NEW_ARRAY(Context, 1);
   1.475 +    if (en->context == NULL) {
   1.476 +        status = U_MEMORY_ALLOCATION_ERROR;
   1.477 +        DELETE_ARRAY(en);
   1.478 +        return 0;
   1.479 +    }
   1.480 +    uprv_memset(en->context, 0, sizeof(Context));
   1.481 +    ((Context*)en->context)->all = FALSE;
   1.482 +    ((Context*)en->context)->enabledRecognizers = fEnabledRecognizers;
   1.483 +    return en;
   1.484 +}
   1.485 +
   1.486 +U_NAMESPACE_END
   1.487 +
   1.488 +#endif

mercurial