intl/icu/source/i18n/csdetect.cpp

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

michael@0 1 /*
michael@0 2 **********************************************************************
michael@0 3 * Copyright (C) 2005-2013, International Business Machines
michael@0 4 * Corporation and others. All Rights Reserved.
michael@0 5 **********************************************************************
michael@0 6 */
michael@0 7
michael@0 8 #include "unicode/utypes.h"
michael@0 9
michael@0 10 #if !UCONFIG_NO_CONVERSION
michael@0 11
michael@0 12 #include "unicode/ucsdet.h"
michael@0 13
michael@0 14 #include "csdetect.h"
michael@0 15 #include "csmatch.h"
michael@0 16 #include "uenumimp.h"
michael@0 17
michael@0 18 #include "cmemory.h"
michael@0 19 #include "cstring.h"
michael@0 20 #include "umutex.h"
michael@0 21 #include "ucln_in.h"
michael@0 22 #include "uarrsort.h"
michael@0 23 #include "inputext.h"
michael@0 24 #include "csrsbcs.h"
michael@0 25 #include "csrmbcs.h"
michael@0 26 #include "csrutf8.h"
michael@0 27 #include "csrucode.h"
michael@0 28 #include "csr2022.h"
michael@0 29
// Number of elements in a true array (never pass a pointer). The argument is
// parenthesized so that any array-valued expression expands as one operand.
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))

// Allocates raw storage for `count` objects of `type` through uprv_malloc().
// Callers in this file test the result against NULL. The expansion is wrapped
// in parentheses so it behaves as a single expression wherever it is used.
#define NEW_ARRAY(type,count) ((type *) uprv_malloc((count) * sizeof(type)))
// Hands storage back through uprv_free().
#define DELETE_ARRAY(array) uprv_free((void *) (array))
michael@0 34
michael@0 35 U_NAMESPACE_BEGIN
michael@0 36
michael@0 37 struct CSRecognizerInfo : public UMemory {
michael@0 38 CSRecognizerInfo(CharsetRecognizer *recognizer, UBool isDefaultEnabled)
michael@0 39 : recognizer(recognizer), isDefaultEnabled(isDefaultEnabled) {};
michael@0 40
michael@0 41 ~CSRecognizerInfo() {delete recognizer;};
michael@0 42
michael@0 43 CharsetRecognizer *recognizer;
michael@0 44 UBool isDefaultEnabled;
michael@0 45 };
michael@0 46
michael@0 47 U_NAMESPACE_END
michael@0 48
michael@0 49 static icu::CSRecognizerInfo **fCSRecognizers = NULL;
michael@0 50 static icu::UInitOnce gCSRecognizersInitOnce;
michael@0 51 static int32_t fCSRecognizers_size = 0;
michael@0 52
michael@0 53 U_CDECL_BEGIN
michael@0 54 static UBool U_CALLCONV csdet_cleanup(void)
michael@0 55 {
michael@0 56 U_NAMESPACE_USE
michael@0 57 if (fCSRecognizers != NULL) {
michael@0 58 for(int32_t r = 0; r < fCSRecognizers_size; r += 1) {
michael@0 59 delete fCSRecognizers[r];
michael@0 60 fCSRecognizers[r] = NULL;
michael@0 61 }
michael@0 62
michael@0 63 DELETE_ARRAY(fCSRecognizers);
michael@0 64 fCSRecognizers = NULL;
michael@0 65 fCSRecognizers_size = 0;
michael@0 66 }
michael@0 67 gCSRecognizersInitOnce.reset();
michael@0 68
michael@0 69 return TRUE;
michael@0 70 }
michael@0 71
michael@0 72 static int32_t U_CALLCONV
michael@0 73 charsetMatchComparator(const void * /*context*/, const void *left, const void *right)
michael@0 74 {
michael@0 75 U_NAMESPACE_USE
michael@0 76
michael@0 77 const CharsetMatch **csm_l = (const CharsetMatch **) left;
michael@0 78 const CharsetMatch **csm_r = (const CharsetMatch **) right;
michael@0 79
michael@0 80 // NOTE: compare is backwards to sort from highest to lowest.
michael@0 81 return (*csm_r)->getConfidence() - (*csm_l)->getConfidence();
michael@0 82 }
michael@0 83
michael@0 84 static void U_CALLCONV initRecognizers(UErrorCode &status) {
michael@0 85 U_NAMESPACE_USE
michael@0 86 ucln_i18n_registerCleanup(UCLN_I18N_CSDET, csdet_cleanup);
michael@0 87 CSRecognizerInfo *tempArray[] = {
michael@0 88 new CSRecognizerInfo(new CharsetRecog_UTF8(), TRUE),
michael@0 89
michael@0 90 new CSRecognizerInfo(new CharsetRecog_UTF_16_BE(), TRUE),
michael@0 91 new CSRecognizerInfo(new CharsetRecog_UTF_16_LE(), TRUE),
michael@0 92 new CSRecognizerInfo(new CharsetRecog_UTF_32_BE(), TRUE),
michael@0 93 new CSRecognizerInfo(new CharsetRecog_UTF_32_LE(), TRUE),
michael@0 94
michael@0 95 new CSRecognizerInfo(new CharsetRecog_8859_1(), TRUE),
michael@0 96 new CSRecognizerInfo(new CharsetRecog_8859_2(), TRUE),
michael@0 97 new CSRecognizerInfo(new CharsetRecog_8859_5_ru(), TRUE),
michael@0 98 new CSRecognizerInfo(new CharsetRecog_8859_6_ar(), TRUE),
michael@0 99 new CSRecognizerInfo(new CharsetRecog_8859_7_el(), TRUE),
michael@0 100 new CSRecognizerInfo(new CharsetRecog_8859_8_I_he(), TRUE),
michael@0 101 new CSRecognizerInfo(new CharsetRecog_8859_8_he(), TRUE),
michael@0 102 new CSRecognizerInfo(new CharsetRecog_windows_1251(), TRUE),
michael@0 103 new CSRecognizerInfo(new CharsetRecog_windows_1256(), TRUE),
michael@0 104 new CSRecognizerInfo(new CharsetRecog_KOI8_R(), TRUE),
michael@0 105 new CSRecognizerInfo(new CharsetRecog_8859_9_tr(), TRUE),
michael@0 106 new CSRecognizerInfo(new CharsetRecog_sjis(), TRUE),
michael@0 107 new CSRecognizerInfo(new CharsetRecog_gb_18030(), TRUE),
michael@0 108 new CSRecognizerInfo(new CharsetRecog_euc_jp(), TRUE),
michael@0 109 new CSRecognizerInfo(new CharsetRecog_euc_kr(), TRUE),
michael@0 110 new CSRecognizerInfo(new CharsetRecog_big5(), TRUE),
michael@0 111
michael@0 112 new CSRecognizerInfo(new CharsetRecog_2022JP(), TRUE),
michael@0 113 new CSRecognizerInfo(new CharsetRecog_2022KR(), TRUE),
michael@0 114 new CSRecognizerInfo(new CharsetRecog_2022CN(), TRUE),
michael@0 115
michael@0 116 new CSRecognizerInfo(new CharsetRecog_IBM424_he_rtl(), FALSE),
michael@0 117 new CSRecognizerInfo(new CharsetRecog_IBM424_he_ltr(), FALSE),
michael@0 118 new CSRecognizerInfo(new CharsetRecog_IBM420_ar_rtl(), FALSE),
michael@0 119 new CSRecognizerInfo(new CharsetRecog_IBM420_ar_ltr(), FALSE)
michael@0 120 };
michael@0 121 int32_t rCount = ARRAY_SIZE(tempArray);
michael@0 122
michael@0 123 fCSRecognizers = NEW_ARRAY(CSRecognizerInfo *, rCount);
michael@0 124
michael@0 125 if (fCSRecognizers == NULL) {
michael@0 126 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 127 }
michael@0 128 else {
michael@0 129 fCSRecognizers_size = rCount;
michael@0 130 for (int32_t r = 0; r < rCount; r += 1) {
michael@0 131 fCSRecognizers[r] = tempArray[r];
michael@0 132 if (fCSRecognizers[r] == NULL) {
michael@0 133 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 134 }
michael@0 135 }
michael@0 136 }
michael@0 137 }
michael@0 138
michael@0 139 U_CDECL_END
michael@0 140
michael@0 141 U_NAMESPACE_BEGIN
michael@0 142
michael@0 143 void CharsetDetector::setRecognizers(UErrorCode &status)
michael@0 144 {
michael@0 145 umtx_initOnce(gCSRecognizersInitOnce, &initRecognizers, status);
michael@0 146 }
michael@0 147
michael@0 148 CharsetDetector::CharsetDetector(UErrorCode &status)
michael@0 149 : textIn(new InputText(status)), resultArray(NULL),
michael@0 150 resultCount(0), fStripTags(FALSE), fFreshTextSet(FALSE),
michael@0 151 fEnabledRecognizers(NULL)
michael@0 152 {
michael@0 153 if (U_FAILURE(status)) {
michael@0 154 return;
michael@0 155 }
michael@0 156
michael@0 157 setRecognizers(status);
michael@0 158
michael@0 159 if (U_FAILURE(status)) {
michael@0 160 return;
michael@0 161 }
michael@0 162
michael@0 163 resultArray = (CharsetMatch **)uprv_malloc(sizeof(CharsetMatch *)*fCSRecognizers_size);
michael@0 164
michael@0 165 if (resultArray == NULL) {
michael@0 166 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 167 return;
michael@0 168 }
michael@0 169
michael@0 170 for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
michael@0 171 resultArray[i] = new CharsetMatch();
michael@0 172
michael@0 173 if (resultArray[i] == NULL) {
michael@0 174 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 175 break;
michael@0 176 }
michael@0 177 }
michael@0 178 }
michael@0 179
michael@0 180 CharsetDetector::~CharsetDetector()
michael@0 181 {
michael@0 182 delete textIn;
michael@0 183
michael@0 184 for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
michael@0 185 delete resultArray[i];
michael@0 186 }
michael@0 187
michael@0 188 uprv_free(resultArray);
michael@0 189
michael@0 190 if (fEnabledRecognizers) {
michael@0 191 uprv_free(fEnabledRecognizers);
michael@0 192 }
michael@0 193 }
michael@0 194
michael@0 195 void CharsetDetector::setText(const char *in, int32_t len)
michael@0 196 {
michael@0 197 textIn->setText(in, len);
michael@0 198 fFreshTextSet = TRUE;
michael@0 199 }
michael@0 200
michael@0 201 UBool CharsetDetector::setStripTagsFlag(UBool flag)
michael@0 202 {
michael@0 203 UBool temp = fStripTags;
michael@0 204 fStripTags = flag;
michael@0 205 fFreshTextSet = TRUE;
michael@0 206 return temp;
michael@0 207 }
michael@0 208
michael@0 209 UBool CharsetDetector::getStripTagsFlag() const
michael@0 210 {
michael@0 211 return fStripTags;
michael@0 212 }
michael@0 213
michael@0 214 void CharsetDetector::setDeclaredEncoding(const char *encoding, int32_t len) const
michael@0 215 {
michael@0 216 textIn->setDeclaredEncoding(encoding,len);
michael@0 217 }
michael@0 218
michael@0 219 int32_t CharsetDetector::getDetectableCount()
michael@0 220 {
michael@0 221 UErrorCode status = U_ZERO_ERROR;
michael@0 222
michael@0 223 setRecognizers(status);
michael@0 224
michael@0 225 return fCSRecognizers_size;
michael@0 226 }
michael@0 227
michael@0 228 const CharsetMatch *CharsetDetector::detect(UErrorCode &status)
michael@0 229 {
michael@0 230 int32_t maxMatchesFound = 0;
michael@0 231
michael@0 232 detectAll(maxMatchesFound, status);
michael@0 233
michael@0 234 if(maxMatchesFound > 0) {
michael@0 235 return resultArray[0];
michael@0 236 } else {
michael@0 237 return NULL;
michael@0 238 }
michael@0 239 }
michael@0 240
michael@0 241 const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound, UErrorCode &status)
michael@0 242 {
michael@0 243 if(!textIn->isSet()) {
michael@0 244 status = U_MISSING_RESOURCE_ERROR;// TODO: Need to set proper status code for input text not set
michael@0 245
michael@0 246 return NULL;
michael@0 247 } else if (fFreshTextSet) {
michael@0 248 CharsetRecognizer *csr;
michael@0 249 int32_t i;
michael@0 250
michael@0 251 textIn->MungeInput(fStripTags);
michael@0 252
michael@0 253 // Iterate over all possible charsets, remember all that
michael@0 254 // give a match quality > 0.
michael@0 255 resultCount = 0;
michael@0 256 for (i = 0; i < fCSRecognizers_size; i += 1) {
michael@0 257 csr = fCSRecognizers[i]->recognizer;
michael@0 258 if (csr->match(textIn, resultArray[resultCount])) {
michael@0 259 resultCount++;
michael@0 260 }
michael@0 261 }
michael@0 262
michael@0 263 if (resultCount > 1) {
michael@0 264 uprv_sortArray(resultArray, resultCount, sizeof resultArray[0], charsetMatchComparator, NULL, TRUE, &status);
michael@0 265 }
michael@0 266 fFreshTextSet = FALSE;
michael@0 267 }
michael@0 268
michael@0 269 maxMatchesFound = resultCount;
michael@0 270
michael@0 271 return resultArray;
michael@0 272 }
michael@0 273
michael@0 274 void CharsetDetector::setDetectableCharset(const char *encoding, UBool enabled, UErrorCode &status)
michael@0 275 {
michael@0 276 if (U_FAILURE(status)) {
michael@0 277 return;
michael@0 278 }
michael@0 279
michael@0 280 int32_t modIdx = -1;
michael@0 281 UBool isDefaultVal = FALSE;
michael@0 282 for (int32_t i = 0; i < fCSRecognizers_size; i++) {
michael@0 283 CSRecognizerInfo *csrinfo = fCSRecognizers[i];
michael@0 284 if (uprv_strcmp(csrinfo->recognizer->getName(), encoding) == 0) {
michael@0 285 modIdx = i;
michael@0 286 isDefaultVal = (csrinfo->isDefaultEnabled == enabled);
michael@0 287 break;
michael@0 288 }
michael@0 289 }
michael@0 290 if (modIdx < 0) {
michael@0 291 // No matching encoding found
michael@0 292 status = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 293 return;
michael@0 294 }
michael@0 295
michael@0 296 if (fEnabledRecognizers == NULL && !isDefaultVal) {
michael@0 297 // Create an array storing the non default setting
michael@0 298 fEnabledRecognizers = NEW_ARRAY(UBool, fCSRecognizers_size);
michael@0 299 if (fEnabledRecognizers == NULL) {
michael@0 300 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 301 return;
michael@0 302 }
michael@0 303 // Initialize the array with default info
michael@0 304 for (int32_t i = 0; i < fCSRecognizers_size; i++) {
michael@0 305 fEnabledRecognizers[i] = fCSRecognizers[i]->isDefaultEnabled;
michael@0 306 }
michael@0 307 }
michael@0 308
michael@0 309 if (fEnabledRecognizers != NULL) {
michael@0 310 fEnabledRecognizers[modIdx] = enabled;
michael@0 311 }
michael@0 312 }
michael@0 313
michael@0 314 /*const char *CharsetDetector::getCharsetName(int32_t index, UErrorCode &status) const
michael@0 315 {
michael@0 316 if( index > fCSRecognizers_size-1 || index < 0) {
michael@0 317 status = U_INDEX_OUTOFBOUNDS_ERROR;
michael@0 318
michael@0 319 return 0;
michael@0 320 } else {
michael@0 321 return fCSRecognizers[index]->getName();
michael@0 322 }
michael@0 323 }*/
michael@0 324
michael@0 325 U_NAMESPACE_END
michael@0 326
michael@0 327 U_CDECL_BEGIN
michael@0 328 typedef struct {
michael@0 329 int32_t currIndex;
michael@0 330 UBool all;
michael@0 331 UBool *enabledRecognizers;
michael@0 332 } Context;
michael@0 333
michael@0 334
michael@0 335
michael@0 336 static void U_CALLCONV
michael@0 337 enumClose(UEnumeration *en) {
michael@0 338 if(en->context != NULL) {
michael@0 339 DELETE_ARRAY(en->context);
michael@0 340 }
michael@0 341
michael@0 342 DELETE_ARRAY(en);
michael@0 343 }
michael@0 344
michael@0 345 static int32_t U_CALLCONV
michael@0 346 enumCount(UEnumeration *en, UErrorCode *) {
michael@0 347 if (((Context *)en->context)->all) {
michael@0 348 // ucsdet_getAllDetectableCharsets, all charset detector names
michael@0 349 return fCSRecognizers_size;
michael@0 350 }
michael@0 351
michael@0 352 // Otherwise, ucsdet_getDetectableCharsets - only enabled ones
michael@0 353 int32_t count = 0;
michael@0 354 UBool *enabledArray = ((Context *)en->context)->enabledRecognizers;
michael@0 355 if (enabledArray != NULL) {
michael@0 356 // custom set
michael@0 357 for (int32_t i = 0; i < fCSRecognizers_size; i++) {
michael@0 358 if (enabledArray[i]) {
michael@0 359 count++;
michael@0 360 }
michael@0 361 }
michael@0 362 } else {
michael@0 363 // default set
michael@0 364 for (int32_t i = 0; i < fCSRecognizers_size; i++) {
michael@0 365 if (fCSRecognizers[i]->isDefaultEnabled) {
michael@0 366 count++;
michael@0 367 }
michael@0 368 }
michael@0 369 }
michael@0 370 return count;
michael@0 371 }
michael@0 372
michael@0 373 static const char* U_CALLCONV
michael@0 374 enumNext(UEnumeration *en, int32_t *resultLength, UErrorCode * /*status*/) {
michael@0 375 const char *currName = NULL;
michael@0 376
michael@0 377 if (((Context *)en->context)->currIndex < fCSRecognizers_size) {
michael@0 378 if (((Context *)en->context)->all) {
michael@0 379 // ucsdet_getAllDetectableCharsets, all charset detector names
michael@0 380 currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName();
michael@0 381 ((Context *)en->context)->currIndex++;
michael@0 382 } else {
michael@0 383 // ucsdet_getDetectableCharsets
michael@0 384 UBool *enabledArray = ((Context *)en->context)->enabledRecognizers;
michael@0 385 if (enabledArray != NULL) {
michael@0 386 // custome set
michael@0 387 while (currName == NULL && ((Context *)en->context)->currIndex < fCSRecognizers_size) {
michael@0 388 if (enabledArray[((Context *)en->context)->currIndex]) {
michael@0 389 currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName();
michael@0 390 }
michael@0 391 ((Context *)en->context)->currIndex++;
michael@0 392 }
michael@0 393 } else {
michael@0 394 // default set
michael@0 395 while (currName == NULL && ((Context *)en->context)->currIndex < fCSRecognizers_size) {
michael@0 396 if (fCSRecognizers[((Context *)en->context)->currIndex]->isDefaultEnabled) {
michael@0 397 currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName();
michael@0 398 }
michael@0 399 ((Context *)en->context)->currIndex++;
michael@0 400 }
michael@0 401 }
michael@0 402 }
michael@0 403 }
michael@0 404
michael@0 405 if(resultLength != NULL) {
michael@0 406 *resultLength = currName == NULL ? 0 : (int32_t)uprv_strlen(currName);
michael@0 407 }
michael@0 408
michael@0 409 return currName;
michael@0 410 }
michael@0 411
michael@0 412
michael@0 413 static void U_CALLCONV
michael@0 414 enumReset(UEnumeration *en, UErrorCode *) {
michael@0 415 ((Context *)en->context)->currIndex = 0;
michael@0 416 }
michael@0 417
michael@0 418 static const UEnumeration gCSDetEnumeration = {
michael@0 419 NULL,
michael@0 420 NULL,
michael@0 421 enumClose,
michael@0 422 enumCount,
michael@0 423 uenum_unextDefault,
michael@0 424 enumNext,
michael@0 425 enumReset
michael@0 426 };
michael@0 427
michael@0 428 U_CDECL_END
michael@0 429
michael@0 430 U_NAMESPACE_BEGIN
michael@0 431
michael@0 432 UEnumeration * CharsetDetector::getAllDetectableCharsets(UErrorCode &status)
michael@0 433 {
michael@0 434
michael@0 435 /* Initialize recognized charsets. */
michael@0 436 setRecognizers(status);
michael@0 437
michael@0 438 if(U_FAILURE(status)) {
michael@0 439 return 0;
michael@0 440 }
michael@0 441
michael@0 442 UEnumeration *en = NEW_ARRAY(UEnumeration, 1);
michael@0 443 if (en == NULL) {
michael@0 444 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 445 return 0;
michael@0 446 }
michael@0 447 memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration));
michael@0 448 en->context = (void*)NEW_ARRAY(Context, 1);
michael@0 449 if (en->context == NULL) {
michael@0 450 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 451 DELETE_ARRAY(en);
michael@0 452 return 0;
michael@0 453 }
michael@0 454 uprv_memset(en->context, 0, sizeof(Context));
michael@0 455 ((Context*)en->context)->all = TRUE;
michael@0 456 return en;
michael@0 457 }
michael@0 458
michael@0 459 UEnumeration * CharsetDetector::getDetectableCharsets(UErrorCode &status) const
michael@0 460 {
michael@0 461 if(U_FAILURE(status)) {
michael@0 462 return 0;
michael@0 463 }
michael@0 464
michael@0 465 UEnumeration *en = NEW_ARRAY(UEnumeration, 1);
michael@0 466 if (en == NULL) {
michael@0 467 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 468 return 0;
michael@0 469 }
michael@0 470 memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration));
michael@0 471 en->context = (void*)NEW_ARRAY(Context, 1);
michael@0 472 if (en->context == NULL) {
michael@0 473 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 474 DELETE_ARRAY(en);
michael@0 475 return 0;
michael@0 476 }
michael@0 477 uprv_memset(en->context, 0, sizeof(Context));
michael@0 478 ((Context*)en->context)->all = FALSE;
michael@0 479 ((Context*)en->context)->enabledRecognizers = fEnabledRecognizers;
michael@0 480 return en;
michael@0 481 }
michael@0 482
michael@0 483 U_NAMESPACE_END
michael@0 484
michael@0 485 #endif

mercurial