1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/i18n/csdetect.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,485 @@ 1.4 +/* 1.5 + ********************************************************************** 1.6 + * Copyright (C) 2005-2013, International Business Machines 1.7 + * Corporation and others. All Rights Reserved. 1.8 + ********************************************************************** 1.9 + */ 1.10 + 1.11 +#include "unicode/utypes.h" 1.12 + 1.13 +#if !UCONFIG_NO_CONVERSION 1.14 + 1.15 +#include "unicode/ucsdet.h" 1.16 + 1.17 +#include "csdetect.h" 1.18 +#include "csmatch.h" 1.19 +#include "uenumimp.h" 1.20 + 1.21 +#include "cmemory.h" 1.22 +#include "cstring.h" 1.23 +#include "umutex.h" 1.24 +#include "ucln_in.h" 1.25 +#include "uarrsort.h" 1.26 +#include "inputext.h" 1.27 +#include "csrsbcs.h" 1.28 +#include "csrmbcs.h" 1.29 +#include "csrutf8.h" 1.30 +#include "csrucode.h" 1.31 +#include "csr2022.h" 1.32 + 1.33 +#define ARRAY_SIZE(array) (sizeof array / sizeof array[0]) 1.34 + 1.35 +#define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type)) 1.36 +#define DELETE_ARRAY(array) uprv_free((void *) (array)) 1.37 + 1.38 +U_NAMESPACE_BEGIN 1.39 + 1.40 +struct CSRecognizerInfo : public UMemory { 1.41 + CSRecognizerInfo(CharsetRecognizer *recognizer, UBool isDefaultEnabled) 1.42 + : recognizer(recognizer), isDefaultEnabled(isDefaultEnabled) {}; 1.43 + 1.44 + ~CSRecognizerInfo() {delete recognizer;}; 1.45 + 1.46 + CharsetRecognizer *recognizer; 1.47 + UBool isDefaultEnabled; 1.48 +}; 1.49 + 1.50 +U_NAMESPACE_END 1.51 + 1.52 +static icu::CSRecognizerInfo **fCSRecognizers = NULL; 1.53 +static icu::UInitOnce gCSRecognizersInitOnce; 1.54 +static int32_t fCSRecognizers_size = 0; 1.55 + 1.56 +U_CDECL_BEGIN 1.57 +static UBool U_CALLCONV csdet_cleanup(void) 1.58 +{ 1.59 + U_NAMESPACE_USE 1.60 + if (fCSRecognizers != NULL) { 1.61 + for(int32_t r = 0; r < fCSRecognizers_size; r += 1) { 1.62 + delete fCSRecognizers[r]; 1.63 + fCSRecognizers[r] = NULL; 1.64 + } 1.65 + 1.66 + DELETE_ARRAY(fCSRecognizers); 1.67 + fCSRecognizers = NULL; 1.68 + fCSRecognizers_size = 0; 1.69 + } 1.70 + gCSRecognizersInitOnce.reset(); 1.71 + 1.72 + return TRUE; 1.73 +} 1.74 + 1.75 +static int32_t U_CALLCONV 1.76 +charsetMatchComparator(const void * /*context*/, const void *left, const void *right) 1.77 +{ 1.78 + U_NAMESPACE_USE 1.79 + 1.80 + const CharsetMatch **csm_l = (const CharsetMatch **) left; 1.81 + const CharsetMatch **csm_r = (const CharsetMatch **) right; 1.82 + 1.83 + // NOTE: compare is backwards to sort from highest to lowest. 1.84 + return (*csm_r)->getConfidence() - (*csm_l)->getConfidence(); 1.85 +} 1.86 + 1.87 +static void U_CALLCONV initRecognizers(UErrorCode &status) { 1.88 + U_NAMESPACE_USE 1.89 + ucln_i18n_registerCleanup(UCLN_I18N_CSDET, csdet_cleanup); 1.90 + CSRecognizerInfo *tempArray[] = { 1.91 + new CSRecognizerInfo(new CharsetRecog_UTF8(), TRUE), 1.92 + 1.93 + new CSRecognizerInfo(new CharsetRecog_UTF_16_BE(), TRUE), 1.94 + new CSRecognizerInfo(new CharsetRecog_UTF_16_LE(), TRUE), 1.95 + new CSRecognizerInfo(new CharsetRecog_UTF_32_BE(), TRUE), 1.96 + new CSRecognizerInfo(new CharsetRecog_UTF_32_LE(), TRUE), 1.97 + 1.98 + new CSRecognizerInfo(new CharsetRecog_8859_1(), TRUE), 1.99 + new CSRecognizerInfo(new CharsetRecog_8859_2(), TRUE), 1.100 + new CSRecognizerInfo(new CharsetRecog_8859_5_ru(), TRUE), 1.101 + new CSRecognizerInfo(new CharsetRecog_8859_6_ar(), TRUE), 1.102 + new CSRecognizerInfo(new CharsetRecog_8859_7_el(), TRUE), 1.103 + new CSRecognizerInfo(new CharsetRecog_8859_8_I_he(), TRUE), 1.104 + new CSRecognizerInfo(new CharsetRecog_8859_8_he(), TRUE), 1.105 + new CSRecognizerInfo(new CharsetRecog_windows_1251(), TRUE), 1.106 + new CSRecognizerInfo(new CharsetRecog_windows_1256(), TRUE), 1.107 + new CSRecognizerInfo(new CharsetRecog_KOI8_R(), TRUE), 1.108 + new CSRecognizerInfo(new CharsetRecog_8859_9_tr(), TRUE), 1.109 + new CSRecognizerInfo(new CharsetRecog_sjis(), TRUE), 1.110 + new CSRecognizerInfo(new CharsetRecog_gb_18030(), TRUE), 1.111 + new CSRecognizerInfo(new CharsetRecog_euc_jp(), TRUE), 1.112 + new CSRecognizerInfo(new CharsetRecog_euc_kr(), TRUE), 1.113 + new CSRecognizerInfo(new CharsetRecog_big5(), TRUE), 1.114 + 1.115 + new CSRecognizerInfo(new CharsetRecog_2022JP(), TRUE), 1.116 + new CSRecognizerInfo(new CharsetRecog_2022KR(), TRUE), 1.117 + new CSRecognizerInfo(new CharsetRecog_2022CN(), TRUE), 1.118 + 1.119 + new CSRecognizerInfo(new CharsetRecog_IBM424_he_rtl(), FALSE), 1.120 + new CSRecognizerInfo(new CharsetRecog_IBM424_he_ltr(), FALSE), 1.121 + new CSRecognizerInfo(new CharsetRecog_IBM420_ar_rtl(), FALSE), 1.122 + new CSRecognizerInfo(new CharsetRecog_IBM420_ar_ltr(), FALSE) 1.123 + }; 1.124 + int32_t rCount = ARRAY_SIZE(tempArray); 1.125 + 1.126 + fCSRecognizers = NEW_ARRAY(CSRecognizerInfo *, rCount); 1.127 + 1.128 + if (fCSRecognizers == NULL) { 1.129 + status = U_MEMORY_ALLOCATION_ERROR; 1.130 + } 1.131 + else { 1.132 + fCSRecognizers_size = rCount; 1.133 + for (int32_t r = 0; r < rCount; r += 1) { 1.134 + fCSRecognizers[r] = tempArray[r]; 1.135 + if (fCSRecognizers[r] == NULL) { 1.136 + status = U_MEMORY_ALLOCATION_ERROR; 1.137 + } 1.138 + } 1.139 + } 1.140 +} 1.141 + 1.142 +U_CDECL_END 1.143 + 1.144 +U_NAMESPACE_BEGIN 1.145 + 1.146 +void CharsetDetector::setRecognizers(UErrorCode &status) 1.147 +{ 1.148 + umtx_initOnce(gCSRecognizersInitOnce, &initRecognizers, status); 1.149 +} 1.150 + 1.151 +CharsetDetector::CharsetDetector(UErrorCode &status) 1.152 + : textIn(new InputText(status)), resultArray(NULL), 1.153 + resultCount(0), fStripTags(FALSE), fFreshTextSet(FALSE), 1.154 + fEnabledRecognizers(NULL) 1.155 +{ 1.156 + if (U_FAILURE(status)) { 1.157 + return; 1.158 + } 1.159 + 1.160 + setRecognizers(status); 1.161 + 1.162 + if (U_FAILURE(status)) { 1.163 + return; 1.164 + } 1.165 + 1.166 + resultArray = (CharsetMatch **)uprv_malloc(sizeof(CharsetMatch *)*fCSRecognizers_size); 1.167 + 1.168 + if (resultArray == NULL) { 1.169 + status = U_MEMORY_ALLOCATION_ERROR; 1.170 + return; 1.171 + } 1.172 + 1.173 + for(int32_t i = 0; i < fCSRecognizers_size; i += 1) { 1.174 + resultArray[i] = new CharsetMatch(); 1.175 + 1.176 + if (resultArray[i] == NULL) { 1.177 + status = U_MEMORY_ALLOCATION_ERROR; 1.178 + break; 1.179 + } 1.180 + } 1.181 +} 1.182 + 1.183 +CharsetDetector::~CharsetDetector() 1.184 +{ 1.185 + delete textIn; 1.186 + 1.187 + for(int32_t i = 0; i < fCSRecognizers_size; i += 1) { 1.188 + delete resultArray[i]; 1.189 + } 1.190 + 1.191 + uprv_free(resultArray); 1.192 + 1.193 + if (fEnabledRecognizers) { 1.194 + uprv_free(fEnabledRecognizers); 1.195 + } 1.196 +} 1.197 + 1.198 +void CharsetDetector::setText(const char *in, int32_t len) 1.199 +{ 1.200 + textIn->setText(in, len); 1.201 + fFreshTextSet = TRUE; 1.202 +} 1.203 + 1.204 +UBool CharsetDetector::setStripTagsFlag(UBool flag) 1.205 +{ 1.206 + UBool temp = fStripTags; 1.207 + fStripTags = flag; 1.208 + fFreshTextSet = TRUE; 1.209 + return temp; 1.210 +} 1.211 + 1.212 +UBool CharsetDetector::getStripTagsFlag() const 1.213 +{ 1.214 + return fStripTags; 1.215 +} 1.216 + 1.217 +void CharsetDetector::setDeclaredEncoding(const char *encoding, int32_t len) const 1.218 +{ 1.219 + textIn->setDeclaredEncoding(encoding,len); 1.220 +} 1.221 + 1.222 +int32_t CharsetDetector::getDetectableCount() 1.223 +{ 1.224 + UErrorCode status = U_ZERO_ERROR; 1.225 + 1.226 + setRecognizers(status); 1.227 + 1.228 + return fCSRecognizers_size; 1.229 +} 1.230 + 1.231 +const CharsetMatch *CharsetDetector::detect(UErrorCode &status) 1.232 +{ 1.233 + int32_t maxMatchesFound = 0; 1.234 + 1.235 + detectAll(maxMatchesFound, status); 1.236 + 1.237 + if(maxMatchesFound > 0) { 1.238 + return resultArray[0]; 1.239 + } else { 1.240 + return NULL; 1.241 + } 1.242 +} 1.243 + 1.244 +const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound, UErrorCode &status) 1.245 +{ 1.246 + if(!textIn->isSet()) { 1.247 + status = U_MISSING_RESOURCE_ERROR;// TODO: Need to set proper status code for input text not set 1.248 + 1.249 + return NULL; 1.250 + } else if (fFreshTextSet) { 1.251 + CharsetRecognizer *csr; 1.252 + int32_t i; 1.253 + 1.254 + textIn->MungeInput(fStripTags); 1.255 + 1.256 + // Iterate over all possible charsets, remember all that 1.257 + // give a match quality > 0. 1.258 + resultCount = 0; 1.259 + for (i = 0; i < fCSRecognizers_size; i += 1) { 1.260 + csr = fCSRecognizers[i]->recognizer; 1.261 + if (csr->match(textIn, resultArray[resultCount])) { 1.262 + resultCount++; 1.263 + } 1.264 + } 1.265 + 1.266 + if (resultCount > 1) { 1.267 + uprv_sortArray(resultArray, resultCount, sizeof resultArray[0], charsetMatchComparator, NULL, TRUE, &status); 1.268 + } 1.269 + fFreshTextSet = FALSE; 1.270 + } 1.271 + 1.272 + maxMatchesFound = resultCount; 1.273 + 1.274 + return resultArray; 1.275 +} 1.276 + 1.277 +void CharsetDetector::setDetectableCharset(const char *encoding, UBool enabled, UErrorCode &status) 1.278 +{ 1.279 + if (U_FAILURE(status)) { 1.280 + return; 1.281 + } 1.282 + 1.283 + int32_t modIdx = -1; 1.284 + UBool isDefaultVal = FALSE; 1.285 + for (int32_t i = 0; i < fCSRecognizers_size; i++) { 1.286 + CSRecognizerInfo *csrinfo = fCSRecognizers[i]; 1.287 + if (uprv_strcmp(csrinfo->recognizer->getName(), encoding) == 0) { 1.288 + modIdx = i; 1.289 + isDefaultVal = (csrinfo->isDefaultEnabled == enabled); 1.290 + break; 1.291 + } 1.292 + } 1.293 + if (modIdx < 0) { 1.294 + // No matching encoding found 1.295 + status = U_ILLEGAL_ARGUMENT_ERROR; 1.296 + return; 1.297 + } 1.298 + 1.299 + if (fEnabledRecognizers == NULL && !isDefaultVal) { 1.300 + // Create an array storing the non default setting 1.301 + fEnabledRecognizers = NEW_ARRAY(UBool, fCSRecognizers_size); 1.302 + if (fEnabledRecognizers == NULL) { 1.303 + status = U_MEMORY_ALLOCATION_ERROR; 1.304 + return; 1.305 + } 1.306 + // Initialize the array with default info 1.307 + for (int32_t i = 0; i < fCSRecognizers_size; i++) { 1.308 + fEnabledRecognizers[i] = fCSRecognizers[i]->isDefaultEnabled; 1.309 + } 1.310 + } 1.311 + 1.312 + if (fEnabledRecognizers != NULL) { 1.313 + fEnabledRecognizers[modIdx] = enabled; 1.314 + } 1.315 +} 1.316 + 1.317 +/*const char *CharsetDetector::getCharsetName(int32_t index, UErrorCode &status) const 1.318 +{ 1.319 + if( index > fCSRecognizers_size-1 || index < 0) { 1.320 + status = U_INDEX_OUTOFBOUNDS_ERROR; 1.321 + 1.322 + return 0; 1.323 + } else { 1.324 + return fCSRecognizers[index]->getName(); 1.325 + } 1.326 +}*/ 1.327 + 1.328 +U_NAMESPACE_END 1.329 + 1.330 +U_CDECL_BEGIN 1.331 +typedef struct { 1.332 + int32_t currIndex; 1.333 + UBool all; 1.334 + UBool *enabledRecognizers; 1.335 +} Context; 1.336 + 1.337 + 1.338 + 1.339 +static void U_CALLCONV 1.340 +enumClose(UEnumeration *en) { 1.341 + if(en->context != NULL) { 1.342 + DELETE_ARRAY(en->context); 1.343 + } 1.344 + 1.345 + DELETE_ARRAY(en); 1.346 +} 1.347 + 1.348 +static int32_t U_CALLCONV 1.349 +enumCount(UEnumeration *en, UErrorCode *) { 1.350 + if (((Context *)en->context)->all) { 1.351 + // ucsdet_getAllDetectableCharsets, all charset detector names 1.352 + return fCSRecognizers_size; 1.353 + } 1.354 + 1.355 + // Otherwise, ucsdet_getDetectableCharsets - only enabled ones 1.356 + int32_t count = 0; 1.357 + UBool *enabledArray = ((Context *)en->context)->enabledRecognizers; 1.358 + if (enabledArray != NULL) { 1.359 + // custom set 1.360 + for (int32_t i = 0; i < fCSRecognizers_size; i++) { 1.361 + if (enabledArray[i]) { 1.362 + count++; 1.363 + } 1.364 + } 1.365 + } else { 1.366 + // default set 1.367 + for (int32_t i = 0; i < fCSRecognizers_size; i++) { 1.368 + if (fCSRecognizers[i]->isDefaultEnabled) { 1.369 + count++; 1.370 + } 1.371 + } 1.372 + } 1.373 + return count; 1.374 +} 1.375 + 1.376 +static const char* U_CALLCONV 1.377 +enumNext(UEnumeration *en, int32_t *resultLength, UErrorCode * /*status*/) { 1.378 + const char *currName = NULL; 1.379 + 1.380 + if (((Context *)en->context)->currIndex < fCSRecognizers_size) { 1.381 + if (((Context *)en->context)->all) { 1.382 + // ucsdet_getAllDetectableCharsets, all charset detector names 1.383 + currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName(); 1.384 + ((Context *)en->context)->currIndex++; 1.385 + } else { 1.386 + // ucsdet_getDetectableCharsets 1.387 + UBool *enabledArray = ((Context *)en->context)->enabledRecognizers; 1.388 + if (enabledArray != NULL) { 1.389 + // custome set 1.390 + while (currName == NULL && ((Context *)en->context)->currIndex < fCSRecognizers_size) { 1.391 + if (enabledArray[((Context *)en->context)->currIndex]) { 1.392 + currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName(); 1.393 + } 1.394 + ((Context *)en->context)->currIndex++; 1.395 + } 1.396 + } else { 1.397 + // default set 1.398 + while (currName == NULL && ((Context *)en->context)->currIndex < fCSRecognizers_size) { 1.399 + if (fCSRecognizers[((Context *)en->context)->currIndex]->isDefaultEnabled) { 1.400 + currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName(); 1.401 + } 1.402 + ((Context *)en->context)->currIndex++; 1.403 + } 1.404 + } 1.405 + } 1.406 + } 1.407 + 1.408 + if(resultLength != NULL) { 1.409 + *resultLength = currName == NULL ? 0 : (int32_t)uprv_strlen(currName); 1.410 + } 1.411 + 1.412 + return currName; 1.413 +} 1.414 + 1.415 + 1.416 +static void U_CALLCONV 1.417 +enumReset(UEnumeration *en, UErrorCode *) { 1.418 + ((Context *)en->context)->currIndex = 0; 1.419 +} 1.420 + 1.421 +static const UEnumeration gCSDetEnumeration = { 1.422 + NULL, 1.423 + NULL, 1.424 + enumClose, 1.425 + enumCount, 1.426 + uenum_unextDefault, 1.427 + enumNext, 1.428 + enumReset 1.429 +}; 1.430 + 1.431 +U_CDECL_END 1.432 + 1.433 +U_NAMESPACE_BEGIN 1.434 + 1.435 +UEnumeration * CharsetDetector::getAllDetectableCharsets(UErrorCode &status) 1.436 +{ 1.437 + 1.438 + /* Initialize recognized charsets. */ 1.439 + setRecognizers(status); 1.440 + 1.441 + if(U_FAILURE(status)) { 1.442 + return 0; 1.443 + } 1.444 + 1.445 + UEnumeration *en = NEW_ARRAY(UEnumeration, 1); 1.446 + if (en == NULL) { 1.447 + status = U_MEMORY_ALLOCATION_ERROR; 1.448 + return 0; 1.449 + } 1.450 + memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration)); 1.451 + en->context = (void*)NEW_ARRAY(Context, 1); 1.452 + if (en->context == NULL) { 1.453 + status = U_MEMORY_ALLOCATION_ERROR; 1.454 + DELETE_ARRAY(en); 1.455 + return 0; 1.456 + } 1.457 + uprv_memset(en->context, 0, sizeof(Context)); 1.458 + ((Context*)en->context)->all = TRUE; 1.459 + return en; 1.460 +} 1.461 + 1.462 +UEnumeration * CharsetDetector::getDetectableCharsets(UErrorCode &status) const 1.463 +{ 1.464 + if(U_FAILURE(status)) { 1.465 + return 0; 1.466 + } 1.467 + 1.468 + UEnumeration *en = NEW_ARRAY(UEnumeration, 1); 1.469 + if (en == NULL) { 1.470 + status = U_MEMORY_ALLOCATION_ERROR; 1.471 + return 0; 1.472 + } 1.473 + memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration)); 1.474 + en->context = (void*)NEW_ARRAY(Context, 1); 1.475 + if (en->context == NULL) { 1.476 + status = U_MEMORY_ALLOCATION_ERROR; 1.477 + DELETE_ARRAY(en); 1.478 + return 0; 1.479 + } 1.480 + uprv_memset(en->context, 0, sizeof(Context)); 1.481 + ((Context*)en->context)->all = FALSE; 1.482 + ((Context*)en->context)->enabledRecognizers = fEnabledRecognizers; 1.483 + return en; 1.484 +} 1.485 + 1.486 +U_NAMESPACE_END 1.487 + 1.488 +#endif