michael@0: /* michael@0: ********************************************************************** michael@0: * Copyright (C) 2005-2013, International Business Machines michael@0: * Corporation and others. All Rights Reserved. michael@0: ********************************************************************** michael@0: */ michael@0: michael@0: #include "unicode/utypes.h" michael@0: michael@0: #if !UCONFIG_NO_CONVERSION michael@0: michael@0: #include "unicode/ucsdet.h" michael@0: michael@0: #include "csdetect.h" michael@0: #include "csmatch.h" michael@0: #include "uenumimp.h" michael@0: michael@0: #include "cmemory.h" michael@0: #include "cstring.h" michael@0: #include "umutex.h" michael@0: #include "ucln_in.h" michael@0: #include "uarrsort.h" michael@0: #include "inputext.h" michael@0: #include "csrsbcs.h" michael@0: #include "csrmbcs.h" michael@0: #include "csrutf8.h" michael@0: #include "csrucode.h" michael@0: #include "csr2022.h" michael@0: michael@0: #define ARRAY_SIZE(array) (sizeof array / sizeof array[0]) michael@0: michael@0: #define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type)) michael@0: #define DELETE_ARRAY(array) uprv_free((void *) (array)) michael@0: michael@0: U_NAMESPACE_BEGIN michael@0: michael@0: struct CSRecognizerInfo : public UMemory { michael@0: CSRecognizerInfo(CharsetRecognizer *recognizer, UBool isDefaultEnabled) michael@0: : recognizer(recognizer), isDefaultEnabled(isDefaultEnabled) {}; michael@0: michael@0: ~CSRecognizerInfo() {delete recognizer;}; michael@0: michael@0: CharsetRecognizer *recognizer; michael@0: UBool isDefaultEnabled; michael@0: }; michael@0: michael@0: U_NAMESPACE_END michael@0: michael@0: static icu::CSRecognizerInfo **fCSRecognizers = NULL; michael@0: static icu::UInitOnce gCSRecognizersInitOnce; michael@0: static int32_t fCSRecognizers_size = 0; michael@0: michael@0: U_CDECL_BEGIN michael@0: static UBool U_CALLCONV csdet_cleanup(void) michael@0: { michael@0: U_NAMESPACE_USE michael@0: if (fCSRecognizers != NULL) { michael@0: for(int32_t r = 0; r < fCSRecognizers_size; r += 1) { michael@0: delete fCSRecognizers[r]; michael@0: fCSRecognizers[r] = NULL; michael@0: } michael@0: michael@0: DELETE_ARRAY(fCSRecognizers); michael@0: fCSRecognizers = NULL; michael@0: fCSRecognizers_size = 0; michael@0: } michael@0: gCSRecognizersInitOnce.reset(); michael@0: michael@0: return TRUE; michael@0: } michael@0: michael@0: static int32_t U_CALLCONV michael@0: charsetMatchComparator(const void * /*context*/, const void *left, const void *right) michael@0: { michael@0: U_NAMESPACE_USE michael@0: michael@0: const CharsetMatch **csm_l = (const CharsetMatch **) left; michael@0: const CharsetMatch **csm_r = (const CharsetMatch **) right; michael@0: michael@0: // NOTE: compare is backwards to sort from highest to lowest. michael@0: return (*csm_r)->getConfidence() - (*csm_l)->getConfidence(); michael@0: } michael@0: michael@0: static void U_CALLCONV initRecognizers(UErrorCode &status) { michael@0: U_NAMESPACE_USE michael@0: ucln_i18n_registerCleanup(UCLN_I18N_CSDET, csdet_cleanup); michael@0: CSRecognizerInfo *tempArray[] = { michael@0: new CSRecognizerInfo(new CharsetRecog_UTF8(), TRUE), michael@0: michael@0: new CSRecognizerInfo(new CharsetRecog_UTF_16_BE(), TRUE), michael@0: new CSRecognizerInfo(new CharsetRecog_UTF_16_LE(), TRUE), michael@0: new CSRecognizerInfo(new CharsetRecog_UTF_32_BE(), TRUE), michael@0: new CSRecognizerInfo(new CharsetRecog_UTF_32_LE(), TRUE), michael@0: michael@0: new CSRecognizerInfo(new CharsetRecog_8859_1(), TRUE), michael@0: new CSRecognizerInfo(new CharsetRecog_8859_2(), TRUE), michael@0: new CSRecognizerInfo(new CharsetRecog_8859_5_ru(), TRUE), michael@0: new CSRecognizerInfo(new CharsetRecog_8859_6_ar(), TRUE), michael@0: new CSRecognizerInfo(new CharsetRecog_8859_7_el(), TRUE), michael@0: new CSRecognizerInfo(new CharsetRecog_8859_8_I_he(), TRUE), michael@0: new CSRecognizerInfo(new CharsetRecog_8859_8_he(), TRUE), michael@0: new CSRecognizerInfo(new CharsetRecog_windows_1251(), TRUE), michael@0: new CSRecognizerInfo(new CharsetRecog_windows_1256(), TRUE), michael@0: new CSRecognizerInfo(new CharsetRecog_KOI8_R(), TRUE), michael@0: new CSRecognizerInfo(new CharsetRecog_8859_9_tr(), TRUE), michael@0: new CSRecognizerInfo(new CharsetRecog_sjis(), TRUE), michael@0: new CSRecognizerInfo(new CharsetRecog_gb_18030(), TRUE), michael@0: new CSRecognizerInfo(new CharsetRecog_euc_jp(), TRUE), michael@0: new CSRecognizerInfo(new CharsetRecog_euc_kr(), TRUE), michael@0: new CSRecognizerInfo(new CharsetRecog_big5(), TRUE), michael@0: michael@0: new CSRecognizerInfo(new CharsetRecog_2022JP(), TRUE), michael@0: new CSRecognizerInfo(new CharsetRecog_2022KR(), TRUE), michael@0: new CSRecognizerInfo(new CharsetRecog_2022CN(), TRUE), michael@0: michael@0: new CSRecognizerInfo(new CharsetRecog_IBM424_he_rtl(), FALSE), michael@0: new CSRecognizerInfo(new CharsetRecog_IBM424_he_ltr(), FALSE), michael@0: new CSRecognizerInfo(new CharsetRecog_IBM420_ar_rtl(), FALSE), michael@0: new CSRecognizerInfo(new CharsetRecog_IBM420_ar_ltr(), FALSE) michael@0: }; michael@0: int32_t rCount = ARRAY_SIZE(tempArray); michael@0: michael@0: fCSRecognizers = NEW_ARRAY(CSRecognizerInfo *, rCount); michael@0: michael@0: if (fCSRecognizers == NULL) { michael@0: status = U_MEMORY_ALLOCATION_ERROR; michael@0: } michael@0: else { michael@0: fCSRecognizers_size = rCount; michael@0: for (int32_t r = 0; r < rCount; r += 1) { michael@0: fCSRecognizers[r] = tempArray[r]; michael@0: if (fCSRecognizers[r] == NULL) { michael@0: status = U_MEMORY_ALLOCATION_ERROR; michael@0: } michael@0: } michael@0: } michael@0: } michael@0: michael@0: U_CDECL_END michael@0: michael@0: U_NAMESPACE_BEGIN michael@0: michael@0: void CharsetDetector::setRecognizers(UErrorCode &status) michael@0: { michael@0: umtx_initOnce(gCSRecognizersInitOnce, &initRecognizers, status); michael@0: } michael@0: michael@0: CharsetDetector::CharsetDetector(UErrorCode &status) michael@0: : textIn(new InputText(status)), resultArray(NULL), michael@0: resultCount(0), fStripTags(FALSE), fFreshTextSet(FALSE), michael@0: fEnabledRecognizers(NULL) michael@0: { michael@0: if (U_FAILURE(status)) { michael@0: return; michael@0: } michael@0: michael@0: setRecognizers(status); michael@0: michael@0: if (U_FAILURE(status)) { michael@0: return; michael@0: } michael@0: michael@0: resultArray = (CharsetMatch **)uprv_malloc(sizeof(CharsetMatch *)*fCSRecognizers_size); michael@0: michael@0: if (resultArray == NULL) { michael@0: status = U_MEMORY_ALLOCATION_ERROR; michael@0: return; michael@0: } michael@0: michael@0: for(int32_t i = 0; i < fCSRecognizers_size; i += 1) { michael@0: resultArray[i] = new CharsetMatch(); michael@0: michael@0: if (resultArray[i] == NULL) { michael@0: status = U_MEMORY_ALLOCATION_ERROR; michael@0: break; michael@0: } michael@0: } michael@0: } michael@0: michael@0: CharsetDetector::~CharsetDetector() michael@0: { michael@0: delete textIn; michael@0: michael@0: for(int32_t i = 0; i < fCSRecognizers_size; i += 1) { michael@0: delete resultArray[i]; michael@0: } michael@0: michael@0: uprv_free(resultArray); michael@0: michael@0: if (fEnabledRecognizers) { michael@0: uprv_free(fEnabledRecognizers); michael@0: } michael@0: } michael@0: michael@0: void CharsetDetector::setText(const char *in, int32_t len) michael@0: { michael@0: textIn->setText(in, len); michael@0: fFreshTextSet = TRUE; michael@0: } michael@0: michael@0: UBool CharsetDetector::setStripTagsFlag(UBool flag) michael@0: { michael@0: UBool temp = fStripTags; michael@0: fStripTags = flag; michael@0: fFreshTextSet = TRUE; michael@0: return temp; michael@0: } michael@0: michael@0: UBool CharsetDetector::getStripTagsFlag() const michael@0: { michael@0: return fStripTags; michael@0: } michael@0: michael@0: void CharsetDetector::setDeclaredEncoding(const char *encoding, int32_t len) const michael@0: { michael@0: textIn->setDeclaredEncoding(encoding,len); michael@0: } michael@0: michael@0: int32_t CharsetDetector::getDetectableCount() michael@0: { michael@0: UErrorCode status = U_ZERO_ERROR; michael@0: michael@0: setRecognizers(status); michael@0: michael@0: return fCSRecognizers_size; michael@0: } michael@0: michael@0: const CharsetMatch *CharsetDetector::detect(UErrorCode &status) michael@0: { michael@0: int32_t maxMatchesFound = 0; michael@0: michael@0: detectAll(maxMatchesFound, status); michael@0: michael@0: if(maxMatchesFound > 0) { michael@0: return resultArray[0]; michael@0: } else { michael@0: return NULL; michael@0: } michael@0: } michael@0: michael@0: const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound, UErrorCode &status) michael@0: { michael@0: if(!textIn->isSet()) { michael@0: status = U_MISSING_RESOURCE_ERROR;// TODO: Need to set proper status code for input text not set michael@0: michael@0: return NULL; michael@0: } else if (fFreshTextSet) { michael@0: CharsetRecognizer *csr; michael@0: int32_t i; michael@0: michael@0: textIn->MungeInput(fStripTags); michael@0: michael@0: // Iterate over all possible charsets, remember all that michael@0: // give a match quality > 0. michael@0: resultCount = 0; michael@0: for (i = 0; i < fCSRecognizers_size; i += 1) { michael@0: csr = fCSRecognizers[i]->recognizer; michael@0: if (csr->match(textIn, resultArray[resultCount])) { michael@0: resultCount++; michael@0: } michael@0: } michael@0: michael@0: if (resultCount > 1) { michael@0: uprv_sortArray(resultArray, resultCount, sizeof resultArray[0], charsetMatchComparator, NULL, TRUE, &status); michael@0: } michael@0: fFreshTextSet = FALSE; michael@0: } michael@0: michael@0: maxMatchesFound = resultCount; michael@0: michael@0: return resultArray; michael@0: } michael@0: michael@0: void CharsetDetector::setDetectableCharset(const char *encoding, UBool enabled, UErrorCode &status) michael@0: { michael@0: if (U_FAILURE(status)) { michael@0: return; michael@0: } michael@0: michael@0: int32_t modIdx = -1; michael@0: UBool isDefaultVal = FALSE; michael@0: for (int32_t i = 0; i < fCSRecognizers_size; i++) { michael@0: CSRecognizerInfo *csrinfo = fCSRecognizers[i]; michael@0: if (uprv_strcmp(csrinfo->recognizer->getName(), encoding) == 0) { michael@0: modIdx = i; michael@0: isDefaultVal = (csrinfo->isDefaultEnabled == enabled); michael@0: break; michael@0: } michael@0: } michael@0: if (modIdx < 0) { michael@0: // No matching encoding found michael@0: status = U_ILLEGAL_ARGUMENT_ERROR; michael@0: return; michael@0: } michael@0: michael@0: if (fEnabledRecognizers == NULL && !isDefaultVal) { michael@0: // Create an array storing the non default setting michael@0: fEnabledRecognizers = NEW_ARRAY(UBool, fCSRecognizers_size); michael@0: if (fEnabledRecognizers == NULL) { michael@0: status = U_MEMORY_ALLOCATION_ERROR; michael@0: return; michael@0: } michael@0: // Initialize the array with default info michael@0: for (int32_t i = 0; i < fCSRecognizers_size; i++) { michael@0: fEnabledRecognizers[i] = fCSRecognizers[i]->isDefaultEnabled; michael@0: } michael@0: } michael@0: michael@0: if (fEnabledRecognizers != NULL) { michael@0: fEnabledRecognizers[modIdx] = enabled; michael@0: } michael@0: } michael@0: michael@0: /*const char *CharsetDetector::getCharsetName(int32_t index, UErrorCode &status) const michael@0: { michael@0: if( index > fCSRecognizers_size-1 || index < 0) { michael@0: status = U_INDEX_OUTOFBOUNDS_ERROR; michael@0: michael@0: return 0; michael@0: } else { michael@0: return fCSRecognizers[index]->getName(); michael@0: } michael@0: }*/ michael@0: michael@0: U_NAMESPACE_END michael@0: michael@0: U_CDECL_BEGIN michael@0: typedef struct { michael@0: int32_t currIndex; michael@0: UBool all; michael@0: UBool *enabledRecognizers; michael@0: } Context; michael@0: michael@0: michael@0: michael@0: static void U_CALLCONV michael@0: enumClose(UEnumeration *en) { michael@0: if(en->context != NULL) { michael@0: DELETE_ARRAY(en->context); michael@0: } michael@0: michael@0: DELETE_ARRAY(en); michael@0: } michael@0: michael@0: static int32_t U_CALLCONV michael@0: enumCount(UEnumeration *en, UErrorCode *) { michael@0: if (((Context *)en->context)->all) { michael@0: // ucsdet_getAllDetectableCharsets, all charset detector names michael@0: return fCSRecognizers_size; michael@0: } michael@0: michael@0: // Otherwise, ucsdet_getDetectableCharsets - only enabled ones michael@0: int32_t count = 0; michael@0: UBool *enabledArray = ((Context *)en->context)->enabledRecognizers; michael@0: if (enabledArray != NULL) { michael@0: // custom set michael@0: for (int32_t i = 0; i < fCSRecognizers_size; i++) { michael@0: if (enabledArray[i]) { michael@0: count++; michael@0: } michael@0: } michael@0: } else { michael@0: // default set michael@0: for (int32_t i = 0; i < fCSRecognizers_size; i++) { michael@0: if (fCSRecognizers[i]->isDefaultEnabled) { michael@0: count++; michael@0: } michael@0: } michael@0: } michael@0: return count; michael@0: } michael@0: michael@0: static const char* U_CALLCONV michael@0: enumNext(UEnumeration *en, int32_t *resultLength, UErrorCode * /*status*/) { michael@0: const char *currName = NULL; michael@0: michael@0: if (((Context *)en->context)->currIndex < fCSRecognizers_size) { michael@0: if (((Context *)en->context)->all) { michael@0: // ucsdet_getAllDetectableCharsets, all charset detector names michael@0: currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName(); michael@0: ((Context *)en->context)->currIndex++; michael@0: } else { michael@0: // ucsdet_getDetectableCharsets michael@0: UBool *enabledArray = ((Context *)en->context)->enabledRecognizers; michael@0: if (enabledArray != NULL) { michael@0: // custome set michael@0: while (currName == NULL && ((Context *)en->context)->currIndex < fCSRecognizers_size) { michael@0: if (enabledArray[((Context *)en->context)->currIndex]) { michael@0: currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName(); michael@0: } michael@0: ((Context *)en->context)->currIndex++; michael@0: } michael@0: } else { michael@0: // default set michael@0: while (currName == NULL && ((Context *)en->context)->currIndex < fCSRecognizers_size) { michael@0: if (fCSRecognizers[((Context *)en->context)->currIndex]->isDefaultEnabled) { michael@0: currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName(); michael@0: } michael@0: ((Context *)en->context)->currIndex++; michael@0: } michael@0: } michael@0: } michael@0: } michael@0: michael@0: if(resultLength != NULL) { michael@0: *resultLength = currName == NULL ? 0 : (int32_t)uprv_strlen(currName); michael@0: } michael@0: michael@0: return currName; michael@0: } michael@0: michael@0: michael@0: static void U_CALLCONV michael@0: enumReset(UEnumeration *en, UErrorCode *) { michael@0: ((Context *)en->context)->currIndex = 0; michael@0: } michael@0: michael@0: static const UEnumeration gCSDetEnumeration = { michael@0: NULL, michael@0: NULL, michael@0: enumClose, michael@0: enumCount, michael@0: uenum_unextDefault, michael@0: enumNext, michael@0: enumReset michael@0: }; michael@0: michael@0: U_CDECL_END michael@0: michael@0: U_NAMESPACE_BEGIN michael@0: michael@0: UEnumeration * CharsetDetector::getAllDetectableCharsets(UErrorCode &status) michael@0: { michael@0: michael@0: /* Initialize recognized charsets. */ michael@0: setRecognizers(status); michael@0: michael@0: if(U_FAILURE(status)) { michael@0: return 0; michael@0: } michael@0: michael@0: UEnumeration *en = NEW_ARRAY(UEnumeration, 1); michael@0: if (en == NULL) { michael@0: status = U_MEMORY_ALLOCATION_ERROR; michael@0: return 0; michael@0: } michael@0: memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration)); michael@0: en->context = (void*)NEW_ARRAY(Context, 1); michael@0: if (en->context == NULL) { michael@0: status = U_MEMORY_ALLOCATION_ERROR; michael@0: DELETE_ARRAY(en); michael@0: return 0; michael@0: } michael@0: uprv_memset(en->context, 0, sizeof(Context)); michael@0: ((Context*)en->context)->all = TRUE; michael@0: return en; michael@0: } michael@0: michael@0: UEnumeration * CharsetDetector::getDetectableCharsets(UErrorCode &status) const michael@0: { michael@0: if(U_FAILURE(status)) { michael@0: return 0; michael@0: } michael@0: michael@0: UEnumeration *en = NEW_ARRAY(UEnumeration, 1); michael@0: if (en == NULL) { michael@0: status = U_MEMORY_ALLOCATION_ERROR; michael@0: return 0; michael@0: } michael@0: memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration)); michael@0: en->context = (void*)NEW_ARRAY(Context, 1); michael@0: if (en->context == NULL) { michael@0: status = U_MEMORY_ALLOCATION_ERROR; michael@0: DELETE_ARRAY(en); michael@0: return 0; michael@0: } michael@0: uprv_memset(en->context, 0, sizeof(Context)); michael@0: ((Context*)en->context)->all = FALSE; michael@0: ((Context*)en->context)->enabledRecognizers = fEnabledRecognizers; michael@0: return en; michael@0: } michael@0: michael@0: U_NAMESPACE_END michael@0: michael@0: #endif