michael@0: /* michael@0: ********************************************************************** michael@0: * Copyright (C) 2012-2013, International Business Machines michael@0: * Corporation and others. All Rights Reserved. michael@0: ********************************************************************** michael@0: */ michael@0: michael@0: #include "unicode/utypes.h" michael@0: michael@0: #include "unicode/uchar.h" michael@0: #include "unicode/utf16.h" michael@0: michael@0: #include "identifier_info.h" michael@0: #include "mutex.h" michael@0: #include "scriptset.h" michael@0: #include "ucln_in.h" michael@0: #include "uvector.h" michael@0: michael@0: U_NAMESPACE_BEGIN michael@0: michael@0: #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) michael@0: michael@0: static UMutex gInitMutex = U_MUTEX_INITIALIZER; michael@0: static UBool gStaticsAreInitialized = FALSE; michael@0: michael@0: UnicodeSet *IdentifierInfo::ASCII; michael@0: ScriptSet *IdentifierInfo::JAPANESE; michael@0: ScriptSet *IdentifierInfo::CHINESE; michael@0: ScriptSet *IdentifierInfo::KOREAN; michael@0: ScriptSet *IdentifierInfo::CONFUSABLE_WITH_LATIN; michael@0: michael@0: UBool IdentifierInfo::cleanup() { michael@0: delete ASCII; michael@0: ASCII = NULL; michael@0: delete JAPANESE; michael@0: JAPANESE = NULL; michael@0: delete CHINESE; michael@0: CHINESE = NULL; michael@0: delete KOREAN; michael@0: KOREAN = NULL; michael@0: delete CONFUSABLE_WITH_LATIN; michael@0: CONFUSABLE_WITH_LATIN = NULL; michael@0: gStaticsAreInitialized = FALSE; michael@0: return TRUE; michael@0: } michael@0: michael@0: U_CDECL_BEGIN michael@0: static UBool U_CALLCONV michael@0: IdentifierInfo_cleanup(void) { michael@0: return IdentifierInfo::cleanup(); michael@0: } michael@0: U_CDECL_END michael@0: michael@0: michael@0: IdentifierInfo::IdentifierInfo(UErrorCode &status): michael@0: fIdentifier(NULL), fRequiredScripts(NULL), fScriptSetSet(NULL), michael@0: fCommonAmongAlternates(NULL), fNumerics(NULL), fIdentifierProfile(NULL) { michael@0: if (U_FAILURE(status)) { michael@0: return; michael@0: } michael@0: { michael@0: Mutex lock(&gInitMutex); michael@0: if (!gStaticsAreInitialized) { michael@0: ASCII = new UnicodeSet(0, 0x7f); michael@0: JAPANESE = new ScriptSet(); michael@0: CHINESE = new ScriptSet(); michael@0: KOREAN = new ScriptSet(); michael@0: CONFUSABLE_WITH_LATIN = new ScriptSet(); michael@0: if (ASCII == NULL || JAPANESE == NULL || CHINESE == NULL || KOREAN == NULL michael@0: || CONFUSABLE_WITH_LATIN == NULL) { michael@0: status = U_MEMORY_ALLOCATION_ERROR; michael@0: return; michael@0: } michael@0: ASCII->freeze(); michael@0: JAPANESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HIRAGANA, status) michael@0: .set(USCRIPT_KATAKANA, status); michael@0: CHINESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_BOPOMOFO, status); michael@0: KOREAN->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HANGUL, status); michael@0: CONFUSABLE_WITH_LATIN->set(USCRIPT_CYRILLIC, status).set(USCRIPT_GREEK, status) michael@0: .set(USCRIPT_CHEROKEE, status); michael@0: ucln_i18n_registerCleanup(UCLN_I18N_IDENTIFIER_INFO, IdentifierInfo_cleanup); michael@0: gStaticsAreInitialized = TRUE; michael@0: } michael@0: } michael@0: fIdentifier = new UnicodeString(); michael@0: fRequiredScripts = new ScriptSet(); michael@0: fScriptSetSet = uhash_open(uhash_hashScriptSet, uhash_compareScriptSet, NULL, &status); michael@0: uhash_setKeyDeleter(fScriptSetSet, uhash_deleteScriptSet); michael@0: fCommonAmongAlternates = new ScriptSet(); michael@0: fNumerics = new UnicodeSet(); michael@0: fIdentifierProfile = new UnicodeSet(0, 0x10FFFF); michael@0: michael@0: if (U_SUCCESS(status) && (fIdentifier == NULL || fRequiredScripts == NULL || fScriptSetSet == NULL || michael@0: fCommonAmongAlternates == NULL || fNumerics == NULL || fIdentifierProfile == NULL)) { michael@0: status = U_MEMORY_ALLOCATION_ERROR; michael@0: } michael@0: } michael@0: michael@0: IdentifierInfo::~IdentifierInfo() { michael@0: delete fIdentifier; michael@0: delete fRequiredScripts; michael@0: uhash_close(fScriptSetSet); michael@0: delete fCommonAmongAlternates; michael@0: delete fNumerics; michael@0: delete fIdentifierProfile; michael@0: } michael@0: michael@0: michael@0: IdentifierInfo &IdentifierInfo::clear() { michael@0: fRequiredScripts->resetAll(); michael@0: uhash_removeAll(fScriptSetSet); michael@0: fNumerics->clear(); michael@0: fCommonAmongAlternates->resetAll(); michael@0: return *this; michael@0: } michael@0: michael@0: michael@0: IdentifierInfo &IdentifierInfo::setIdentifierProfile(const UnicodeSet &identifierProfile) { michael@0: *fIdentifierProfile = identifierProfile; michael@0: return *this; michael@0: } michael@0: michael@0: michael@0: const UnicodeSet &IdentifierInfo::getIdentifierProfile() const { michael@0: return *fIdentifierProfile; michael@0: } michael@0: michael@0: michael@0: IdentifierInfo &IdentifierInfo::setIdentifier(const UnicodeString &identifier, UErrorCode &status) { michael@0: if (U_FAILURE(status)) { michael@0: return *this; michael@0: } michael@0: *fIdentifier = identifier; michael@0: clear(); michael@0: ScriptSet scriptsForCP; michael@0: UChar32 cp; michael@0: for (int32_t i = 0; i < identifier.length(); i += U16_LENGTH(cp)) { michael@0: cp = identifier.char32At(i); michael@0: // Store a representative character for each kind of decimal digit michael@0: if (u_charType(cp) == U_DECIMAL_DIGIT_NUMBER) { michael@0: // Just store the zero character as a representative for comparison. Unicode guarantees it is cp - value michael@0: fNumerics->add(cp - (UChar32)u_getNumericValue(cp)); michael@0: } michael@0: UScriptCode extensions[500]; michael@0: int32_t extensionsCount = uscript_getScriptExtensions(cp, extensions, LENGTHOF(extensions), &status); michael@0: if (U_FAILURE(status)) { michael@0: return *this; michael@0: } michael@0: scriptsForCP.resetAll(); michael@0: for (int32_t j=0; jUnion(scriptsForCP); michael@0: break; michael@0: default: michael@0: if (!fRequiredScripts->intersects(scriptsForCP) michael@0: && !uhash_geti(fScriptSetSet, &scriptsForCP)) { michael@0: // If the set hasn't been added already, add it michael@0: // (Add a copy, fScriptSetSet takes ownership of the copy.) michael@0: uhash_puti(fScriptSetSet, new ScriptSet(scriptsForCP), 1, &status); michael@0: } michael@0: break; michael@0: } michael@0: } michael@0: // Now make a final pass through ScriptSetSet to remove alternates that came before singles. michael@0: // [Kana], [Kana Hira] => [Kana] michael@0: // This is relatively infrequent, so doesn't have to be optimized. michael@0: // We also compute any commonalities among the alternates. michael@0: if (uhash_count(fScriptSetSet) > 0) { michael@0: fCommonAmongAlternates->setAll(); michael@0: for (int32_t it = -1;;) { michael@0: const UHashElement *nextHashEl = uhash_nextElement(fScriptSetSet, &it); michael@0: if (nextHashEl == NULL) { michael@0: break; michael@0: } michael@0: ScriptSet *next = static_cast(nextHashEl->key.pointer); michael@0: // [Kana], [Kana Hira] => [Kana] michael@0: if (fRequiredScripts->intersects(*next)) { michael@0: uhash_removeElement(fScriptSetSet, nextHashEl); michael@0: } else { michael@0: fCommonAmongAlternates->intersect(*next); michael@0: // [[Arab Syrc Thaa]; [Arab Syrc]] => [[Arab Syrc]] michael@0: for (int32_t otherIt = -1;;) { michael@0: const UHashElement *otherHashEl = uhash_nextElement(fScriptSetSet, &otherIt); michael@0: if (otherHashEl == NULL) { michael@0: break; michael@0: } michael@0: ScriptSet *other = static_cast(otherHashEl->key.pointer); michael@0: if (next != other && next->contains(*other)) { michael@0: uhash_removeElement(fScriptSetSet, nextHashEl); michael@0: break; michael@0: } michael@0: } michael@0: } michael@0: } michael@0: } michael@0: if (uhash_count(fScriptSetSet) == 0) { michael@0: fCommonAmongAlternates->resetAll(); michael@0: } michael@0: return *this; michael@0: } michael@0: michael@0: michael@0: const UnicodeString *IdentifierInfo::getIdentifier() const { michael@0: return fIdentifier; michael@0: } michael@0: michael@0: const ScriptSet *IdentifierInfo::getScripts() const { michael@0: return fRequiredScripts; michael@0: } michael@0: michael@0: const UHashtable *IdentifierInfo::getAlternates() const { michael@0: return fScriptSetSet; michael@0: } michael@0: michael@0: michael@0: const UnicodeSet *IdentifierInfo::getNumerics() const { michael@0: return fNumerics; michael@0: } michael@0: michael@0: const ScriptSet *IdentifierInfo::getCommonAmongAlternates() const { michael@0: return fCommonAmongAlternates; michael@0: } michael@0: michael@0: #if !UCONFIG_NO_NORMALIZATION michael@0: michael@0: URestrictionLevel IdentifierInfo::getRestrictionLevel(UErrorCode &status) const { michael@0: if (!fIdentifierProfile->containsAll(*fIdentifier) || getNumerics()->size() > 1) { michael@0: return USPOOF_UNRESTRICTIVE; michael@0: } michael@0: if (ASCII->containsAll(*fIdentifier)) { michael@0: return USPOOF_ASCII; michael@0: } michael@0: // This is a bit tricky. We look at a number of factors. michael@0: // The number of scripts in the text. michael@0: // Plus 1 if there is some commonality among the alternates (eg [Arab Thaa]; [Arab Syrc]) michael@0: // Plus number of alternates otherwise (this only works because we only test cardinality up to 2.) michael@0: michael@0: // Note: the requiredScripts set omits COMMON and INHERITED; they are taken out at the michael@0: // time it is created, in setIdentifier(). michael@0: int32_t cardinalityPlus = fRequiredScripts->countMembers() + michael@0: (fCommonAmongAlternates->countMembers() == 0 ? uhash_count(fScriptSetSet) : 1); michael@0: if (cardinalityPlus < 2) { michael@0: return USPOOF_HIGHLY_RESTRICTIVE; michael@0: } michael@0: if (containsWithAlternates(*JAPANESE, *fRequiredScripts) || containsWithAlternates(*CHINESE, *fRequiredScripts) michael@0: || containsWithAlternates(*KOREAN, *fRequiredScripts)) { michael@0: return USPOOF_HIGHLY_RESTRICTIVE; michael@0: } michael@0: if (cardinalityPlus == 2 && michael@0: fRequiredScripts->test(USCRIPT_LATIN, status) && michael@0: !fRequiredScripts->intersects(*CONFUSABLE_WITH_LATIN)) { michael@0: return USPOOF_MODERATELY_RESTRICTIVE; michael@0: } michael@0: return USPOOF_MINIMALLY_RESTRICTIVE; michael@0: } michael@0: michael@0: #endif /* !UCONFIG_NO_NORMALIZATION */ michael@0: michael@0: int32_t IdentifierInfo::getScriptCount() const { michael@0: // Note: Common and Inherited scripts were removed by setIdentifier(), and do not appear in fRequiredScripts. michael@0: int32_t count = fRequiredScripts->countMembers() + michael@0: (fCommonAmongAlternates->countMembers() == 0 ? uhash_count(fScriptSetSet) : 1); michael@0: return count; michael@0: } michael@0: michael@0: michael@0: michael@0: UBool IdentifierInfo::containsWithAlternates(const ScriptSet &container, const ScriptSet &containee) const { michael@0: if (!container.contains(containee)) { michael@0: return FALSE; michael@0: } michael@0: for (int32_t iter = -1; ;) { michael@0: const UHashElement *hashEl = uhash_nextElement(fScriptSetSet, &iter); michael@0: if (hashEl == NULL) { michael@0: break; michael@0: } michael@0: ScriptSet *alternatives = static_cast(hashEl->key.pointer); michael@0: if (!container.intersects(*alternatives)) { michael@0: return false; michael@0: } michael@0: } michael@0: return true; michael@0: } michael@0: michael@0: UnicodeString &IdentifierInfo::displayAlternates(UnicodeString &dest, const UHashtable *alternates, UErrorCode &status) { michael@0: UVector sorted(status); michael@0: if (U_FAILURE(status)) { michael@0: return dest; michael@0: } michael@0: for (int32_t pos = -1; ;) { michael@0: const UHashElement *el = uhash_nextElement(alternates, &pos); michael@0: if (el == NULL) { michael@0: break; michael@0: } michael@0: ScriptSet *ss = static_cast(el->key.pointer); michael@0: sorted.addElement(ss, status); michael@0: } michael@0: sorted.sort(uhash_compareScriptSet, status); michael@0: UnicodeString separator = UNICODE_STRING_SIMPLE("; "); michael@0: for (int32_t i=0; i0) { michael@0: dest.append(separator); michael@0: } michael@0: ScriptSet *ss = static_cast(sorted.elementAt(i)); michael@0: ss->displayScripts(dest); michael@0: } michael@0: return dest; michael@0: } michael@0: michael@0: U_NAMESPACE_END michael@0: