michael@0: /*
michael@0: **********************************************************************
michael@0: *   Copyright (C) 2012-2013, International Business Machines
michael@0: *   Corporation and others.  All Rights Reserved.
michael@0: **********************************************************************
michael@0: */
michael@0: 
michael@0: #include "unicode/utypes.h"
michael@0: 
michael@0: #include "unicode/uchar.h"
michael@0: #include "unicode/utf16.h"
michael@0: 
michael@0: #include "identifier_info.h"
michael@0: #include "mutex.h"
michael@0: #include "scriptset.h"
michael@0: #include "ucln_in.h"
michael@0: #include "uvector.h"
michael@0: 
michael@0: U_NAMESPACE_BEGIN
michael@0: 
michael@0: // Element count of a fixed-size array, as an int32_t.
michael@0: #define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
michael@0: 
michael@0: // Serializes the one-time construction of the shared sets below.
michael@0: static UMutex gInitMutex = U_MUTEX_INITIALIZER;
michael@0: // TRUE once the shared sets have been built; reset by IdentifierInfo::cleanup().
michael@0: static UBool gStaticsAreInitialized = FALSE;
michael@0: 
michael@0: // Shared sets. They are created by the first IdentifierInfo constructor
michael@0: // that runs and are deleted again by IdentifierInfo::cleanup().
michael@0: UnicodeSet *IdentifierInfo::ASCII = NULL;                  // code points 0..0x7f
michael@0: ScriptSet *IdentifierInfo::JAPANESE = NULL;                // Latin, Han, Hiragana, Katakana
michael@0: ScriptSet *IdentifierInfo::CHINESE = NULL;                 // Latin, Han, Bopomofo
michael@0: ScriptSet *IdentifierInfo::KOREAN = NULL;                  // Latin, Han, Hangul
michael@0: ScriptSet *IdentifierInfo::CONFUSABLE_WITH_LATIN = NULL;   // Cyrillic, Greek, Cherokee
michael@0: 
michael@0: // Destroys the shared static sets and marks them as not initialized, so
michael@0: // that the next IdentifierInfo constructor builds them again.
michael@0: // Always returns TRUE.
michael@0: UBool IdentifierInfo::cleanup() {
michael@0:     // Release every shared set first ...
michael@0:     delete CONFUSABLE_WITH_LATIN;
michael@0:     delete KOREAN;
michael@0:     delete CHINESE;
michael@0:     delete JAPANESE;
michael@0:     delete ASCII;
michael@0: 
michael@0:     // ... then drop the now-dangling pointers.
michael@0:     ASCII = NULL;
michael@0:     JAPANESE = CHINESE = KOREAN = CONFUSABLE_WITH_LATIN = NULL;
michael@0: 
michael@0:     gStaticsAreInitialized = FALSE;
michael@0:     return TRUE;
michael@0: }
michael@0: 
michael@0: U_CDECL_BEGIN
michael@0: // C-linkage trampoline handed to ucln_i18n_registerCleanup(); it only
michael@0: // forwards to IdentifierInfo::cleanup() and returns its result.
michael@0: static UBool U_CALLCONV IdentifierInfo_cleanup(void) {
michael@0:     const UBool cleanedUp = IdentifierInfo::cleanup();
michael@0:     return cleanedUp;
michael@0: }
michael@0: U_CDECL_END
michael@0: 
michael@0: 
michael@0: IdentifierInfo::IdentifierInfo(UErrorCode &status):
michael@0:          fIdentifier(NULL), fRequiredScripts(NULL), fScriptSetSet(NULL), 
michael@0:          fCommonAmongAlternates(NULL), fNumerics(NULL), fIdentifierProfile(NULL) {
michael@0:     if (U_FAILURE(status)) {
michael@0:         return;
michael@0:     }
michael@0:     {
michael@0:         Mutex lock(&gInitMutex);
michael@0:         if (!gStaticsAreInitialized) {
michael@0:             ASCII    = new UnicodeSet(0, 0x7f);
michael@0:             JAPANESE = new ScriptSet();
michael@0:             CHINESE  = new ScriptSet();
michael@0:             KOREAN   = new ScriptSet();
michael@0:             CONFUSABLE_WITH_LATIN = new ScriptSet();
michael@0:             if (ASCII == NULL || JAPANESE == NULL || CHINESE == NULL || KOREAN == NULL 
michael@0:                     || CONFUSABLE_WITH_LATIN == NULL) {
michael@0:                 status = U_MEMORY_ALLOCATION_ERROR;
michael@0:                 return;
michael@0:             }
michael@0:             ASCII->freeze();
michael@0:             JAPANESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HIRAGANA, status)
michael@0:                      .set(USCRIPT_KATAKANA, status);
michael@0:             CHINESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_BOPOMOFO, status);
michael@0:             KOREAN->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HANGUL, status);
michael@0:             CONFUSABLE_WITH_LATIN->set(USCRIPT_CYRILLIC, status).set(USCRIPT_GREEK, status)
michael@0:                       .set(USCRIPT_CHEROKEE, status);
michael@0:             ucln_i18n_registerCleanup(UCLN_I18N_IDENTIFIER_INFO, IdentifierInfo_cleanup);
michael@0:             gStaticsAreInitialized = TRUE;
michael@0:         }
michael@0:     }
michael@0:     fIdentifier = new UnicodeString();
michael@0:     fRequiredScripts = new ScriptSet();
michael@0:     fScriptSetSet = uhash_open(uhash_hashScriptSet, uhash_compareScriptSet, NULL, &status);
michael@0:     uhash_setKeyDeleter(fScriptSetSet, uhash_deleteScriptSet);
michael@0:     fCommonAmongAlternates = new ScriptSet();
michael@0:     fNumerics = new UnicodeSet();
michael@0:     fIdentifierProfile = new UnicodeSet(0, 0x10FFFF);
michael@0: 
michael@0:     if (U_SUCCESS(status) && (fIdentifier == NULL || fRequiredScripts == NULL || fScriptSetSet == NULL ||
michael@0:                               fCommonAmongAlternates == NULL || fNumerics == NULL || fIdentifierProfile == NULL)) {
michael@0:         status = U_MEMORY_ALLOCATION_ERROR;
michael@0:     }
michael@0: }
michael@0: 
michael@0: // Releases everything the constructor allocated. Closing the hash table
michael@0: // also disposes of the ScriptSet keys through the key deleter installed
michael@0: // by the constructor.
michael@0: IdentifierInfo::~IdentifierInfo() {
michael@0:     delete fIdentifierProfile;
michael@0:     delete fNumerics;
michael@0:     delete fCommonAmongAlternates;
michael@0:     uhash_close(fScriptSetSet);
michael@0:     delete fRequiredScripts;
michael@0:     delete fIdentifier;
michael@0: }
michael@0: 
michael@0: 
michael@0: // Resets the data derived from the identifier: required scripts, alternate
michael@0: // script sets, numerics and the scripts common among alternates.
michael@0: // The identifier string and the identifier profile are left as they are.
michael@0: // Returns *this.
michael@0: IdentifierInfo &IdentifierInfo::clear() {
michael@0:     // Script-related state.
michael@0:     fRequiredScripts->resetAll();
michael@0:     fCommonAmongAlternates->resetAll();
michael@0:     uhash_removeAll(fScriptSetSet);
michael@0:     // Digit-related state.
michael@0:     fNumerics->clear();
michael@0:     return *this;
michael@0: }
michael@0: 
michael@0: 
michael@0: // Assigns identifierProfile to the profile set held by this object.
michael@0: // Returns *this.
michael@0: IdentifierInfo &IdentifierInfo::setIdentifierProfile(const UnicodeSet &identifierProfile) {
michael@0:     UnicodeSet &profile = *fIdentifierProfile;
michael@0:     profile = identifierProfile;
michael@0:     return *this;
michael@0: }
michael@0: 
michael@0: 
michael@0: // Returns a reference to the identifier profile set held by this object.
michael@0: const UnicodeSet &IdentifierInfo::getIdentifierProfile() const {
michael@0:     return *(this->fIdentifierProfile);
michael@0: }
michael@0: 
michael@0: 
michael@0: IdentifierInfo &IdentifierInfo::setIdentifier(const UnicodeString &identifier, UErrorCode &status) {
michael@0:     if (U_FAILURE(status)) {
michael@0:         return *this;
michael@0:     }
michael@0:     *fIdentifier = identifier;
michael@0:     clear();
michael@0:     ScriptSet scriptsForCP;
michael@0:     UChar32 cp;
michael@0:     for (int32_t i = 0; i < identifier.length(); i += U16_LENGTH(cp)) {
michael@0:         cp = identifier.char32At(i);
michael@0:         // Store a representative character for each kind of decimal digit
michael@0:         if (u_charType(cp) == U_DECIMAL_DIGIT_NUMBER) {
michael@0:             // Just store the zero character as a representative for comparison. Unicode guarantees it is cp - value
michael@0:             fNumerics->add(cp - (UChar32)u_getNumericValue(cp));
michael@0:         }
michael@0:         UScriptCode extensions[500];
michael@0:         int32_t extensionsCount = uscript_getScriptExtensions(cp, extensions, LENGTHOF(extensions), &status);
michael@0:         if (U_FAILURE(status)) {
michael@0:             return *this;
michael@0:         }
michael@0:         scriptsForCP.resetAll();
michael@0:         for (int32_t j=0; j<extensionsCount; j++) {
michael@0:             scriptsForCP.set(extensions[j], status);
michael@0:         }
michael@0:         scriptsForCP.reset(USCRIPT_COMMON, status);
michael@0:         scriptsForCP.reset(USCRIPT_INHERITED, status);
michael@0:         switch (scriptsForCP.countMembers()) {
michael@0:           case 0: break;
michael@0:           case 1:
michael@0:             // Single script, record it.
michael@0:             fRequiredScripts->Union(scriptsForCP);
michael@0:             break;
michael@0:           default:
michael@0:             if (!fRequiredScripts->intersects(scriptsForCP) 
michael@0:                     && !uhash_geti(fScriptSetSet, &scriptsForCP)) {
michael@0:                 // If the set hasn't been added already, add it
michael@0:                 //    (Add a copy, fScriptSetSet takes ownership of the copy.)
michael@0:                 uhash_puti(fScriptSetSet, new ScriptSet(scriptsForCP), 1, &status);
michael@0:             }
michael@0:             break;
michael@0:         }
michael@0:     }
michael@0:     // Now make a final pass through ScriptSetSet to remove alternates that came before singles.
michael@0:     // [Kana], [Kana Hira] => [Kana]
michael@0:     // This is relatively infrequent, so doesn't have to be optimized.
michael@0:     // We also compute any commonalities among the alternates.
michael@0:     if (uhash_count(fScriptSetSet) > 0) {
michael@0:         fCommonAmongAlternates->setAll();
michael@0:         for (int32_t it = -1;;) {
michael@0:             const UHashElement *nextHashEl = uhash_nextElement(fScriptSetSet, &it);
michael@0:             if (nextHashEl == NULL) {
michael@0:                 break;
michael@0:             }
michael@0:             ScriptSet *next = static_cast<ScriptSet *>(nextHashEl->key.pointer);
michael@0:             // [Kana], [Kana Hira] => [Kana]
michael@0:             if (fRequiredScripts->intersects(*next)) {
michael@0:                 uhash_removeElement(fScriptSetSet, nextHashEl);
michael@0:             } else {
michael@0:                 fCommonAmongAlternates->intersect(*next);
michael@0:                 // [[Arab Syrc Thaa]; [Arab Syrc]] => [[Arab Syrc]]
michael@0:                 for (int32_t otherIt = -1;;) {
michael@0:                     const UHashElement *otherHashEl = uhash_nextElement(fScriptSetSet, &otherIt);
michael@0:                     if (otherHashEl == NULL) {
michael@0:                         break;
michael@0:                     }
michael@0:                     ScriptSet *other = static_cast<ScriptSet *>(otherHashEl->key.pointer);
michael@0:                     if (next != other && next->contains(*other)) {
michael@0:                         uhash_removeElement(fScriptSetSet, nextHashEl);
michael@0:                         break;
michael@0:                     }
michael@0:                 }
michael@0:             }
michael@0:         }
michael@0:     }
michael@0:     if (uhash_count(fScriptSetSet) == 0) {
michael@0:         fCommonAmongAlternates->resetAll();
michael@0:     }
michael@0:     return *this;
michael@0: }
michael@0: 
michael@0: 
michael@0: // Returns a pointer to the identifier string held by this object.
michael@0: const UnicodeString *IdentifierInfo::getIdentifier() const {
michael@0:     return this->fIdentifier;
michael@0: }
michael@0: 
michael@0: // Returns the set of required scripts computed by setIdentifier().
michael@0: const ScriptSet *IdentifierInfo::getScripts() const {
michael@0:     return this->fRequiredScripts;
michael@0: }
michael@0: 
michael@0: // Returns the hash table whose keys are the alternate script sets
michael@0: // computed by setIdentifier().
michael@0: const UHashtable *IdentifierInfo::getAlternates() const {
michael@0:     return this->fScriptSetSet;
michael@0: }
michael@0: 
michael@0: 
michael@0: // Returns the set of representative zero digits collected by setIdentifier(),
michael@0: // one per decimal digit system that occurs in the identifier.
michael@0: const UnicodeSet *IdentifierInfo::getNumerics() const {
michael@0:     return this->fNumerics;
michael@0: }
michael@0: 
michael@0: // Returns the scripts shared by all alternate script sets, as computed
michael@0: // by setIdentifier().
michael@0: const ScriptSet *IdentifierInfo::getCommonAmongAlternates() const {
michael@0:     return this->fCommonAmongAlternates;
michael@0: }
michael@0: 
michael@0: #if !UCONFIG_NO_NORMALIZATION
michael@0: 
michael@0: URestrictionLevel IdentifierInfo::getRestrictionLevel(UErrorCode &status) const {
michael@0:     if (!fIdentifierProfile->containsAll(*fIdentifier) || getNumerics()->size() > 1) {
michael@0:         return USPOOF_UNRESTRICTIVE;
michael@0:     }
michael@0:     if (ASCII->containsAll(*fIdentifier)) {
michael@0:         return USPOOF_ASCII;
michael@0:     }
michael@0:     // This is a bit tricky. We look at a number of factors.
michael@0:     // The number of scripts in the text.
michael@0:     // Plus 1 if there is some commonality among the alternates (eg [Arab Thaa]; [Arab Syrc])
michael@0:     // Plus number of alternates otherwise (this only works because we only test cardinality up to 2.)
michael@0: 
michael@0:     // Note: the requiredScripts set omits COMMON and INHERITED; they are taken out at the
michael@0:     //       time it is created, in setIdentifier().
michael@0:     int32_t cardinalityPlus = fRequiredScripts->countMembers() + 
michael@0:             (fCommonAmongAlternates->countMembers() == 0 ? uhash_count(fScriptSetSet) : 1);
michael@0:     if (cardinalityPlus < 2) {
michael@0:         return USPOOF_HIGHLY_RESTRICTIVE;
michael@0:     }
michael@0:     if (containsWithAlternates(*JAPANESE, *fRequiredScripts) || containsWithAlternates(*CHINESE, *fRequiredScripts)
michael@0:             || containsWithAlternates(*KOREAN, *fRequiredScripts)) {
michael@0:         return USPOOF_HIGHLY_RESTRICTIVE;
michael@0:     }
michael@0:     if (cardinalityPlus == 2 && 
michael@0:             fRequiredScripts->test(USCRIPT_LATIN, status) && 
michael@0:             !fRequiredScripts->intersects(*CONFUSABLE_WITH_LATIN)) {
michael@0:         return USPOOF_MODERATELY_RESTRICTIVE;
michael@0:     }
michael@0:     return USPOOF_MINIMALLY_RESTRICTIVE;
michael@0: }
michael@0: 
michael@0: #endif /* !UCONFIG_NO_NORMALIZATION */
michael@0: 
michael@0: // Returns the number of required scripts plus the contribution of the
michael@0: // alternates: 1 if the alternates have at least one script in common,
michael@0: // otherwise the number of alternate script sets.
michael@0: int32_t IdentifierInfo::getScriptCount() const {
michael@0:     // Note: Common and Inherited scripts were removed by setIdentifier(), and do not appear in fRequiredScripts.
michael@0:     int32_t fromAlternates = 1;
michael@0:     if (fCommonAmongAlternates->countMembers() == 0) {
michael@0:         fromAlternates = uhash_count(fScriptSetSet);
michael@0:     }
michael@0:     return fRequiredScripts->countMembers() + fromAlternates;
michael@0: }
michael@0:     
michael@0: 
michael@0: 
michael@0: // Tests whether container covers this identifier's scripts.
michael@0: //
michael@0: // @param container  The candidate script set.
michael@0: // @param containee  The scripts that container must contain entirely.
michael@0: // @return TRUE if container contains containee and also intersects every
michael@0: //         alternate script set stored in fScriptSetSet; FALSE otherwise.
michael@0: UBool IdentifierInfo::containsWithAlternates(const ScriptSet &container, const ScriptSet &containee) const {
michael@0:     if (!container.contains(containee)) {
michael@0:         return FALSE;
michael@0:     }
michael@0:     int32_t pos = -1;
michael@0:     const UHashElement *element = NULL;
michael@0:     while ((element = uhash_nextElement(fScriptSetSet, &pos)) != NULL) {
michael@0:         const ScriptSet *alternate = static_cast<const ScriptSet *>(element->key.pointer);
michael@0:         if (!container.intersects(*alternate)) {
michael@0:             // One alternate cannot be satisfied by any script of container.
michael@0:             return FALSE;
michael@0:         }
michael@0:     }
michael@0:     return TRUE;
michael@0: }
michael@0: 
michael@0: UnicodeString &IdentifierInfo::displayAlternates(UnicodeString &dest, const UHashtable *alternates, UErrorCode &status) {
michael@0:     UVector sorted(status);
michael@0:     if (U_FAILURE(status)) {
michael@0:         return dest;
michael@0:     }
michael@0:     for (int32_t pos = -1; ;) {
michael@0:         const UHashElement *el = uhash_nextElement(alternates, &pos);
michael@0:         if (el == NULL) {
michael@0:             break;
michael@0:         }
michael@0:         ScriptSet *ss = static_cast<ScriptSet *>(el->key.pointer);
michael@0:         sorted.addElement(ss, status);
michael@0:     }
michael@0:     sorted.sort(uhash_compareScriptSet, status);
michael@0:     UnicodeString separator = UNICODE_STRING_SIMPLE("; ");
michael@0:     for (int32_t i=0; i<sorted.size(); i++) {
michael@0:         if (i>0) {
michael@0:             dest.append(separator);
michael@0:         }
michael@0:         ScriptSet *ss = static_cast<ScriptSet *>(sorted.elementAt(i));
michael@0:         ss->displayScripts(dest);
michael@0:     }
michael@0:     return dest;
michael@0: }
michael@0: 
michael@0: U_NAMESPACE_END
michael@0: