intl/icu/source/i18n/identifier_info.cpp

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

     1 /*
     2 **********************************************************************
     3 *   Copyright (C) 2012-2013, International Business Machines
     4 *   Corporation and others.  All Rights Reserved.
     5 **********************************************************************
     6 */
     8 #include "unicode/utypes.h"
    10 #include "unicode/uchar.h"
    11 #include "unicode/utf16.h"
    13 #include "identifier_info.h"
    14 #include "mutex.h"
    15 #include "scriptset.h"
    16 #include "ucln_in.h"
    17 #include "uvector.h"
    19 U_NAMESPACE_BEGIN
    21 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
    23 static UMutex gInitMutex = U_MUTEX_INITIALIZER;
    24 static UBool gStaticsAreInitialized = FALSE;
    26 UnicodeSet *IdentifierInfo::ASCII;
    27 ScriptSet *IdentifierInfo::JAPANESE;
    28 ScriptSet *IdentifierInfo::CHINESE;
    29 ScriptSet *IdentifierInfo::KOREAN;
    30 ScriptSet *IdentifierInfo::CONFUSABLE_WITH_LATIN;
    32 UBool IdentifierInfo::cleanup() {
    33     delete ASCII;
    34     ASCII = NULL;
    35     delete JAPANESE;
    36     JAPANESE = NULL;
    37     delete CHINESE;
    38     CHINESE = NULL;
    39     delete KOREAN;
    40     KOREAN = NULL;
    41     delete CONFUSABLE_WITH_LATIN;
    42     CONFUSABLE_WITH_LATIN = NULL;
    43     gStaticsAreInitialized = FALSE;
    44     return TRUE;
    45 }
    47 U_CDECL_BEGIN
    48 static UBool U_CALLCONV
    49 IdentifierInfo_cleanup(void) {
    50     return IdentifierInfo::cleanup();
    51 }
    52 U_CDECL_END
    55 IdentifierInfo::IdentifierInfo(UErrorCode &status):
    56          fIdentifier(NULL), fRequiredScripts(NULL), fScriptSetSet(NULL), 
    57          fCommonAmongAlternates(NULL), fNumerics(NULL), fIdentifierProfile(NULL) {
    58     if (U_FAILURE(status)) {
    59         return;
    60     }
    61     {
    62         Mutex lock(&gInitMutex);
    63         if (!gStaticsAreInitialized) {
    64             ASCII    = new UnicodeSet(0, 0x7f);
    65             JAPANESE = new ScriptSet();
    66             CHINESE  = new ScriptSet();
    67             KOREAN   = new ScriptSet();
    68             CONFUSABLE_WITH_LATIN = new ScriptSet();
    69             if (ASCII == NULL || JAPANESE == NULL || CHINESE == NULL || KOREAN == NULL 
    70                     || CONFUSABLE_WITH_LATIN == NULL) {
    71                 status = U_MEMORY_ALLOCATION_ERROR;
    72                 return;
    73             }
    74             ASCII->freeze();
    75             JAPANESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HIRAGANA, status)
    76                      .set(USCRIPT_KATAKANA, status);
    77             CHINESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_BOPOMOFO, status);
    78             KOREAN->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HANGUL, status);
    79             CONFUSABLE_WITH_LATIN->set(USCRIPT_CYRILLIC, status).set(USCRIPT_GREEK, status)
    80                       .set(USCRIPT_CHEROKEE, status);
    81             ucln_i18n_registerCleanup(UCLN_I18N_IDENTIFIER_INFO, IdentifierInfo_cleanup);
    82             gStaticsAreInitialized = TRUE;
    83         }
    84     }
    85     fIdentifier = new UnicodeString();
    86     fRequiredScripts = new ScriptSet();
    87     fScriptSetSet = uhash_open(uhash_hashScriptSet, uhash_compareScriptSet, NULL, &status);
    88     uhash_setKeyDeleter(fScriptSetSet, uhash_deleteScriptSet);
    89     fCommonAmongAlternates = new ScriptSet();
    90     fNumerics = new UnicodeSet();
    91     fIdentifierProfile = new UnicodeSet(0, 0x10FFFF);
    93     if (U_SUCCESS(status) && (fIdentifier == NULL || fRequiredScripts == NULL || fScriptSetSet == NULL ||
    94                               fCommonAmongAlternates == NULL || fNumerics == NULL || fIdentifierProfile == NULL)) {
    95         status = U_MEMORY_ALLOCATION_ERROR;
    96     }
    97 }
    99 IdentifierInfo::~IdentifierInfo() {
   100     delete fIdentifier;
   101     delete fRequiredScripts;
   102     uhash_close(fScriptSetSet);
   103     delete fCommonAmongAlternates;
   104     delete fNumerics;
   105     delete fIdentifierProfile;
   106 }
   109 IdentifierInfo &IdentifierInfo::clear() {
   110     fRequiredScripts->resetAll();
   111     uhash_removeAll(fScriptSetSet);
   112     fNumerics->clear();
   113     fCommonAmongAlternates->resetAll();
   114     return *this;
   115 }
   118 IdentifierInfo &IdentifierInfo::setIdentifierProfile(const UnicodeSet &identifierProfile) {
   119     *fIdentifierProfile = identifierProfile;
   120     return *this;
   121 }
   124 const UnicodeSet &IdentifierInfo::getIdentifierProfile() const {
   125     return *fIdentifierProfile;
   126 }
   129 IdentifierInfo &IdentifierInfo::setIdentifier(const UnicodeString &identifier, UErrorCode &status) {
   130     if (U_FAILURE(status)) {
   131         return *this;
   132     }
   133     *fIdentifier = identifier;
   134     clear();
   135     ScriptSet scriptsForCP;
   136     UChar32 cp;
   137     for (int32_t i = 0; i < identifier.length(); i += U16_LENGTH(cp)) {
   138         cp = identifier.char32At(i);
   139         // Store a representative character for each kind of decimal digit
   140         if (u_charType(cp) == U_DECIMAL_DIGIT_NUMBER) {
   141             // Just store the zero character as a representative for comparison. Unicode guarantees it is cp - value
   142             fNumerics->add(cp - (UChar32)u_getNumericValue(cp));
   143         }
   144         UScriptCode extensions[500];
   145         int32_t extensionsCount = uscript_getScriptExtensions(cp, extensions, LENGTHOF(extensions), &status);
   146         if (U_FAILURE(status)) {
   147             return *this;
   148         }
   149         scriptsForCP.resetAll();
   150         for (int32_t j=0; j<extensionsCount; j++) {
   151             scriptsForCP.set(extensions[j], status);
   152         }
   153         scriptsForCP.reset(USCRIPT_COMMON, status);
   154         scriptsForCP.reset(USCRIPT_INHERITED, status);
   155         switch (scriptsForCP.countMembers()) {
   156           case 0: break;
   157           case 1:
   158             // Single script, record it.
   159             fRequiredScripts->Union(scriptsForCP);
   160             break;
   161           default:
   162             if (!fRequiredScripts->intersects(scriptsForCP) 
   163                     && !uhash_geti(fScriptSetSet, &scriptsForCP)) {
   164                 // If the set hasn't been added already, add it
   165                 //    (Add a copy, fScriptSetSet takes ownership of the copy.)
   166                 uhash_puti(fScriptSetSet, new ScriptSet(scriptsForCP), 1, &status);
   167             }
   168             break;
   169         }
   170     }
   171     // Now make a final pass through ScriptSetSet to remove alternates that came before singles.
   172     // [Kana], [Kana Hira] => [Kana]
   173     // This is relatively infrequent, so doesn't have to be optimized.
   174     // We also compute any commonalities among the alternates.
   175     if (uhash_count(fScriptSetSet) > 0) {
   176         fCommonAmongAlternates->setAll();
   177         for (int32_t it = -1;;) {
   178             const UHashElement *nextHashEl = uhash_nextElement(fScriptSetSet, &it);
   179             if (nextHashEl == NULL) {
   180                 break;
   181             }
   182             ScriptSet *next = static_cast<ScriptSet *>(nextHashEl->key.pointer);
   183             // [Kana], [Kana Hira] => [Kana]
   184             if (fRequiredScripts->intersects(*next)) {
   185                 uhash_removeElement(fScriptSetSet, nextHashEl);
   186             } else {
   187                 fCommonAmongAlternates->intersect(*next);
   188                 // [[Arab Syrc Thaa]; [Arab Syrc]] => [[Arab Syrc]]
   189                 for (int32_t otherIt = -1;;) {
   190                     const UHashElement *otherHashEl = uhash_nextElement(fScriptSetSet, &otherIt);
   191                     if (otherHashEl == NULL) {
   192                         break;
   193                     }
   194                     ScriptSet *other = static_cast<ScriptSet *>(otherHashEl->key.pointer);
   195                     if (next != other && next->contains(*other)) {
   196                         uhash_removeElement(fScriptSetSet, nextHashEl);
   197                         break;
   198                     }
   199                 }
   200             }
   201         }
   202     }
   203     if (uhash_count(fScriptSetSet) == 0) {
   204         fCommonAmongAlternates->resetAll();
   205     }
   206     return *this;
   207 }
   210 const UnicodeString *IdentifierInfo::getIdentifier() const {
   211     return fIdentifier;
   212 }
   214 const ScriptSet *IdentifierInfo::getScripts() const {
   215     return fRequiredScripts;
   216 }
   218 const UHashtable *IdentifierInfo::getAlternates() const {
   219     return fScriptSetSet;
   220 }
   223 const UnicodeSet *IdentifierInfo::getNumerics() const {
   224     return fNumerics;
   225 }
   227 const ScriptSet *IdentifierInfo::getCommonAmongAlternates() const {
   228     return fCommonAmongAlternates;
   229 }
   231 #if !UCONFIG_NO_NORMALIZATION
   233 URestrictionLevel IdentifierInfo::getRestrictionLevel(UErrorCode &status) const {
   234     if (!fIdentifierProfile->containsAll(*fIdentifier) || getNumerics()->size() > 1) {
   235         return USPOOF_UNRESTRICTIVE;
   236     }
   237     if (ASCII->containsAll(*fIdentifier)) {
   238         return USPOOF_ASCII;
   239     }
   240     // This is a bit tricky. We look at a number of factors.
   241     // The number of scripts in the text.
   242     // Plus 1 if there is some commonality among the alternates (eg [Arab Thaa]; [Arab Syrc])
   243     // Plus number of alternates otherwise (this only works because we only test cardinality up to 2.)
   245     // Note: the requiredScripts set omits COMMON and INHERITED; they are taken out at the
   246     //       time it is created, in setIdentifier().
   247     int32_t cardinalityPlus = fRequiredScripts->countMembers() + 
   248             (fCommonAmongAlternates->countMembers() == 0 ? uhash_count(fScriptSetSet) : 1);
   249     if (cardinalityPlus < 2) {
   250         return USPOOF_HIGHLY_RESTRICTIVE;
   251     }
   252     if (containsWithAlternates(*JAPANESE, *fRequiredScripts) || containsWithAlternates(*CHINESE, *fRequiredScripts)
   253             || containsWithAlternates(*KOREAN, *fRequiredScripts)) {
   254         return USPOOF_HIGHLY_RESTRICTIVE;
   255     }
   256     if (cardinalityPlus == 2 && 
   257             fRequiredScripts->test(USCRIPT_LATIN, status) && 
   258             !fRequiredScripts->intersects(*CONFUSABLE_WITH_LATIN)) {
   259         return USPOOF_MODERATELY_RESTRICTIVE;
   260     }
   261     return USPOOF_MINIMALLY_RESTRICTIVE;
   262 }
   264 #endif /* !UCONFIG_NO_NORMALIZATION */
   266 int32_t IdentifierInfo::getScriptCount() const {
   267     // Note: Common and Inherited scripts were removed by setIdentifier(), and do not appear in fRequiredScripts.
   268     int32_t count = fRequiredScripts->countMembers() +
   269             (fCommonAmongAlternates->countMembers() == 0 ? uhash_count(fScriptSetSet) : 1);
   270     return count;
   271 }
   275 UBool IdentifierInfo::containsWithAlternates(const ScriptSet &container, const ScriptSet &containee) const {
   276     if (!container.contains(containee)) {
   277         return FALSE;
   278     }
   279     for (int32_t iter = -1; ;) {
   280         const UHashElement *hashEl = uhash_nextElement(fScriptSetSet, &iter);
   281         if (hashEl == NULL) {
   282             break;
   283         }
   284         ScriptSet *alternatives = static_cast<ScriptSet *>(hashEl->key.pointer);
   285         if (!container.intersects(*alternatives)) {
   286             return false;
   287         }
   288     }
   289     return true;
   290 }
   292 UnicodeString &IdentifierInfo::displayAlternates(UnicodeString &dest, const UHashtable *alternates, UErrorCode &status) {
   293     UVector sorted(status);
   294     if (U_FAILURE(status)) {
   295         return dest;
   296     }
   297     for (int32_t pos = -1; ;) {
   298         const UHashElement *el = uhash_nextElement(alternates, &pos);
   299         if (el == NULL) {
   300             break;
   301         }
   302         ScriptSet *ss = static_cast<ScriptSet *>(el->key.pointer);
   303         sorted.addElement(ss, status);
   304     }
   305     sorted.sort(uhash_compareScriptSet, status);
   306     UnicodeString separator = UNICODE_STRING_SIMPLE("; ");
   307     for (int32_t i=0; i<sorted.size(); i++) {
   308         if (i>0) {
   309             dest.append(separator);
   310         }
   311         ScriptSet *ss = static_cast<ScriptSet *>(sorted.elementAt(i));
   312         ss->displayScripts(dest);
   313     }
   314     return dest;
   315 }
   317 U_NAMESPACE_END

mercurial