intl/icu/source/i18n/identifier_info.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/i18n/identifier_info.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,318 @@
     1.4 +/*
     1.5 +**********************************************************************
     1.6 +*   Copyright (C) 2012-2013, International Business Machines
     1.7 +*   Corporation and others.  All Rights Reserved.
     1.8 +**********************************************************************
     1.9 +*/
    1.10 +
    1.11 +#include "unicode/utypes.h"
    1.12 +
    1.13 +#include "unicode/uchar.h"
    1.14 +#include "unicode/utf16.h"
    1.15 +
    1.16 +#include "identifier_info.h"
    1.17 +#include "mutex.h"
    1.18 +#include "scriptset.h"
    1.19 +#include "ucln_in.h"
    1.20 +#include "uvector.h"
    1.21 +
    1.22 +U_NAMESPACE_BEGIN
    1.23 +
    1.24 +#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
    1.25 +
    1.26 +static UMutex gInitMutex = U_MUTEX_INITIALIZER;
    1.27 +static UBool gStaticsAreInitialized = FALSE;
    1.28 +
    1.29 +UnicodeSet *IdentifierInfo::ASCII;
    1.30 +ScriptSet *IdentifierInfo::JAPANESE;
    1.31 +ScriptSet *IdentifierInfo::CHINESE;
    1.32 +ScriptSet *IdentifierInfo::KOREAN;
    1.33 +ScriptSet *IdentifierInfo::CONFUSABLE_WITH_LATIN;
    1.34 +
    1.35 +UBool IdentifierInfo::cleanup() {
    1.36 +    delete ASCII;
    1.37 +    ASCII = NULL;
    1.38 +    delete JAPANESE;
    1.39 +    JAPANESE = NULL;
    1.40 +    delete CHINESE;
    1.41 +    CHINESE = NULL;
    1.42 +    delete KOREAN;
    1.43 +    KOREAN = NULL;
    1.44 +    delete CONFUSABLE_WITH_LATIN;
    1.45 +    CONFUSABLE_WITH_LATIN = NULL;
    1.46 +    gStaticsAreInitialized = FALSE;
    1.47 +    return TRUE;
    1.48 +}
    1.49 +
// C-linkage trampoline handed to ucln_i18n_registerCleanup() by the
// IdentifierInfo constructor; it simply forwards to IdentifierInfo::cleanup().
U_CDECL_BEGIN
static UBool U_CALLCONV
IdentifierInfo_cleanup(void) {
    return IdentifierInfo::cleanup();
}
U_CDECL_END
    1.56 +
    1.57 +
    1.58 +IdentifierInfo::IdentifierInfo(UErrorCode &status):
    1.59 +         fIdentifier(NULL), fRequiredScripts(NULL), fScriptSetSet(NULL), 
    1.60 +         fCommonAmongAlternates(NULL), fNumerics(NULL), fIdentifierProfile(NULL) {
    1.61 +    if (U_FAILURE(status)) {
    1.62 +        return;
    1.63 +    }
    1.64 +    {
    1.65 +        Mutex lock(&gInitMutex);
    1.66 +        if (!gStaticsAreInitialized) {
    1.67 +            ASCII    = new UnicodeSet(0, 0x7f);
    1.68 +            JAPANESE = new ScriptSet();
    1.69 +            CHINESE  = new ScriptSet();
    1.70 +            KOREAN   = new ScriptSet();
    1.71 +            CONFUSABLE_WITH_LATIN = new ScriptSet();
    1.72 +            if (ASCII == NULL || JAPANESE == NULL || CHINESE == NULL || KOREAN == NULL 
    1.73 +                    || CONFUSABLE_WITH_LATIN == NULL) {
    1.74 +                status = U_MEMORY_ALLOCATION_ERROR;
    1.75 +                return;
    1.76 +            }
    1.77 +            ASCII->freeze();
    1.78 +            JAPANESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HIRAGANA, status)
    1.79 +                     .set(USCRIPT_KATAKANA, status);
    1.80 +            CHINESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_BOPOMOFO, status);
    1.81 +            KOREAN->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HANGUL, status);
    1.82 +            CONFUSABLE_WITH_LATIN->set(USCRIPT_CYRILLIC, status).set(USCRIPT_GREEK, status)
    1.83 +                      .set(USCRIPT_CHEROKEE, status);
    1.84 +            ucln_i18n_registerCleanup(UCLN_I18N_IDENTIFIER_INFO, IdentifierInfo_cleanup);
    1.85 +            gStaticsAreInitialized = TRUE;
    1.86 +        }
    1.87 +    }
    1.88 +    fIdentifier = new UnicodeString();
    1.89 +    fRequiredScripts = new ScriptSet();
    1.90 +    fScriptSetSet = uhash_open(uhash_hashScriptSet, uhash_compareScriptSet, NULL, &status);
    1.91 +    uhash_setKeyDeleter(fScriptSetSet, uhash_deleteScriptSet);
    1.92 +    fCommonAmongAlternates = new ScriptSet();
    1.93 +    fNumerics = new UnicodeSet();
    1.94 +    fIdentifierProfile = new UnicodeSet(0, 0x10FFFF);
    1.95 +
    1.96 +    if (U_SUCCESS(status) && (fIdentifier == NULL || fRequiredScripts == NULL || fScriptSetSet == NULL ||
    1.97 +                              fCommonAmongAlternates == NULL || fNumerics == NULL || fIdentifierProfile == NULL)) {
    1.98 +        status = U_MEMORY_ALLOCATION_ERROR;
    1.99 +    }
   1.100 +}
   1.101 +
   1.102 +IdentifierInfo::~IdentifierInfo() {
   1.103 +    delete fIdentifier;
   1.104 +    delete fRequiredScripts;
   1.105 +    uhash_close(fScriptSetSet);
   1.106 +    delete fCommonAmongAlternates;
   1.107 +    delete fNumerics;
   1.108 +    delete fIdentifierProfile;
   1.109 +}
   1.110 +
   1.111 +
   1.112 +IdentifierInfo &IdentifierInfo::clear() {
   1.113 +    fRequiredScripts->resetAll();
   1.114 +    uhash_removeAll(fScriptSetSet);
   1.115 +    fNumerics->clear();
   1.116 +    fCommonAmongAlternates->resetAll();
   1.117 +    return *this;
   1.118 +}
   1.119 +
   1.120 +
// Replaces the identifier profile with a copy of identifierProfile.
// The profile is consulted by getRestrictionLevel(): an identifier that is not
// fully contained in it is reported as USPOOF_UNRESTRICTIVE.
// Returns *this to allow chaining.
IdentifierInfo &IdentifierInfo::setIdentifierProfile(const UnicodeSet &identifierProfile) {
    *fIdentifierProfile = identifierProfile;
    return *this;
}
   1.125 +
   1.126 +
// Returns the current identifier profile. The constructor initializes it to the
// full code point range 0..0x10FFFF; setIdentifierProfile() replaces it.
const UnicodeSet &IdentifierInfo::getIdentifierProfile() const {
    return *fIdentifierProfile;
}
   1.130 +
   1.131 +
   1.132 +IdentifierInfo &IdentifierInfo::setIdentifier(const UnicodeString &identifier, UErrorCode &status) {
   1.133 +    if (U_FAILURE(status)) {
   1.134 +        return *this;
   1.135 +    }
   1.136 +    *fIdentifier = identifier;
   1.137 +    clear();
   1.138 +    ScriptSet scriptsForCP;
   1.139 +    UChar32 cp;
   1.140 +    for (int32_t i = 0; i < identifier.length(); i += U16_LENGTH(cp)) {
   1.141 +        cp = identifier.char32At(i);
   1.142 +        // Store a representative character for each kind of decimal digit
   1.143 +        if (u_charType(cp) == U_DECIMAL_DIGIT_NUMBER) {
   1.144 +            // Just store the zero character as a representative for comparison. Unicode guarantees it is cp - value
   1.145 +            fNumerics->add(cp - (UChar32)u_getNumericValue(cp));
   1.146 +        }
   1.147 +        UScriptCode extensions[500];
   1.148 +        int32_t extensionsCount = uscript_getScriptExtensions(cp, extensions, LENGTHOF(extensions), &status);
   1.149 +        if (U_FAILURE(status)) {
   1.150 +            return *this;
   1.151 +        }
   1.152 +        scriptsForCP.resetAll();
   1.153 +        for (int32_t j=0; j<extensionsCount; j++) {
   1.154 +            scriptsForCP.set(extensions[j], status);
   1.155 +        }
   1.156 +        scriptsForCP.reset(USCRIPT_COMMON, status);
   1.157 +        scriptsForCP.reset(USCRIPT_INHERITED, status);
   1.158 +        switch (scriptsForCP.countMembers()) {
   1.159 +          case 0: break;
   1.160 +          case 1:
   1.161 +            // Single script, record it.
   1.162 +            fRequiredScripts->Union(scriptsForCP);
   1.163 +            break;
   1.164 +          default:
   1.165 +            if (!fRequiredScripts->intersects(scriptsForCP) 
   1.166 +                    && !uhash_geti(fScriptSetSet, &scriptsForCP)) {
   1.167 +                // If the set hasn't been added already, add it
   1.168 +                //    (Add a copy, fScriptSetSet takes ownership of the copy.)
   1.169 +                uhash_puti(fScriptSetSet, new ScriptSet(scriptsForCP), 1, &status);
   1.170 +            }
   1.171 +            break;
   1.172 +        }
   1.173 +    }
   1.174 +    // Now make a final pass through ScriptSetSet to remove alternates that came before singles.
   1.175 +    // [Kana], [Kana Hira] => [Kana]
   1.176 +    // This is relatively infrequent, so doesn't have to be optimized.
   1.177 +    // We also compute any commonalities among the alternates.
   1.178 +    if (uhash_count(fScriptSetSet) > 0) {
   1.179 +        fCommonAmongAlternates->setAll();
   1.180 +        for (int32_t it = -1;;) {
   1.181 +            const UHashElement *nextHashEl = uhash_nextElement(fScriptSetSet, &it);
   1.182 +            if (nextHashEl == NULL) {
   1.183 +                break;
   1.184 +            }
   1.185 +            ScriptSet *next = static_cast<ScriptSet *>(nextHashEl->key.pointer);
   1.186 +            // [Kana], [Kana Hira] => [Kana]
   1.187 +            if (fRequiredScripts->intersects(*next)) {
   1.188 +                uhash_removeElement(fScriptSetSet, nextHashEl);
   1.189 +            } else {
   1.190 +                fCommonAmongAlternates->intersect(*next);
   1.191 +                // [[Arab Syrc Thaa]; [Arab Syrc]] => [[Arab Syrc]]
   1.192 +                for (int32_t otherIt = -1;;) {
   1.193 +                    const UHashElement *otherHashEl = uhash_nextElement(fScriptSetSet, &otherIt);
   1.194 +                    if (otherHashEl == NULL) {
   1.195 +                        break;
   1.196 +                    }
   1.197 +                    ScriptSet *other = static_cast<ScriptSet *>(otherHashEl->key.pointer);
   1.198 +                    if (next != other && next->contains(*other)) {
   1.199 +                        uhash_removeElement(fScriptSetSet, nextHashEl);
   1.200 +                        break;
   1.201 +                    }
   1.202 +                }
   1.203 +            }
   1.204 +        }
   1.205 +    }
   1.206 +    if (uhash_count(fScriptSetSet) == 0) {
   1.207 +        fCommonAmongAlternates->resetAll();
   1.208 +    }
   1.209 +    return *this;
   1.210 +}
   1.211 +
   1.212 +
// Returns the copy of the identifier stored by the last setIdentifier() call.
// The string is owned by this object (it is deleted in the destructor).
const UnicodeString *IdentifierInfo::getIdentifier() const {
    return fIdentifier;
}
   1.216 +
// Returns the set of required scripts computed by setIdentifier(): the scripts
// of code points that have exactly one script once Common and Inherited are
// removed. Owned by this object.
const ScriptSet *IdentifierInfo::getScripts() const {
    return fRequiredScripts;
}
   1.220 +
// Returns the table of alternate script sets computed by setIdentifier().
// Keys are ScriptSet pointers owned by the table; the stored integer value is
// always 1 and carries no information. Owned by this object.
const UHashtable *IdentifierInfo::getAlternates() const {
    return fScriptSetSet;
}
   1.224 +
   1.225 +
// Returns the set of representative zero digits, one for each decimal-digit
// system that occurs in the identifier (filled in by setIdentifier()).
// Owned by this object.
const UnicodeSet *IdentifierInfo::getNumerics() const {
    return fNumerics;
}
   1.229 +
// Returns the intersection of all alternate script sets, as computed by
// setIdentifier(); it is empty when there are no alternates. Owned by this object.
const ScriptSet *IdentifierInfo::getCommonAmongAlternates() const {
    return fCommonAmongAlternates;
}
   1.233 +
   1.234 +#if !UCONFIG_NO_NORMALIZATION
   1.235 +
   1.236 +URestrictionLevel IdentifierInfo::getRestrictionLevel(UErrorCode &status) const {
   1.237 +    if (!fIdentifierProfile->containsAll(*fIdentifier) || getNumerics()->size() > 1) {
   1.238 +        return USPOOF_UNRESTRICTIVE;
   1.239 +    }
   1.240 +    if (ASCII->containsAll(*fIdentifier)) {
   1.241 +        return USPOOF_ASCII;
   1.242 +    }
   1.243 +    // This is a bit tricky. We look at a number of factors.
   1.244 +    // The number of scripts in the text.
   1.245 +    // Plus 1 if there is some commonality among the alternates (eg [Arab Thaa]; [Arab Syrc])
   1.246 +    // Plus number of alternates otherwise (this only works because we only test cardinality up to 2.)
   1.247 +
   1.248 +    // Note: the requiredScripts set omits COMMON and INHERITED; they are taken out at the
   1.249 +    //       time it is created, in setIdentifier().
   1.250 +    int32_t cardinalityPlus = fRequiredScripts->countMembers() + 
   1.251 +            (fCommonAmongAlternates->countMembers() == 0 ? uhash_count(fScriptSetSet) : 1);
   1.252 +    if (cardinalityPlus < 2) {
   1.253 +        return USPOOF_HIGHLY_RESTRICTIVE;
   1.254 +    }
   1.255 +    if (containsWithAlternates(*JAPANESE, *fRequiredScripts) || containsWithAlternates(*CHINESE, *fRequiredScripts)
   1.256 +            || containsWithAlternates(*KOREAN, *fRequiredScripts)) {
   1.257 +        return USPOOF_HIGHLY_RESTRICTIVE;
   1.258 +    }
   1.259 +    if (cardinalityPlus == 2 && 
   1.260 +            fRequiredScripts->test(USCRIPT_LATIN, status) && 
   1.261 +            !fRequiredScripts->intersects(*CONFUSABLE_WITH_LATIN)) {
   1.262 +        return USPOOF_MODERATELY_RESTRICTIVE;
   1.263 +    }
   1.264 +    return USPOOF_MINIMALLY_RESTRICTIVE;
   1.265 +}
   1.266 +
   1.267 +#endif /* !UCONFIG_NO_NORMALIZATION */
   1.268 +
   1.269 +int32_t IdentifierInfo::getScriptCount() const {
   1.270 +    // Note: Common and Inherited scripts were removed by setIdentifier(), and do not appear in fRequiredScripts.
   1.271 +    int32_t count = fRequiredScripts->countMembers() +
   1.272 +            (fCommonAmongAlternates->countMembers() == 0 ? uhash_count(fScriptSetSet) : 1);
   1.273 +    return count;
   1.274 +}
   1.275 +    
   1.276 +
   1.277 +
   1.278 +UBool IdentifierInfo::containsWithAlternates(const ScriptSet &container, const ScriptSet &containee) const {
   1.279 +    if (!container.contains(containee)) {
   1.280 +        return FALSE;
   1.281 +    }
   1.282 +    for (int32_t iter = -1; ;) {
   1.283 +        const UHashElement *hashEl = uhash_nextElement(fScriptSetSet, &iter);
   1.284 +        if (hashEl == NULL) {
   1.285 +            break;
   1.286 +        }
   1.287 +        ScriptSet *alternatives = static_cast<ScriptSet *>(hashEl->key.pointer);
   1.288 +        if (!container.intersects(*alternatives)) {
   1.289 +            return false;
   1.290 +        }
   1.291 +    }
   1.292 +    return true;
   1.293 +}
   1.294 +
   1.295 +UnicodeString &IdentifierInfo::displayAlternates(UnicodeString &dest, const UHashtable *alternates, UErrorCode &status) {
   1.296 +    UVector sorted(status);
   1.297 +    if (U_FAILURE(status)) {
   1.298 +        return dest;
   1.299 +    }
   1.300 +    for (int32_t pos = -1; ;) {
   1.301 +        const UHashElement *el = uhash_nextElement(alternates, &pos);
   1.302 +        if (el == NULL) {
   1.303 +            break;
   1.304 +        }
   1.305 +        ScriptSet *ss = static_cast<ScriptSet *>(el->key.pointer);
   1.306 +        sorted.addElement(ss, status);
   1.307 +    }
   1.308 +    sorted.sort(uhash_compareScriptSet, status);
   1.309 +    UnicodeString separator = UNICODE_STRING_SIMPLE("; ");
   1.310 +    for (int32_t i=0; i<sorted.size(); i++) {
   1.311 +        if (i>0) {
   1.312 +            dest.append(separator);
   1.313 +        }
   1.314 +        ScriptSet *ss = static_cast<ScriptSet *>(sorted.elementAt(i));
   1.315 +        ss->displayScripts(dest);
   1.316 +    }
   1.317 +    return dest;
   1.318 +}
   1.319 +
   1.320 +U_NAMESPACE_END
   1.321 +

mercurial