Wed, 31 Dec 2014 06:09:35 +0100
Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purposes.
michael@0 | 1 | /* |
michael@0 | 2 | ********************************************************************** |
michael@0 | 3 | * Copyright (C) 2012-2013, International Business Machines |
michael@0 | 4 | * Corporation and others. All Rights Reserved. |
michael@0 | 5 | ********************************************************************** |
michael@0 | 6 | */ |
michael@0 | 7 | |
michael@0 | 8 | #include "unicode/utypes.h" |
michael@0 | 9 | |
michael@0 | 10 | #include "unicode/uchar.h" |
michael@0 | 11 | #include "unicode/utf16.h" |
michael@0 | 12 | |
michael@0 | 13 | #include "identifier_info.h" |
michael@0 | 14 | #include "mutex.h" |
michael@0 | 15 | #include "scriptset.h" |
michael@0 | 16 | #include "ucln_in.h" |
michael@0 | 17 | #include "uvector.h" |
michael@0 | 18 | |
michael@0 | 19 | U_NAMESPACE_BEGIN |
michael@0 | 20 | |
michael@0 | 21 | #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) |
michael@0 | 22 | |
michael@0 | 23 | static UMutex gInitMutex = U_MUTEX_INITIALIZER; |
michael@0 | 24 | static UBool gStaticsAreInitialized = FALSE; |
michael@0 | 25 | |
michael@0 | 26 | UnicodeSet *IdentifierInfo::ASCII; |
michael@0 | 27 | ScriptSet *IdentifierInfo::JAPANESE; |
michael@0 | 28 | ScriptSet *IdentifierInfo::CHINESE; |
michael@0 | 29 | ScriptSet *IdentifierInfo::KOREAN; |
michael@0 | 30 | ScriptSet *IdentifierInfo::CONFUSABLE_WITH_LATIN; |
michael@0 | 31 | |
michael@0 | 32 | UBool IdentifierInfo::cleanup() { |
michael@0 | 33 | delete ASCII; |
michael@0 | 34 | ASCII = NULL; |
michael@0 | 35 | delete JAPANESE; |
michael@0 | 36 | JAPANESE = NULL; |
michael@0 | 37 | delete CHINESE; |
michael@0 | 38 | CHINESE = NULL; |
michael@0 | 39 | delete KOREAN; |
michael@0 | 40 | KOREAN = NULL; |
michael@0 | 41 | delete CONFUSABLE_WITH_LATIN; |
michael@0 | 42 | CONFUSABLE_WITH_LATIN = NULL; |
michael@0 | 43 | gStaticsAreInitialized = FALSE; |
michael@0 | 44 | return TRUE; |
michael@0 | 45 | } |
michael@0 | 46 | |
michael@0 | 47 | U_CDECL_BEGIN |
michael@0 | 48 | static UBool U_CALLCONV |
michael@0 | 49 | IdentifierInfo_cleanup(void) { |
michael@0 | 50 | return IdentifierInfo::cleanup(); |
michael@0 | 51 | } |
michael@0 | 52 | U_CDECL_END |
michael@0 | 53 | |
michael@0 | 54 | |
michael@0 | 55 | IdentifierInfo::IdentifierInfo(UErrorCode &status): |
michael@0 | 56 | fIdentifier(NULL), fRequiredScripts(NULL), fScriptSetSet(NULL), |
michael@0 | 57 | fCommonAmongAlternates(NULL), fNumerics(NULL), fIdentifierProfile(NULL) { |
michael@0 | 58 | if (U_FAILURE(status)) { |
michael@0 | 59 | return; |
michael@0 | 60 | } |
michael@0 | 61 | { |
michael@0 | 62 | Mutex lock(&gInitMutex); |
michael@0 | 63 | if (!gStaticsAreInitialized) { |
michael@0 | 64 | ASCII = new UnicodeSet(0, 0x7f); |
michael@0 | 65 | JAPANESE = new ScriptSet(); |
michael@0 | 66 | CHINESE = new ScriptSet(); |
michael@0 | 67 | KOREAN = new ScriptSet(); |
michael@0 | 68 | CONFUSABLE_WITH_LATIN = new ScriptSet(); |
michael@0 | 69 | if (ASCII == NULL || JAPANESE == NULL || CHINESE == NULL || KOREAN == NULL |
michael@0 | 70 | || CONFUSABLE_WITH_LATIN == NULL) { |
michael@0 | 71 | status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 72 | return; |
michael@0 | 73 | } |
michael@0 | 74 | ASCII->freeze(); |
michael@0 | 75 | JAPANESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HIRAGANA, status) |
michael@0 | 76 | .set(USCRIPT_KATAKANA, status); |
michael@0 | 77 | CHINESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_BOPOMOFO, status); |
michael@0 | 78 | KOREAN->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HANGUL, status); |
michael@0 | 79 | CONFUSABLE_WITH_LATIN->set(USCRIPT_CYRILLIC, status).set(USCRIPT_GREEK, status) |
michael@0 | 80 | .set(USCRIPT_CHEROKEE, status); |
michael@0 | 81 | ucln_i18n_registerCleanup(UCLN_I18N_IDENTIFIER_INFO, IdentifierInfo_cleanup); |
michael@0 | 82 | gStaticsAreInitialized = TRUE; |
michael@0 | 83 | } |
michael@0 | 84 | } |
michael@0 | 85 | fIdentifier = new UnicodeString(); |
michael@0 | 86 | fRequiredScripts = new ScriptSet(); |
michael@0 | 87 | fScriptSetSet = uhash_open(uhash_hashScriptSet, uhash_compareScriptSet, NULL, &status); |
michael@0 | 88 | uhash_setKeyDeleter(fScriptSetSet, uhash_deleteScriptSet); |
michael@0 | 89 | fCommonAmongAlternates = new ScriptSet(); |
michael@0 | 90 | fNumerics = new UnicodeSet(); |
michael@0 | 91 | fIdentifierProfile = new UnicodeSet(0, 0x10FFFF); |
michael@0 | 92 | |
michael@0 | 93 | if (U_SUCCESS(status) && (fIdentifier == NULL || fRequiredScripts == NULL || fScriptSetSet == NULL || |
michael@0 | 94 | fCommonAmongAlternates == NULL || fNumerics == NULL || fIdentifierProfile == NULL)) { |
michael@0 | 95 | status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 96 | } |
michael@0 | 97 | } |
michael@0 | 98 | |
michael@0 | 99 | IdentifierInfo::~IdentifierInfo() { |
michael@0 | 100 | delete fIdentifier; |
michael@0 | 101 | delete fRequiredScripts; |
michael@0 | 102 | uhash_close(fScriptSetSet); |
michael@0 | 103 | delete fCommonAmongAlternates; |
michael@0 | 104 | delete fNumerics; |
michael@0 | 105 | delete fIdentifierProfile; |
michael@0 | 106 | } |
michael@0 | 107 | |
michael@0 | 108 | |
michael@0 | 109 | IdentifierInfo &IdentifierInfo::clear() { |
michael@0 | 110 | fRequiredScripts->resetAll(); |
michael@0 | 111 | uhash_removeAll(fScriptSetSet); |
michael@0 | 112 | fNumerics->clear(); |
michael@0 | 113 | fCommonAmongAlternates->resetAll(); |
michael@0 | 114 | return *this; |
michael@0 | 115 | } |
michael@0 | 116 | |
michael@0 | 117 | |
michael@0 | 118 | IdentifierInfo &IdentifierInfo::setIdentifierProfile(const UnicodeSet &identifierProfile) { |
michael@0 | 119 | *fIdentifierProfile = identifierProfile; |
michael@0 | 120 | return *this; |
michael@0 | 121 | } |
michael@0 | 122 | |
michael@0 | 123 | |
michael@0 | 124 | const UnicodeSet &IdentifierInfo::getIdentifierProfile() const { |
michael@0 | 125 | return *fIdentifierProfile; |
michael@0 | 126 | } |
michael@0 | 127 | |
michael@0 | 128 | |
michael@0 | 129 | IdentifierInfo &IdentifierInfo::setIdentifier(const UnicodeString &identifier, UErrorCode &status) { |
michael@0 | 130 | if (U_FAILURE(status)) { |
michael@0 | 131 | return *this; |
michael@0 | 132 | } |
michael@0 | 133 | *fIdentifier = identifier; |
michael@0 | 134 | clear(); |
michael@0 | 135 | ScriptSet scriptsForCP; |
michael@0 | 136 | UChar32 cp; |
michael@0 | 137 | for (int32_t i = 0; i < identifier.length(); i += U16_LENGTH(cp)) { |
michael@0 | 138 | cp = identifier.char32At(i); |
michael@0 | 139 | // Store a representative character for each kind of decimal digit |
michael@0 | 140 | if (u_charType(cp) == U_DECIMAL_DIGIT_NUMBER) { |
michael@0 | 141 | // Just store the zero character as a representative for comparison. Unicode guarantees it is cp - value |
michael@0 | 142 | fNumerics->add(cp - (UChar32)u_getNumericValue(cp)); |
michael@0 | 143 | } |
michael@0 | 144 | UScriptCode extensions[500]; |
michael@0 | 145 | int32_t extensionsCount = uscript_getScriptExtensions(cp, extensions, LENGTHOF(extensions), &status); |
michael@0 | 146 | if (U_FAILURE(status)) { |
michael@0 | 147 | return *this; |
michael@0 | 148 | } |
michael@0 | 149 | scriptsForCP.resetAll(); |
michael@0 | 150 | for (int32_t j=0; j<extensionsCount; j++) { |
michael@0 | 151 | scriptsForCP.set(extensions[j], status); |
michael@0 | 152 | } |
michael@0 | 153 | scriptsForCP.reset(USCRIPT_COMMON, status); |
michael@0 | 154 | scriptsForCP.reset(USCRIPT_INHERITED, status); |
michael@0 | 155 | switch (scriptsForCP.countMembers()) { |
michael@0 | 156 | case 0: break; |
michael@0 | 157 | case 1: |
michael@0 | 158 | // Single script, record it. |
michael@0 | 159 | fRequiredScripts->Union(scriptsForCP); |
michael@0 | 160 | break; |
michael@0 | 161 | default: |
michael@0 | 162 | if (!fRequiredScripts->intersects(scriptsForCP) |
michael@0 | 163 | && !uhash_geti(fScriptSetSet, &scriptsForCP)) { |
michael@0 | 164 | // If the set hasn't been added already, add it |
michael@0 | 165 | // (Add a copy, fScriptSetSet takes ownership of the copy.) |
michael@0 | 166 | uhash_puti(fScriptSetSet, new ScriptSet(scriptsForCP), 1, &status); |
michael@0 | 167 | } |
michael@0 | 168 | break; |
michael@0 | 169 | } |
michael@0 | 170 | } |
michael@0 | 171 | // Now make a final pass through ScriptSetSet to remove alternates that came before singles. |
michael@0 | 172 | // [Kana], [Kana Hira] => [Kana] |
michael@0 | 173 | // This is relatively infrequent, so doesn't have to be optimized. |
michael@0 | 174 | // We also compute any commonalities among the alternates. |
michael@0 | 175 | if (uhash_count(fScriptSetSet) > 0) { |
michael@0 | 176 | fCommonAmongAlternates->setAll(); |
michael@0 | 177 | for (int32_t it = -1;;) { |
michael@0 | 178 | const UHashElement *nextHashEl = uhash_nextElement(fScriptSetSet, &it); |
michael@0 | 179 | if (nextHashEl == NULL) { |
michael@0 | 180 | break; |
michael@0 | 181 | } |
michael@0 | 182 | ScriptSet *next = static_cast<ScriptSet *>(nextHashEl->key.pointer); |
michael@0 | 183 | // [Kana], [Kana Hira] => [Kana] |
michael@0 | 184 | if (fRequiredScripts->intersects(*next)) { |
michael@0 | 185 | uhash_removeElement(fScriptSetSet, nextHashEl); |
michael@0 | 186 | } else { |
michael@0 | 187 | fCommonAmongAlternates->intersect(*next); |
michael@0 | 188 | // [[Arab Syrc Thaa]; [Arab Syrc]] => [[Arab Syrc]] |
michael@0 | 189 | for (int32_t otherIt = -1;;) { |
michael@0 | 190 | const UHashElement *otherHashEl = uhash_nextElement(fScriptSetSet, &otherIt); |
michael@0 | 191 | if (otherHashEl == NULL) { |
michael@0 | 192 | break; |
michael@0 | 193 | } |
michael@0 | 194 | ScriptSet *other = static_cast<ScriptSet *>(otherHashEl->key.pointer); |
michael@0 | 195 | if (next != other && next->contains(*other)) { |
michael@0 | 196 | uhash_removeElement(fScriptSetSet, nextHashEl); |
michael@0 | 197 | break; |
michael@0 | 198 | } |
michael@0 | 199 | } |
michael@0 | 200 | } |
michael@0 | 201 | } |
michael@0 | 202 | } |
michael@0 | 203 | if (uhash_count(fScriptSetSet) == 0) { |
michael@0 | 204 | fCommonAmongAlternates->resetAll(); |
michael@0 | 205 | } |
michael@0 | 206 | return *this; |
michael@0 | 207 | } |
michael@0 | 208 | |
michael@0 | 209 | |
michael@0 | 210 | const UnicodeString *IdentifierInfo::getIdentifier() const { |
michael@0 | 211 | return fIdentifier; |
michael@0 | 212 | } |
michael@0 | 213 | |
michael@0 | 214 | const ScriptSet *IdentifierInfo::getScripts() const { |
michael@0 | 215 | return fRequiredScripts; |
michael@0 | 216 | } |
michael@0 | 217 | |
michael@0 | 218 | const UHashtable *IdentifierInfo::getAlternates() const { |
michael@0 | 219 | return fScriptSetSet; |
michael@0 | 220 | } |
michael@0 | 221 | |
michael@0 | 222 | |
michael@0 | 223 | const UnicodeSet *IdentifierInfo::getNumerics() const { |
michael@0 | 224 | return fNumerics; |
michael@0 | 225 | } |
michael@0 | 226 | |
michael@0 | 227 | const ScriptSet *IdentifierInfo::getCommonAmongAlternates() const { |
michael@0 | 228 | return fCommonAmongAlternates; |
michael@0 | 229 | } |
michael@0 | 230 | |
michael@0 | 231 | #if !UCONFIG_NO_NORMALIZATION |
michael@0 | 232 | |
michael@0 | 233 | URestrictionLevel IdentifierInfo::getRestrictionLevel(UErrorCode &status) const { |
michael@0 | 234 | if (!fIdentifierProfile->containsAll(*fIdentifier) || getNumerics()->size() > 1) { |
michael@0 | 235 | return USPOOF_UNRESTRICTIVE; |
michael@0 | 236 | } |
michael@0 | 237 | if (ASCII->containsAll(*fIdentifier)) { |
michael@0 | 238 | return USPOOF_ASCII; |
michael@0 | 239 | } |
michael@0 | 240 | // This is a bit tricky. We look at a number of factors. |
michael@0 | 241 | // The number of scripts in the text. |
michael@0 | 242 | // Plus 1 if there is some commonality among the alternates (eg [Arab Thaa]; [Arab Syrc]) |
michael@0 | 243 | // Plus number of alternates otherwise (this only works because we only test cardinality up to 2.) |
michael@0 | 244 | |
michael@0 | 245 | // Note: the requiredScripts set omits COMMON and INHERITED; they are taken out at the |
michael@0 | 246 | // time it is created, in setIdentifier(). |
michael@0 | 247 | int32_t cardinalityPlus = fRequiredScripts->countMembers() + |
michael@0 | 248 | (fCommonAmongAlternates->countMembers() == 0 ? uhash_count(fScriptSetSet) : 1); |
michael@0 | 249 | if (cardinalityPlus < 2) { |
michael@0 | 250 | return USPOOF_HIGHLY_RESTRICTIVE; |
michael@0 | 251 | } |
michael@0 | 252 | if (containsWithAlternates(*JAPANESE, *fRequiredScripts) || containsWithAlternates(*CHINESE, *fRequiredScripts) |
michael@0 | 253 | || containsWithAlternates(*KOREAN, *fRequiredScripts)) { |
michael@0 | 254 | return USPOOF_HIGHLY_RESTRICTIVE; |
michael@0 | 255 | } |
michael@0 | 256 | if (cardinalityPlus == 2 && |
michael@0 | 257 | fRequiredScripts->test(USCRIPT_LATIN, status) && |
michael@0 | 258 | !fRequiredScripts->intersects(*CONFUSABLE_WITH_LATIN)) { |
michael@0 | 259 | return USPOOF_MODERATELY_RESTRICTIVE; |
michael@0 | 260 | } |
michael@0 | 261 | return USPOOF_MINIMALLY_RESTRICTIVE; |
michael@0 | 262 | } |
michael@0 | 263 | |
michael@0 | 264 | #endif /* !UCONFIG_NO_NORMALIZATION */ |
michael@0 | 265 | |
michael@0 | 266 | int32_t IdentifierInfo::getScriptCount() const { |
michael@0 | 267 | // Note: Common and Inherited scripts were removed by setIdentifier(), and do not appear in fRequiredScripts. |
michael@0 | 268 | int32_t count = fRequiredScripts->countMembers() + |
michael@0 | 269 | (fCommonAmongAlternates->countMembers() == 0 ? uhash_count(fScriptSetSet) : 1); |
michael@0 | 270 | return count; |
michael@0 | 271 | } |
michael@0 | 272 | |
michael@0 | 273 | |
michael@0 | 274 | |
michael@0 | 275 | UBool IdentifierInfo::containsWithAlternates(const ScriptSet &container, const ScriptSet &containee) const { |
michael@0 | 276 | if (!container.contains(containee)) { |
michael@0 | 277 | return FALSE; |
michael@0 | 278 | } |
michael@0 | 279 | for (int32_t iter = -1; ;) { |
michael@0 | 280 | const UHashElement *hashEl = uhash_nextElement(fScriptSetSet, &iter); |
michael@0 | 281 | if (hashEl == NULL) { |
michael@0 | 282 | break; |
michael@0 | 283 | } |
michael@0 | 284 | ScriptSet *alternatives = static_cast<ScriptSet *>(hashEl->key.pointer); |
michael@0 | 285 | if (!container.intersects(*alternatives)) { |
michael@0 | 286 | return false; |
michael@0 | 287 | } |
michael@0 | 288 | } |
michael@0 | 289 | return true; |
michael@0 | 290 | } |
michael@0 | 291 | |
michael@0 | 292 | UnicodeString &IdentifierInfo::displayAlternates(UnicodeString &dest, const UHashtable *alternates, UErrorCode &status) { |
michael@0 | 293 | UVector sorted(status); |
michael@0 | 294 | if (U_FAILURE(status)) { |
michael@0 | 295 | return dest; |
michael@0 | 296 | } |
michael@0 | 297 | for (int32_t pos = -1; ;) { |
michael@0 | 298 | const UHashElement *el = uhash_nextElement(alternates, &pos); |
michael@0 | 299 | if (el == NULL) { |
michael@0 | 300 | break; |
michael@0 | 301 | } |
michael@0 | 302 | ScriptSet *ss = static_cast<ScriptSet *>(el->key.pointer); |
michael@0 | 303 | sorted.addElement(ss, status); |
michael@0 | 304 | } |
michael@0 | 305 | sorted.sort(uhash_compareScriptSet, status); |
michael@0 | 306 | UnicodeString separator = UNICODE_STRING_SIMPLE("; "); |
michael@0 | 307 | for (int32_t i=0; i<sorted.size(); i++) { |
michael@0 | 308 | if (i>0) { |
michael@0 | 309 | dest.append(separator); |
michael@0 | 310 | } |
michael@0 | 311 | ScriptSet *ss = static_cast<ScriptSet *>(sorted.elementAt(i)); |
michael@0 | 312 | ss->displayScripts(dest); |
michael@0 | 313 | } |
michael@0 | 314 | return dest; |
michael@0 | 315 | } |
michael@0 | 316 | |
michael@0 | 317 | U_NAMESPACE_END |
michael@0 | 318 |