intl/icu/source/i18n/identifier_info.cpp

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

michael@0 1 /*
michael@0 2 **********************************************************************
michael@0 3 * Copyright (C) 2012-2013, International Business Machines
michael@0 4 * Corporation and others. All Rights Reserved.
michael@0 5 **********************************************************************
michael@0 6 */
michael@0 7
michael@0 8 #include "unicode/utypes.h"
michael@0 9
michael@0 10 #include "unicode/uchar.h"
michael@0 11 #include "unicode/utf16.h"
michael@0 12
michael@0 13 #include "identifier_info.h"
michael@0 14 #include "mutex.h"
michael@0 15 #include "scriptset.h"
michael@0 16 #include "ucln_in.h"
michael@0 17 #include "uvector.h"
michael@0 18
michael@0 19 U_NAMESPACE_BEGIN
michael@0 20
michael@0 21 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
michael@0 22
michael@0 23 static UMutex gInitMutex = U_MUTEX_INITIALIZER;
michael@0 24 static UBool gStaticsAreInitialized = FALSE;
michael@0 25
michael@0 26 UnicodeSet *IdentifierInfo::ASCII;
michael@0 27 ScriptSet *IdentifierInfo::JAPANESE;
michael@0 28 ScriptSet *IdentifierInfo::CHINESE;
michael@0 29 ScriptSet *IdentifierInfo::KOREAN;
michael@0 30 ScriptSet *IdentifierInfo::CONFUSABLE_WITH_LATIN;
michael@0 31
michael@0 32 UBool IdentifierInfo::cleanup() {
michael@0 33 delete ASCII;
michael@0 34 ASCII = NULL;
michael@0 35 delete JAPANESE;
michael@0 36 JAPANESE = NULL;
michael@0 37 delete CHINESE;
michael@0 38 CHINESE = NULL;
michael@0 39 delete KOREAN;
michael@0 40 KOREAN = NULL;
michael@0 41 delete CONFUSABLE_WITH_LATIN;
michael@0 42 CONFUSABLE_WITH_LATIN = NULL;
michael@0 43 gStaticsAreInitialized = FALSE;
michael@0 44 return TRUE;
michael@0 45 }
michael@0 46
michael@0 47 U_CDECL_BEGIN
michael@0 48 static UBool U_CALLCONV
michael@0 49 IdentifierInfo_cleanup(void) {
michael@0 50 return IdentifierInfo::cleanup();
michael@0 51 }
michael@0 52 U_CDECL_END
michael@0 53
michael@0 54
michael@0 55 IdentifierInfo::IdentifierInfo(UErrorCode &status):
michael@0 56 fIdentifier(NULL), fRequiredScripts(NULL), fScriptSetSet(NULL),
michael@0 57 fCommonAmongAlternates(NULL), fNumerics(NULL), fIdentifierProfile(NULL) {
michael@0 58 if (U_FAILURE(status)) {
michael@0 59 return;
michael@0 60 }
michael@0 61 {
michael@0 62 Mutex lock(&gInitMutex);
michael@0 63 if (!gStaticsAreInitialized) {
michael@0 64 ASCII = new UnicodeSet(0, 0x7f);
michael@0 65 JAPANESE = new ScriptSet();
michael@0 66 CHINESE = new ScriptSet();
michael@0 67 KOREAN = new ScriptSet();
michael@0 68 CONFUSABLE_WITH_LATIN = new ScriptSet();
michael@0 69 if (ASCII == NULL || JAPANESE == NULL || CHINESE == NULL || KOREAN == NULL
michael@0 70 || CONFUSABLE_WITH_LATIN == NULL) {
michael@0 71 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 72 return;
michael@0 73 }
michael@0 74 ASCII->freeze();
michael@0 75 JAPANESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HIRAGANA, status)
michael@0 76 .set(USCRIPT_KATAKANA, status);
michael@0 77 CHINESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_BOPOMOFO, status);
michael@0 78 KOREAN->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HANGUL, status);
michael@0 79 CONFUSABLE_WITH_LATIN->set(USCRIPT_CYRILLIC, status).set(USCRIPT_GREEK, status)
michael@0 80 .set(USCRIPT_CHEROKEE, status);
michael@0 81 ucln_i18n_registerCleanup(UCLN_I18N_IDENTIFIER_INFO, IdentifierInfo_cleanup);
michael@0 82 gStaticsAreInitialized = TRUE;
michael@0 83 }
michael@0 84 }
michael@0 85 fIdentifier = new UnicodeString();
michael@0 86 fRequiredScripts = new ScriptSet();
michael@0 87 fScriptSetSet = uhash_open(uhash_hashScriptSet, uhash_compareScriptSet, NULL, &status);
michael@0 88 uhash_setKeyDeleter(fScriptSetSet, uhash_deleteScriptSet);
michael@0 89 fCommonAmongAlternates = new ScriptSet();
michael@0 90 fNumerics = new UnicodeSet();
michael@0 91 fIdentifierProfile = new UnicodeSet(0, 0x10FFFF);
michael@0 92
michael@0 93 if (U_SUCCESS(status) && (fIdentifier == NULL || fRequiredScripts == NULL || fScriptSetSet == NULL ||
michael@0 94 fCommonAmongAlternates == NULL || fNumerics == NULL || fIdentifierProfile == NULL)) {
michael@0 95 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 96 }
michael@0 97 }
michael@0 98
michael@0 99 IdentifierInfo::~IdentifierInfo() {
michael@0 100 delete fIdentifier;
michael@0 101 delete fRequiredScripts;
michael@0 102 uhash_close(fScriptSetSet);
michael@0 103 delete fCommonAmongAlternates;
michael@0 104 delete fNumerics;
michael@0 105 delete fIdentifierProfile;
michael@0 106 }
michael@0 107
michael@0 108
michael@0 109 IdentifierInfo &IdentifierInfo::clear() {
michael@0 110 fRequiredScripts->resetAll();
michael@0 111 uhash_removeAll(fScriptSetSet);
michael@0 112 fNumerics->clear();
michael@0 113 fCommonAmongAlternates->resetAll();
michael@0 114 return *this;
michael@0 115 }
michael@0 116
michael@0 117
michael@0 118 IdentifierInfo &IdentifierInfo::setIdentifierProfile(const UnicodeSet &identifierProfile) {
michael@0 119 *fIdentifierProfile = identifierProfile;
michael@0 120 return *this;
michael@0 121 }
michael@0 122
michael@0 123
michael@0 124 const UnicodeSet &IdentifierInfo::getIdentifierProfile() const {
michael@0 125 return *fIdentifierProfile;
michael@0 126 }
michael@0 127
michael@0 128
michael@0 129 IdentifierInfo &IdentifierInfo::setIdentifier(const UnicodeString &identifier, UErrorCode &status) {
michael@0 130 if (U_FAILURE(status)) {
michael@0 131 return *this;
michael@0 132 }
michael@0 133 *fIdentifier = identifier;
michael@0 134 clear();
michael@0 135 ScriptSet scriptsForCP;
michael@0 136 UChar32 cp;
michael@0 137 for (int32_t i = 0; i < identifier.length(); i += U16_LENGTH(cp)) {
michael@0 138 cp = identifier.char32At(i);
michael@0 139 // Store a representative character for each kind of decimal digit
michael@0 140 if (u_charType(cp) == U_DECIMAL_DIGIT_NUMBER) {
michael@0 141 // Just store the zero character as a representative for comparison. Unicode guarantees it is cp - value
michael@0 142 fNumerics->add(cp - (UChar32)u_getNumericValue(cp));
michael@0 143 }
michael@0 144 UScriptCode extensions[500];
michael@0 145 int32_t extensionsCount = uscript_getScriptExtensions(cp, extensions, LENGTHOF(extensions), &status);
michael@0 146 if (U_FAILURE(status)) {
michael@0 147 return *this;
michael@0 148 }
michael@0 149 scriptsForCP.resetAll();
michael@0 150 for (int32_t j=0; j<extensionsCount; j++) {
michael@0 151 scriptsForCP.set(extensions[j], status);
michael@0 152 }
michael@0 153 scriptsForCP.reset(USCRIPT_COMMON, status);
michael@0 154 scriptsForCP.reset(USCRIPT_INHERITED, status);
michael@0 155 switch (scriptsForCP.countMembers()) {
michael@0 156 case 0: break;
michael@0 157 case 1:
michael@0 158 // Single script, record it.
michael@0 159 fRequiredScripts->Union(scriptsForCP);
michael@0 160 break;
michael@0 161 default:
michael@0 162 if (!fRequiredScripts->intersects(scriptsForCP)
michael@0 163 && !uhash_geti(fScriptSetSet, &scriptsForCP)) {
michael@0 164 // If the set hasn't been added already, add it
michael@0 165 // (Add a copy, fScriptSetSet takes ownership of the copy.)
michael@0 166 uhash_puti(fScriptSetSet, new ScriptSet(scriptsForCP), 1, &status);
michael@0 167 }
michael@0 168 break;
michael@0 169 }
michael@0 170 }
michael@0 171 // Now make a final pass through ScriptSetSet to remove alternates that came before singles.
michael@0 172 // [Kana], [Kana Hira] => [Kana]
michael@0 173 // This is relatively infrequent, so doesn't have to be optimized.
michael@0 174 // We also compute any commonalities among the alternates.
michael@0 175 if (uhash_count(fScriptSetSet) > 0) {
michael@0 176 fCommonAmongAlternates->setAll();
michael@0 177 for (int32_t it = -1;;) {
michael@0 178 const UHashElement *nextHashEl = uhash_nextElement(fScriptSetSet, &it);
michael@0 179 if (nextHashEl == NULL) {
michael@0 180 break;
michael@0 181 }
michael@0 182 ScriptSet *next = static_cast<ScriptSet *>(nextHashEl->key.pointer);
michael@0 183 // [Kana], [Kana Hira] => [Kana]
michael@0 184 if (fRequiredScripts->intersects(*next)) {
michael@0 185 uhash_removeElement(fScriptSetSet, nextHashEl);
michael@0 186 } else {
michael@0 187 fCommonAmongAlternates->intersect(*next);
michael@0 188 // [[Arab Syrc Thaa]; [Arab Syrc]] => [[Arab Syrc]]
michael@0 189 for (int32_t otherIt = -1;;) {
michael@0 190 const UHashElement *otherHashEl = uhash_nextElement(fScriptSetSet, &otherIt);
michael@0 191 if (otherHashEl == NULL) {
michael@0 192 break;
michael@0 193 }
michael@0 194 ScriptSet *other = static_cast<ScriptSet *>(otherHashEl->key.pointer);
michael@0 195 if (next != other && next->contains(*other)) {
michael@0 196 uhash_removeElement(fScriptSetSet, nextHashEl);
michael@0 197 break;
michael@0 198 }
michael@0 199 }
michael@0 200 }
michael@0 201 }
michael@0 202 }
michael@0 203 if (uhash_count(fScriptSetSet) == 0) {
michael@0 204 fCommonAmongAlternates->resetAll();
michael@0 205 }
michael@0 206 return *this;
michael@0 207 }
michael@0 208
michael@0 209
michael@0 210 const UnicodeString *IdentifierInfo::getIdentifier() const {
michael@0 211 return fIdentifier;
michael@0 212 }
michael@0 213
michael@0 214 const ScriptSet *IdentifierInfo::getScripts() const {
michael@0 215 return fRequiredScripts;
michael@0 216 }
michael@0 217
michael@0 218 const UHashtable *IdentifierInfo::getAlternates() const {
michael@0 219 return fScriptSetSet;
michael@0 220 }
michael@0 221
michael@0 222
michael@0 223 const UnicodeSet *IdentifierInfo::getNumerics() const {
michael@0 224 return fNumerics;
michael@0 225 }
michael@0 226
michael@0 227 const ScriptSet *IdentifierInfo::getCommonAmongAlternates() const {
michael@0 228 return fCommonAmongAlternates;
michael@0 229 }
michael@0 230
michael@0 231 #if !UCONFIG_NO_NORMALIZATION
michael@0 232
michael@0 233 URestrictionLevel IdentifierInfo::getRestrictionLevel(UErrorCode &status) const {
michael@0 234 if (!fIdentifierProfile->containsAll(*fIdentifier) || getNumerics()->size() > 1) {
michael@0 235 return USPOOF_UNRESTRICTIVE;
michael@0 236 }
michael@0 237 if (ASCII->containsAll(*fIdentifier)) {
michael@0 238 return USPOOF_ASCII;
michael@0 239 }
michael@0 240 // This is a bit tricky. We look at a number of factors.
michael@0 241 // The number of scripts in the text.
michael@0 242 // Plus 1 if there is some commonality among the alternates (eg [Arab Thaa]; [Arab Syrc])
michael@0 243 // Plus number of alternates otherwise (this only works because we only test cardinality up to 2.)
michael@0 244
michael@0 245 // Note: the requiredScripts set omits COMMON and INHERITED; they are taken out at the
michael@0 246 // time it is created, in setIdentifier().
michael@0 247 int32_t cardinalityPlus = fRequiredScripts->countMembers() +
michael@0 248 (fCommonAmongAlternates->countMembers() == 0 ? uhash_count(fScriptSetSet) : 1);
michael@0 249 if (cardinalityPlus < 2) {
michael@0 250 return USPOOF_HIGHLY_RESTRICTIVE;
michael@0 251 }
michael@0 252 if (containsWithAlternates(*JAPANESE, *fRequiredScripts) || containsWithAlternates(*CHINESE, *fRequiredScripts)
michael@0 253 || containsWithAlternates(*KOREAN, *fRequiredScripts)) {
michael@0 254 return USPOOF_HIGHLY_RESTRICTIVE;
michael@0 255 }
michael@0 256 if (cardinalityPlus == 2 &&
michael@0 257 fRequiredScripts->test(USCRIPT_LATIN, status) &&
michael@0 258 !fRequiredScripts->intersects(*CONFUSABLE_WITH_LATIN)) {
michael@0 259 return USPOOF_MODERATELY_RESTRICTIVE;
michael@0 260 }
michael@0 261 return USPOOF_MINIMALLY_RESTRICTIVE;
michael@0 262 }
michael@0 263
michael@0 264 #endif /* !UCONFIG_NO_NORMALIZATION */
michael@0 265
michael@0 266 int32_t IdentifierInfo::getScriptCount() const {
michael@0 267 // Note: Common and Inherited scripts were removed by setIdentifier(), and do not appear in fRequiredScripts.
michael@0 268 int32_t count = fRequiredScripts->countMembers() +
michael@0 269 (fCommonAmongAlternates->countMembers() == 0 ? uhash_count(fScriptSetSet) : 1);
michael@0 270 return count;
michael@0 271 }
michael@0 272
michael@0 273
michael@0 274
michael@0 275 UBool IdentifierInfo::containsWithAlternates(const ScriptSet &container, const ScriptSet &containee) const {
michael@0 276 if (!container.contains(containee)) {
michael@0 277 return FALSE;
michael@0 278 }
michael@0 279 for (int32_t iter = -1; ;) {
michael@0 280 const UHashElement *hashEl = uhash_nextElement(fScriptSetSet, &iter);
michael@0 281 if (hashEl == NULL) {
michael@0 282 break;
michael@0 283 }
michael@0 284 ScriptSet *alternatives = static_cast<ScriptSet *>(hashEl->key.pointer);
michael@0 285 if (!container.intersects(*alternatives)) {
michael@0 286 return false;
michael@0 287 }
michael@0 288 }
michael@0 289 return true;
michael@0 290 }
michael@0 291
michael@0 292 UnicodeString &IdentifierInfo::displayAlternates(UnicodeString &dest, const UHashtable *alternates, UErrorCode &status) {
michael@0 293 UVector sorted(status);
michael@0 294 if (U_FAILURE(status)) {
michael@0 295 return dest;
michael@0 296 }
michael@0 297 for (int32_t pos = -1; ;) {
michael@0 298 const UHashElement *el = uhash_nextElement(alternates, &pos);
michael@0 299 if (el == NULL) {
michael@0 300 break;
michael@0 301 }
michael@0 302 ScriptSet *ss = static_cast<ScriptSet *>(el->key.pointer);
michael@0 303 sorted.addElement(ss, status);
michael@0 304 }
michael@0 305 sorted.sort(uhash_compareScriptSet, status);
michael@0 306 UnicodeString separator = UNICODE_STRING_SIMPLE("; ");
michael@0 307 for (int32_t i=0; i<sorted.size(); i++) {
michael@0 308 if (i>0) {
michael@0 309 dest.append(separator);
michael@0 310 }
michael@0 311 ScriptSet *ss = static_cast<ScriptSet *>(sorted.elementAt(i));
michael@0 312 ss->displayScripts(dest);
michael@0 313 }
michael@0 314 return dest;
michael@0 315 }
michael@0 316
michael@0 317 U_NAMESPACE_END
michael@0 318

mercurial