1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/i18n/identifier_info.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,318 @@ 1.4 +/* 1.5 +********************************************************************** 1.6 +* Copyright (C) 2012-2013, International Business Machines 1.7 +* Corporation and others. All Rights Reserved. 1.8 +********************************************************************** 1.9 +*/ 1.10 + 1.11 +#include "unicode/utypes.h" 1.12 + 1.13 +#include "unicode/uchar.h" 1.14 +#include "unicode/utf16.h" 1.15 + 1.16 +#include "identifier_info.h" 1.17 +#include "mutex.h" 1.18 +#include "scriptset.h" 1.19 +#include "ucln_in.h" 1.20 +#include "uvector.h" 1.21 + 1.22 +U_NAMESPACE_BEGIN 1.23 + 1.24 +#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 1.25 + 1.26 +static UMutex gInitMutex = U_MUTEX_INITIALIZER; 1.27 +static UBool gStaticsAreInitialized = FALSE; 1.28 + 1.29 +UnicodeSet *IdentifierInfo::ASCII; 1.30 +ScriptSet *IdentifierInfo::JAPANESE; 1.31 +ScriptSet *IdentifierInfo::CHINESE; 1.32 +ScriptSet *IdentifierInfo::KOREAN; 1.33 +ScriptSet *IdentifierInfo::CONFUSABLE_WITH_LATIN; 1.34 + 1.35 +UBool IdentifierInfo::cleanup() { 1.36 + delete ASCII; 1.37 + ASCII = NULL; 1.38 + delete JAPANESE; 1.39 + JAPANESE = NULL; 1.40 + delete CHINESE; 1.41 + CHINESE = NULL; 1.42 + delete KOREAN; 1.43 + KOREAN = NULL; 1.44 + delete CONFUSABLE_WITH_LATIN; 1.45 + CONFUSABLE_WITH_LATIN = NULL; 1.46 + gStaticsAreInitialized = FALSE; 1.47 + return TRUE; 1.48 +} 1.49 + 1.50 +U_CDECL_BEGIN 1.51 +static UBool U_CALLCONV 1.52 +IdentifierInfo_cleanup(void) { 1.53 + return IdentifierInfo::cleanup(); 1.54 +} 1.55 +U_CDECL_END 1.56 + 1.57 + 1.58 +IdentifierInfo::IdentifierInfo(UErrorCode &status): 1.59 + fIdentifier(NULL), fRequiredScripts(NULL), fScriptSetSet(NULL), 1.60 + fCommonAmongAlternates(NULL), fNumerics(NULL), fIdentifierProfile(NULL) { 1.61 + if (U_FAILURE(status)) { 1.62 + return; 1.63 + } 1.64 + { 1.65 + Mutex lock(&gInitMutex); 1.66 + if (!gStaticsAreInitialized) { 1.67 + ASCII = new UnicodeSet(0, 0x7f); 1.68 + JAPANESE = new ScriptSet(); 1.69 + CHINESE = new ScriptSet(); 1.70 + KOREAN = new ScriptSet(); 1.71 + CONFUSABLE_WITH_LATIN = new ScriptSet(); 1.72 + if (ASCII == NULL || JAPANESE == NULL || CHINESE == NULL || KOREAN == NULL 1.73 + || CONFUSABLE_WITH_LATIN == NULL) { 1.74 + status = U_MEMORY_ALLOCATION_ERROR; 1.75 + return; 1.76 + } 1.77 + ASCII->freeze(); 1.78 + JAPANESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HIRAGANA, status) 1.79 + .set(USCRIPT_KATAKANA, status); 1.80 + CHINESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_BOPOMOFO, status); 1.81 + KOREAN->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HANGUL, status); 1.82 + CONFUSABLE_WITH_LATIN->set(USCRIPT_CYRILLIC, status).set(USCRIPT_GREEK, status) 1.83 + .set(USCRIPT_CHEROKEE, status); 1.84 + ucln_i18n_registerCleanup(UCLN_I18N_IDENTIFIER_INFO, IdentifierInfo_cleanup); 1.85 + gStaticsAreInitialized = TRUE; 1.86 + } 1.87 + } 1.88 + fIdentifier = new UnicodeString(); 1.89 + fRequiredScripts = new ScriptSet(); 1.90 + fScriptSetSet = uhash_open(uhash_hashScriptSet, uhash_compareScriptSet, NULL, &status); 1.91 + uhash_setKeyDeleter(fScriptSetSet, uhash_deleteScriptSet); 1.92 + fCommonAmongAlternates = new ScriptSet(); 1.93 + fNumerics = new UnicodeSet(); 1.94 + fIdentifierProfile = new UnicodeSet(0, 0x10FFFF); 1.95 + 1.96 + if (U_SUCCESS(status) && (fIdentifier == NULL || fRequiredScripts == NULL || fScriptSetSet == NULL || 1.97 + fCommonAmongAlternates == NULL || fNumerics == NULL || fIdentifierProfile == NULL)) { 1.98 + status = U_MEMORY_ALLOCATION_ERROR; 1.99 + } 1.100 +} 1.101 + 1.102 +IdentifierInfo::~IdentifierInfo() { 1.103 + delete fIdentifier; 1.104 + delete fRequiredScripts; 1.105 + uhash_close(fScriptSetSet); 1.106 + delete fCommonAmongAlternates; 1.107 + delete fNumerics; 1.108 + delete fIdentifierProfile; 1.109 +} 1.110 + 1.111 + 1.112 +IdentifierInfo &IdentifierInfo::clear() { 1.113 + fRequiredScripts->resetAll(); 1.114 + uhash_removeAll(fScriptSetSet); 1.115 + fNumerics->clear(); 1.116 + fCommonAmongAlternates->resetAll(); 1.117 + return *this; 1.118 +} 1.119 + 1.120 + 1.121 +IdentifierInfo &IdentifierInfo::setIdentifierProfile(const UnicodeSet &identifierProfile) { 1.122 + *fIdentifierProfile = identifierProfile; 1.123 + return *this; 1.124 +} 1.125 + 1.126 + 1.127 +const UnicodeSet &IdentifierInfo::getIdentifierProfile() const { 1.128 + return *fIdentifierProfile; 1.129 +} 1.130 + 1.131 + 1.132 +IdentifierInfo &IdentifierInfo::setIdentifier(const UnicodeString &identifier, UErrorCode &status) { 1.133 + if (U_FAILURE(status)) { 1.134 + return *this; 1.135 + } 1.136 + *fIdentifier = identifier; 1.137 + clear(); 1.138 + ScriptSet scriptsForCP; 1.139 + UChar32 cp; 1.140 + for (int32_t i = 0; i < identifier.length(); i += U16_LENGTH(cp)) { 1.141 + cp = identifier.char32At(i); 1.142 + // Store a representative character for each kind of decimal digit 1.143 + if (u_charType(cp) == U_DECIMAL_DIGIT_NUMBER) { 1.144 + // Just store the zero character as a representative for comparison. Unicode guarantees it is cp - value 1.145 + fNumerics->add(cp - (UChar32)u_getNumericValue(cp)); 1.146 + } 1.147 + UScriptCode extensions[500]; 1.148 + int32_t extensionsCount = uscript_getScriptExtensions(cp, extensions, LENGTHOF(extensions), &status); 1.149 + if (U_FAILURE(status)) { 1.150 + return *this; 1.151 + } 1.152 + scriptsForCP.resetAll(); 1.153 + for (int32_t j=0; j<extensionsCount; j++) { 1.154 + scriptsForCP.set(extensions[j], status); 1.155 + } 1.156 + scriptsForCP.reset(USCRIPT_COMMON, status); 1.157 + scriptsForCP.reset(USCRIPT_INHERITED, status); 1.158 + switch (scriptsForCP.countMembers()) { 1.159 + case 0: break; 1.160 + case 1: 1.161 + // Single script, record it. 1.162 + fRequiredScripts->Union(scriptsForCP); 1.163 + break; 1.164 + default: 1.165 + if (!fRequiredScripts->intersects(scriptsForCP) 1.166 + && !uhash_geti(fScriptSetSet, &scriptsForCP)) { 1.167 + // If the set hasn't been added already, add it 1.168 + // (Add a copy, fScriptSetSet takes ownership of the copy.) 1.169 + uhash_puti(fScriptSetSet, new ScriptSet(scriptsForCP), 1, &status); 1.170 + } 1.171 + break; 1.172 + } 1.173 + } 1.174 + // Now make a final pass through ScriptSetSet to remove alternates that came before singles. 1.175 + // [Kana], [Kana Hira] => [Kana] 1.176 + // This is relatively infrequent, so doesn't have to be optimized. 1.177 + // We also compute any commonalities among the alternates. 1.178 + if (uhash_count(fScriptSetSet) > 0) { 1.179 + fCommonAmongAlternates->setAll(); 1.180 + for (int32_t it = -1;;) { 1.181 + const UHashElement *nextHashEl = uhash_nextElement(fScriptSetSet, &it); 1.182 + if (nextHashEl == NULL) { 1.183 + break; 1.184 + } 1.185 + ScriptSet *next = static_cast<ScriptSet *>(nextHashEl->key.pointer); 1.186 + // [Kana], [Kana Hira] => [Kana] 1.187 + if (fRequiredScripts->intersects(*next)) { 1.188 + uhash_removeElement(fScriptSetSet, nextHashEl); 1.189 + } else { 1.190 + fCommonAmongAlternates->intersect(*next); 1.191 + // [[Arab Syrc Thaa]; [Arab Syrc]] => [[Arab Syrc]] 1.192 + for (int32_t otherIt = -1;;) { 1.193 + const UHashElement *otherHashEl = uhash_nextElement(fScriptSetSet, &otherIt); 1.194 + if (otherHashEl == NULL) { 1.195 + break; 1.196 + } 1.197 + ScriptSet *other = static_cast<ScriptSet *>(otherHashEl->key.pointer); 1.198 + if (next != other && next->contains(*other)) { 1.199 + uhash_removeElement(fScriptSetSet, nextHashEl); 1.200 + break; 1.201 + } 1.202 + } 1.203 + } 1.204 + } 1.205 + } 1.206 + if (uhash_count(fScriptSetSet) == 0) { 1.207 + fCommonAmongAlternates->resetAll(); 1.208 + } 1.209 + return *this; 1.210 +} 1.211 + 1.212 + 1.213 +const UnicodeString *IdentifierInfo::getIdentifier() const { 1.214 + return fIdentifier; 1.215 +} 1.216 + 1.217 +const ScriptSet *IdentifierInfo::getScripts() const { 1.218 + return fRequiredScripts; 1.219 +} 1.220 + 1.221 +const UHashtable *IdentifierInfo::getAlternates() const { 1.222 + return fScriptSetSet; 1.223 +} 1.224 + 1.225 + 1.226 +const UnicodeSet *IdentifierInfo::getNumerics() const { 1.227 + return fNumerics; 1.228 +} 1.229 + 1.230 +const ScriptSet *IdentifierInfo::getCommonAmongAlternates() const { 1.231 + return fCommonAmongAlternates; 1.232 +} 1.233 + 1.234 +#if !UCONFIG_NO_NORMALIZATION 1.235 + 1.236 +URestrictionLevel IdentifierInfo::getRestrictionLevel(UErrorCode &status) const { 1.237 + if (!fIdentifierProfile->containsAll(*fIdentifier) || getNumerics()->size() > 1) { 1.238 + return USPOOF_UNRESTRICTIVE; 1.239 + } 1.240 + if (ASCII->containsAll(*fIdentifier)) { 1.241 + return USPOOF_ASCII; 1.242 + } 1.243 + // This is a bit tricky. We look at a number of factors. 1.244 + // The number of scripts in the text. 1.245 + // Plus 1 if there is some commonality among the alternates (eg [Arab Thaa]; [Arab Syrc]) 1.246 + // Plus number of alternates otherwise (this only works because we only test cardinality up to 2.) 1.247 + 1.248 + // Note: the requiredScripts set omits COMMON and INHERITED; they are taken out at the 1.249 + // time it is created, in setIdentifier(). 1.250 + int32_t cardinalityPlus = fRequiredScripts->countMembers() + 1.251 + (fCommonAmongAlternates->countMembers() == 0 ? uhash_count(fScriptSetSet) : 1); 1.252 + if (cardinalityPlus < 2) { 1.253 + return USPOOF_HIGHLY_RESTRICTIVE; 1.254 + } 1.255 + if (containsWithAlternates(*JAPANESE, *fRequiredScripts) || containsWithAlternates(*CHINESE, *fRequiredScripts) 1.256 + || containsWithAlternates(*KOREAN, *fRequiredScripts)) { 1.257 + return USPOOF_HIGHLY_RESTRICTIVE; 1.258 + } 1.259 + if (cardinalityPlus == 2 && 1.260 + fRequiredScripts->test(USCRIPT_LATIN, status) && 1.261 + !fRequiredScripts->intersects(*CONFUSABLE_WITH_LATIN)) { 1.262 + return USPOOF_MODERATELY_RESTRICTIVE; 1.263 + } 1.264 + return USPOOF_MINIMALLY_RESTRICTIVE; 1.265 +} 1.266 + 1.267 +#endif /* !UCONFIG_NO_NORMALIZATION */ 1.268 + 1.269 +int32_t IdentifierInfo::getScriptCount() const { 1.270 + // Note: Common and Inherited scripts were removed by setIdentifier(), and do not appear in fRequiredScripts. 1.271 + int32_t count = fRequiredScripts->countMembers() + 1.272 + (fCommonAmongAlternates->countMembers() == 0 ? uhash_count(fScriptSetSet) : 1); 1.273 + return count; 1.274 +} 1.275 + 1.276 + 1.277 + 1.278 +UBool IdentifierInfo::containsWithAlternates(const ScriptSet &container, const ScriptSet &containee) const { 1.279 + if (!container.contains(containee)) { 1.280 + return FALSE; 1.281 + } 1.282 + for (int32_t iter = -1; ;) { 1.283 + const UHashElement *hashEl = uhash_nextElement(fScriptSetSet, &iter); 1.284 + if (hashEl == NULL) { 1.285 + break; 1.286 + } 1.287 + ScriptSet *alternatives = static_cast<ScriptSet *>(hashEl->key.pointer); 1.288 + if (!container.intersects(*alternatives)) { 1.289 + return false; 1.290 + } 1.291 + } 1.292 + return true; 1.293 +} 1.294 + 1.295 +UnicodeString &IdentifierInfo::displayAlternates(UnicodeString &dest, const UHashtable *alternates, UErrorCode &status) { 1.296 + UVector sorted(status); 1.297 + if (U_FAILURE(status)) { 1.298 + return dest; 1.299 + } 1.300 + for (int32_t pos = -1; ;) { 1.301 + const UHashElement *el = uhash_nextElement(alternates, &pos); 1.302 + if (el == NULL) { 1.303 + break; 1.304 + } 1.305 + ScriptSet *ss = static_cast<ScriptSet *>(el->key.pointer); 1.306 + sorted.addElement(ss, status); 1.307 + } 1.308 + sorted.sort(uhash_compareScriptSet, status); 1.309 + UnicodeString separator = UNICODE_STRING_SIMPLE("; "); 1.310 + for (int32_t i=0; i<sorted.size(); i++) { 1.311 + if (i>0) { 1.312 + dest.append(separator); 1.313 + } 1.314 + ScriptSet *ss = static_cast<ScriptSet *>(sorted.elementAt(i)); 1.315 + ss->displayScripts(dest); 1.316 + } 1.317 + return dest; 1.318 +} 1.319 + 1.320 +U_NAMESPACE_END 1.321 +