|
1 /* |
|
2 ********************************************************************** |
|
3 * Copyright (C) 2012-2013, International Business Machines |
|
4 * Corporation and others. All Rights Reserved. |
|
5 ********************************************************************** |
|
6 */ |
|
7 |
|
8 #include "unicode/utypes.h" |
|
9 |
|
10 #include "unicode/uchar.h" |
|
11 #include "unicode/utf16.h" |
|
12 |
|
13 #include "identifier_info.h" |
|
14 #include "mutex.h" |
|
15 #include "scriptset.h" |
|
16 #include "ucln_in.h" |
|
17 #include "uvector.h" |
|
18 |
|
19 U_NAMESPACE_BEGIN |
|
20 |
|
21 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) |
|
22 |
|
23 static UMutex gInitMutex = U_MUTEX_INITIALIZER; |
|
24 static UBool gStaticsAreInitialized = FALSE; |
|
25 |
|
26 UnicodeSet *IdentifierInfo::ASCII; |
|
27 ScriptSet *IdentifierInfo::JAPANESE; |
|
28 ScriptSet *IdentifierInfo::CHINESE; |
|
29 ScriptSet *IdentifierInfo::KOREAN; |
|
30 ScriptSet *IdentifierInfo::CONFUSABLE_WITH_LATIN; |
|
31 |
|
32 UBool IdentifierInfo::cleanup() { |
|
33 delete ASCII; |
|
34 ASCII = NULL; |
|
35 delete JAPANESE; |
|
36 JAPANESE = NULL; |
|
37 delete CHINESE; |
|
38 CHINESE = NULL; |
|
39 delete KOREAN; |
|
40 KOREAN = NULL; |
|
41 delete CONFUSABLE_WITH_LATIN; |
|
42 CONFUSABLE_WITH_LATIN = NULL; |
|
43 gStaticsAreInitialized = FALSE; |
|
44 return TRUE; |
|
45 } |
|
46 |
|
47 U_CDECL_BEGIN |
|
48 static UBool U_CALLCONV |
|
49 IdentifierInfo_cleanup(void) { |
|
50 return IdentifierInfo::cleanup(); |
|
51 } |
|
52 U_CDECL_END |
|
53 |
|
54 |
|
55 IdentifierInfo::IdentifierInfo(UErrorCode &status): |
|
56 fIdentifier(NULL), fRequiredScripts(NULL), fScriptSetSet(NULL), |
|
57 fCommonAmongAlternates(NULL), fNumerics(NULL), fIdentifierProfile(NULL) { |
|
58 if (U_FAILURE(status)) { |
|
59 return; |
|
60 } |
|
61 { |
|
62 Mutex lock(&gInitMutex); |
|
63 if (!gStaticsAreInitialized) { |
|
64 ASCII = new UnicodeSet(0, 0x7f); |
|
65 JAPANESE = new ScriptSet(); |
|
66 CHINESE = new ScriptSet(); |
|
67 KOREAN = new ScriptSet(); |
|
68 CONFUSABLE_WITH_LATIN = new ScriptSet(); |
|
69 if (ASCII == NULL || JAPANESE == NULL || CHINESE == NULL || KOREAN == NULL |
|
70 || CONFUSABLE_WITH_LATIN == NULL) { |
|
71 status = U_MEMORY_ALLOCATION_ERROR; |
|
72 return; |
|
73 } |
|
74 ASCII->freeze(); |
|
75 JAPANESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HIRAGANA, status) |
|
76 .set(USCRIPT_KATAKANA, status); |
|
77 CHINESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_BOPOMOFO, status); |
|
78 KOREAN->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HANGUL, status); |
|
79 CONFUSABLE_WITH_LATIN->set(USCRIPT_CYRILLIC, status).set(USCRIPT_GREEK, status) |
|
80 .set(USCRIPT_CHEROKEE, status); |
|
81 ucln_i18n_registerCleanup(UCLN_I18N_IDENTIFIER_INFO, IdentifierInfo_cleanup); |
|
82 gStaticsAreInitialized = TRUE; |
|
83 } |
|
84 } |
|
85 fIdentifier = new UnicodeString(); |
|
86 fRequiredScripts = new ScriptSet(); |
|
87 fScriptSetSet = uhash_open(uhash_hashScriptSet, uhash_compareScriptSet, NULL, &status); |
|
88 uhash_setKeyDeleter(fScriptSetSet, uhash_deleteScriptSet); |
|
89 fCommonAmongAlternates = new ScriptSet(); |
|
90 fNumerics = new UnicodeSet(); |
|
91 fIdentifierProfile = new UnicodeSet(0, 0x10FFFF); |
|
92 |
|
93 if (U_SUCCESS(status) && (fIdentifier == NULL || fRequiredScripts == NULL || fScriptSetSet == NULL || |
|
94 fCommonAmongAlternates == NULL || fNumerics == NULL || fIdentifierProfile == NULL)) { |
|
95 status = U_MEMORY_ALLOCATION_ERROR; |
|
96 } |
|
97 } |
|
98 |
|
99 IdentifierInfo::~IdentifierInfo() { |
|
100 delete fIdentifier; |
|
101 delete fRequiredScripts; |
|
102 uhash_close(fScriptSetSet); |
|
103 delete fCommonAmongAlternates; |
|
104 delete fNumerics; |
|
105 delete fIdentifierProfile; |
|
106 } |
|
107 |
|
108 |
|
109 IdentifierInfo &IdentifierInfo::clear() { |
|
110 fRequiredScripts->resetAll(); |
|
111 uhash_removeAll(fScriptSetSet); |
|
112 fNumerics->clear(); |
|
113 fCommonAmongAlternates->resetAll(); |
|
114 return *this; |
|
115 } |
|
116 |
|
117 |
|
118 IdentifierInfo &IdentifierInfo::setIdentifierProfile(const UnicodeSet &identifierProfile) { |
|
119 *fIdentifierProfile = identifierProfile; |
|
120 return *this; |
|
121 } |
|
122 |
|
123 |
|
124 const UnicodeSet &IdentifierInfo::getIdentifierProfile() const { |
|
125 return *fIdentifierProfile; |
|
126 } |
|
127 |
|
128 |
|
129 IdentifierInfo &IdentifierInfo::setIdentifier(const UnicodeString &identifier, UErrorCode &status) { |
|
130 if (U_FAILURE(status)) { |
|
131 return *this; |
|
132 } |
|
133 *fIdentifier = identifier; |
|
134 clear(); |
|
135 ScriptSet scriptsForCP; |
|
136 UChar32 cp; |
|
137 for (int32_t i = 0; i < identifier.length(); i += U16_LENGTH(cp)) { |
|
138 cp = identifier.char32At(i); |
|
139 // Store a representative character for each kind of decimal digit |
|
140 if (u_charType(cp) == U_DECIMAL_DIGIT_NUMBER) { |
|
141 // Just store the zero character as a representative for comparison. Unicode guarantees it is cp - value |
|
142 fNumerics->add(cp - (UChar32)u_getNumericValue(cp)); |
|
143 } |
|
144 UScriptCode extensions[500]; |
|
145 int32_t extensionsCount = uscript_getScriptExtensions(cp, extensions, LENGTHOF(extensions), &status); |
|
146 if (U_FAILURE(status)) { |
|
147 return *this; |
|
148 } |
|
149 scriptsForCP.resetAll(); |
|
150 for (int32_t j=0; j<extensionsCount; j++) { |
|
151 scriptsForCP.set(extensions[j], status); |
|
152 } |
|
153 scriptsForCP.reset(USCRIPT_COMMON, status); |
|
154 scriptsForCP.reset(USCRIPT_INHERITED, status); |
|
155 switch (scriptsForCP.countMembers()) { |
|
156 case 0: break; |
|
157 case 1: |
|
158 // Single script, record it. |
|
159 fRequiredScripts->Union(scriptsForCP); |
|
160 break; |
|
161 default: |
|
162 if (!fRequiredScripts->intersects(scriptsForCP) |
|
163 && !uhash_geti(fScriptSetSet, &scriptsForCP)) { |
|
164 // If the set hasn't been added already, add it |
|
165 // (Add a copy, fScriptSetSet takes ownership of the copy.) |
|
166 uhash_puti(fScriptSetSet, new ScriptSet(scriptsForCP), 1, &status); |
|
167 } |
|
168 break; |
|
169 } |
|
170 } |
|
171 // Now make a final pass through ScriptSetSet to remove alternates that came before singles. |
|
172 // [Kana], [Kana Hira] => [Kana] |
|
173 // This is relatively infrequent, so doesn't have to be optimized. |
|
174 // We also compute any commonalities among the alternates. |
|
175 if (uhash_count(fScriptSetSet) > 0) { |
|
176 fCommonAmongAlternates->setAll(); |
|
177 for (int32_t it = -1;;) { |
|
178 const UHashElement *nextHashEl = uhash_nextElement(fScriptSetSet, &it); |
|
179 if (nextHashEl == NULL) { |
|
180 break; |
|
181 } |
|
182 ScriptSet *next = static_cast<ScriptSet *>(nextHashEl->key.pointer); |
|
183 // [Kana], [Kana Hira] => [Kana] |
|
184 if (fRequiredScripts->intersects(*next)) { |
|
185 uhash_removeElement(fScriptSetSet, nextHashEl); |
|
186 } else { |
|
187 fCommonAmongAlternates->intersect(*next); |
|
188 // [[Arab Syrc Thaa]; [Arab Syrc]] => [[Arab Syrc]] |
|
189 for (int32_t otherIt = -1;;) { |
|
190 const UHashElement *otherHashEl = uhash_nextElement(fScriptSetSet, &otherIt); |
|
191 if (otherHashEl == NULL) { |
|
192 break; |
|
193 } |
|
194 ScriptSet *other = static_cast<ScriptSet *>(otherHashEl->key.pointer); |
|
195 if (next != other && next->contains(*other)) { |
|
196 uhash_removeElement(fScriptSetSet, nextHashEl); |
|
197 break; |
|
198 } |
|
199 } |
|
200 } |
|
201 } |
|
202 } |
|
203 if (uhash_count(fScriptSetSet) == 0) { |
|
204 fCommonAmongAlternates->resetAll(); |
|
205 } |
|
206 return *this; |
|
207 } |
|
208 |
|
209 |
|
210 const UnicodeString *IdentifierInfo::getIdentifier() const { |
|
211 return fIdentifier; |
|
212 } |
|
213 |
|
214 const ScriptSet *IdentifierInfo::getScripts() const { |
|
215 return fRequiredScripts; |
|
216 } |
|
217 |
|
218 const UHashtable *IdentifierInfo::getAlternates() const { |
|
219 return fScriptSetSet; |
|
220 } |
|
221 |
|
222 |
|
223 const UnicodeSet *IdentifierInfo::getNumerics() const { |
|
224 return fNumerics; |
|
225 } |
|
226 |
|
227 const ScriptSet *IdentifierInfo::getCommonAmongAlternates() const { |
|
228 return fCommonAmongAlternates; |
|
229 } |
|
230 |
|
231 #if !UCONFIG_NO_NORMALIZATION |
|
232 |
|
233 URestrictionLevel IdentifierInfo::getRestrictionLevel(UErrorCode &status) const { |
|
234 if (!fIdentifierProfile->containsAll(*fIdentifier) || getNumerics()->size() > 1) { |
|
235 return USPOOF_UNRESTRICTIVE; |
|
236 } |
|
237 if (ASCII->containsAll(*fIdentifier)) { |
|
238 return USPOOF_ASCII; |
|
239 } |
|
240 // This is a bit tricky. We look at a number of factors. |
|
241 // The number of scripts in the text. |
|
242 // Plus 1 if there is some commonality among the alternates (eg [Arab Thaa]; [Arab Syrc]) |
|
243 // Plus number of alternates otherwise (this only works because we only test cardinality up to 2.) |
|
244 |
|
245 // Note: the requiredScripts set omits COMMON and INHERITED; they are taken out at the |
|
246 // time it is created, in setIdentifier(). |
|
247 int32_t cardinalityPlus = fRequiredScripts->countMembers() + |
|
248 (fCommonAmongAlternates->countMembers() == 0 ? uhash_count(fScriptSetSet) : 1); |
|
249 if (cardinalityPlus < 2) { |
|
250 return USPOOF_HIGHLY_RESTRICTIVE; |
|
251 } |
|
252 if (containsWithAlternates(*JAPANESE, *fRequiredScripts) || containsWithAlternates(*CHINESE, *fRequiredScripts) |
|
253 || containsWithAlternates(*KOREAN, *fRequiredScripts)) { |
|
254 return USPOOF_HIGHLY_RESTRICTIVE; |
|
255 } |
|
256 if (cardinalityPlus == 2 && |
|
257 fRequiredScripts->test(USCRIPT_LATIN, status) && |
|
258 !fRequiredScripts->intersects(*CONFUSABLE_WITH_LATIN)) { |
|
259 return USPOOF_MODERATELY_RESTRICTIVE; |
|
260 } |
|
261 return USPOOF_MINIMALLY_RESTRICTIVE; |
|
262 } |
|
263 |
|
264 #endif /* !UCONFIG_NO_NORMALIZATION */ |
|
265 |
|
266 int32_t IdentifierInfo::getScriptCount() const { |
|
267 // Note: Common and Inherited scripts were removed by setIdentifier(), and do not appear in fRequiredScripts. |
|
268 int32_t count = fRequiredScripts->countMembers() + |
|
269 (fCommonAmongAlternates->countMembers() == 0 ? uhash_count(fScriptSetSet) : 1); |
|
270 return count; |
|
271 } |
|
272 |
|
273 |
|
274 |
|
275 UBool IdentifierInfo::containsWithAlternates(const ScriptSet &container, const ScriptSet &containee) const { |
|
276 if (!container.contains(containee)) { |
|
277 return FALSE; |
|
278 } |
|
279 for (int32_t iter = -1; ;) { |
|
280 const UHashElement *hashEl = uhash_nextElement(fScriptSetSet, &iter); |
|
281 if (hashEl == NULL) { |
|
282 break; |
|
283 } |
|
284 ScriptSet *alternatives = static_cast<ScriptSet *>(hashEl->key.pointer); |
|
285 if (!container.intersects(*alternatives)) { |
|
286 return false; |
|
287 } |
|
288 } |
|
289 return true; |
|
290 } |
|
291 |
|
292 UnicodeString &IdentifierInfo::displayAlternates(UnicodeString &dest, const UHashtable *alternates, UErrorCode &status) { |
|
293 UVector sorted(status); |
|
294 if (U_FAILURE(status)) { |
|
295 return dest; |
|
296 } |
|
297 for (int32_t pos = -1; ;) { |
|
298 const UHashElement *el = uhash_nextElement(alternates, &pos); |
|
299 if (el == NULL) { |
|
300 break; |
|
301 } |
|
302 ScriptSet *ss = static_cast<ScriptSet *>(el->key.pointer); |
|
303 sorted.addElement(ss, status); |
|
304 } |
|
305 sorted.sort(uhash_compareScriptSet, status); |
|
306 UnicodeString separator = UNICODE_STRING_SIMPLE("; "); |
|
307 for (int32_t i=0; i<sorted.size(); i++) { |
|
308 if (i>0) { |
|
309 dest.append(separator); |
|
310 } |
|
311 ScriptSet *ss = static_cast<ScriptSet *>(sorted.elementAt(i)); |
|
312 ss->displayScripts(dest); |
|
313 } |
|
314 return dest; |
|
315 } |
|
316 |
|
317 U_NAMESPACE_END |
|
318 |