|
1 /* |
|
2 ********************************************************************** |
|
3 * Copyright (C) 2008-2013, International Business Machines |
|
4 * Corporation and others. All Rights Reserved. |
|
5 ********************************************************************** |
|
6 */ |
|
7 |
|
8 #include "unicode/utypes.h" |
|
9 #include "unicode/uspoof.h" |
|
10 #include "unicode/uchar.h" |
|
11 #include "unicode/uniset.h" |
|
12 #include "unicode/utf16.h" |
|
13 #include "utrie2.h" |
|
14 #include "cmemory.h" |
|
15 #include "cstring.h" |
|
16 #include "identifier_info.h" |
|
17 #include "scriptset.h" |
|
18 #include "udatamem.h" |
|
19 #include "umutex.h" |
|
20 #include "udataswp.h" |
|
21 #include "uassert.h" |
|
22 #include "uspoof_impl.h" |
|
23 |
|
24 #if !UCONFIG_NO_NORMALIZATION |
|
25 |
|
26 |
|
27 U_NAMESPACE_BEGIN |
|
28 |
|
// Expands the ICU RTTI implementation macro for SpoofImpl.
// NOTE(review): presumably this supplies the class-ID accessors declared in the
// class header -- confirm against uobject.h.
29 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(SpoofImpl) |
|
30 |
|
31 SpoofImpl::SpoofImpl(SpoofData *data, UErrorCode &status) : |
|
32 fMagic(0), fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) , |
|
33 fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) { |
|
34 if (U_FAILURE(status)) { |
|
35 return; |
|
36 } |
|
37 fSpoofData = data; |
|
38 fRestrictionLevel = USPOOF_HIGHLY_RESTRICTIVE; |
|
39 |
|
40 UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff); |
|
41 allowedCharsSet->freeze(); |
|
42 fAllowedCharsSet = allowedCharsSet; |
|
43 fAllowedLocales = uprv_strdup(""); |
|
44 if (fAllowedCharsSet == NULL || fAllowedLocales == NULL) { |
|
45 status = U_MEMORY_ALLOCATION_ERROR; |
|
46 return; |
|
47 } |
|
48 fMagic = USPOOF_MAGIC; |
|
49 } |
|
50 |
|
51 |
|
52 SpoofImpl::SpoofImpl() : |
|
53 fMagic(USPOOF_MAGIC), fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) , |
|
54 fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) { |
|
55 UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff); |
|
56 allowedCharsSet->freeze(); |
|
57 fAllowedCharsSet = allowedCharsSet; |
|
58 fAllowedLocales = uprv_strdup(""); |
|
59 fRestrictionLevel = USPOOF_HIGHLY_RESTRICTIVE; |
|
60 } |
|
61 |
|
62 |
|
63 // Copy Constructor, used by the user level clone() function. |
|
64 SpoofImpl::SpoofImpl(const SpoofImpl &src, UErrorCode &status) : |
|
65 fMagic(0), fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) , |
|
66 fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) { |
|
67 if (U_FAILURE(status)) { |
|
68 return; |
|
69 } |
|
70 fMagic = src.fMagic; |
|
71 fChecks = src.fChecks; |
|
72 if (src.fSpoofData != NULL) { |
|
73 fSpoofData = src.fSpoofData->addReference(); |
|
74 } |
|
75 fAllowedCharsSet = static_cast<const UnicodeSet *>(src.fAllowedCharsSet->clone()); |
|
76 if (fAllowedCharsSet == NULL) { |
|
77 status = U_MEMORY_ALLOCATION_ERROR; |
|
78 } |
|
79 fAllowedLocales = uprv_strdup(src.fAllowedLocales); |
|
80 fRestrictionLevel = src.fRestrictionLevel; |
|
81 } |
|
82 |
|
83 SpoofImpl::~SpoofImpl() { |
|
84 fMagic = 0; // head off application errors by preventing use of |
|
85 // of deleted objects. |
|
86 if (fSpoofData != NULL) { |
|
87 fSpoofData->removeReference(); // Will delete if refCount goes to zero. |
|
88 } |
|
89 delete fAllowedCharsSet; |
|
90 uprv_free((void *)fAllowedLocales); |
|
91 delete fCachedIdentifierInfo; |
|
92 } |
|
93 |
|
94 // |
|
95 // Incoming parameter check on Status and the SpoofChecker object |
|
96 // received from the C API. |
|
97 // |
|
98 const SpoofImpl *SpoofImpl::validateThis(const USpoofChecker *sc, UErrorCode &status) { |
|
99 if (U_FAILURE(status)) { |
|
100 return NULL; |
|
101 } |
|
102 if (sc == NULL) { |
|
103 status = U_ILLEGAL_ARGUMENT_ERROR; |
|
104 return NULL; |
|
105 } |
|
106 SpoofImpl *This = (SpoofImpl *)sc; |
|
107 if (This->fMagic != USPOOF_MAGIC || |
|
108 This->fSpoofData == NULL) { |
|
109 status = U_INVALID_FORMAT_ERROR; |
|
110 return NULL; |
|
111 } |
|
112 if (!SpoofData::validateDataVersion(This->fSpoofData->fRawData, status)) { |
|
113 return NULL; |
|
114 } |
|
115 return This; |
|
116 } |
|
117 |
|
118 SpoofImpl *SpoofImpl::validateThis(USpoofChecker *sc, UErrorCode &status) { |
|
119 return const_cast<SpoofImpl *> |
|
120 (SpoofImpl::validateThis(const_cast<const USpoofChecker *>(sc), status)); |
|
121 } |
|
122 |
|
123 |
|
124 |
|
125 //-------------------------------------------------------------------------------------- |
|
126 // |
|
127 // confusableLookup() This is the heart of the confusable skeleton generation |
|
128 // implementation. |
|
129 // |
|
130 // Given a source character, produce the corresponding |
|
131 // replacement character(s), appending them to the dest string. |
|
132 // |
|
133 //--------------------------------------------------------------------------------------- |
|
134 int32_t SpoofImpl::confusableLookup(UChar32 inChar, int32_t tableMask, UnicodeString &dest) const { |
|
135 |
|
136 // Binary search the spoof data key table for the inChar |
|
137 int32_t *low = fSpoofData->fCFUKeys; |
|
138 int32_t *mid = NULL; |
|
139 int32_t *limit = low + fSpoofData->fRawData->fCFUKeysSize; |
|
140 UChar32 midc; |
|
141 do { |
|
142 int32_t delta = ((int32_t)(limit-low))/2; |
|
143 mid = low + delta; |
|
144 midc = *mid & 0x1fffff; |
|
145 if (inChar == midc) { |
|
146 goto foundChar; |
|
147 } else if (inChar < midc) { |
|
148 limit = mid; |
|
149 } else { |
|
150 low = mid; |
|
151 } |
|
152 } while (low < limit-1); |
|
153 mid = low; |
|
154 midc = *mid & 0x1fffff; |
|
155 if (inChar != midc) { |
|
156 // Char not found. It maps to itself. |
|
157 int i = 0; |
|
158 dest.append(inChar); |
|
159 return i; |
|
160 } |
|
161 foundChar: |
|
162 int32_t keyFlags = *mid & 0xff000000; |
|
163 if ((keyFlags & tableMask) == 0) { |
|
164 // We found the right key char, but the entry doesn't pertain to the |
|
165 // table we need. See if there is an adjacent key that does |
|
166 if (keyFlags & USPOOF_KEY_MULTIPLE_VALUES) { |
|
167 int32_t *altMid; |
|
168 for (altMid = mid-1; (*altMid&0x00ffffff) == inChar; altMid--) { |
|
169 keyFlags = *altMid & 0xff000000; |
|
170 if (keyFlags & tableMask) { |
|
171 mid = altMid; |
|
172 goto foundKey; |
|
173 } |
|
174 } |
|
175 for (altMid = mid+1; (*altMid&0x00ffffff) == inChar; altMid++) { |
|
176 keyFlags = *altMid & 0xff000000; |
|
177 if (keyFlags & tableMask) { |
|
178 mid = altMid; |
|
179 goto foundKey; |
|
180 } |
|
181 } |
|
182 } |
|
183 // No key entry for this char & table. |
|
184 // The input char maps to itself. |
|
185 int i = 0; |
|
186 dest.append(inChar); |
|
187 return i; |
|
188 } |
|
189 |
|
190 foundKey: |
|
191 int32_t stringLen = USPOOF_KEY_LENGTH_FIELD(keyFlags) + 1; |
|
192 int32_t keyTableIndex = (int32_t)(mid - fSpoofData->fCFUKeys); |
|
193 |
|
194 // Value is either a UChar (for strings of length 1) or |
|
195 // an index into the string table (for longer strings) |
|
196 uint16_t value = fSpoofData->fCFUValues[keyTableIndex]; |
|
197 if (stringLen == 1) { |
|
198 dest.append((UChar)value); |
|
199 return 1; |
|
200 } |
|
201 |
|
202 // String length of 4 from the above lookup is used for all strings of length >= 4. |
|
203 // For these, get the real length from the string lengths table, |
|
204 // which maps string table indexes to lengths. |
|
205 // All strings of the same length are stored contiguously in the string table. |
|
206 // 'value' from the lookup above is the starting index for the desired string. |
|
207 |
|
208 int32_t ix; |
|
209 if (stringLen == 4) { |
|
210 int32_t stringLengthsLimit = fSpoofData->fRawData->fCFUStringLengthsSize; |
|
211 for (ix = 0; ix < stringLengthsLimit; ix++) { |
|
212 if (fSpoofData->fCFUStringLengths[ix].fLastString >= value) { |
|
213 stringLen = fSpoofData->fCFUStringLengths[ix].fStrLength; |
|
214 break; |
|
215 } |
|
216 } |
|
217 U_ASSERT(ix < stringLengthsLimit); |
|
218 } |
|
219 |
|
220 U_ASSERT(value + stringLen <= fSpoofData->fRawData->fCFUStringTableLen); |
|
221 UChar *src = &fSpoofData->fCFUStrings[value]; |
|
222 dest.append(src, stringLen); |
|
223 return stringLen; |
|
224 } |
|
225 |
|
226 |
|
227 //--------------------------------------------------------------------------------------- |
|
228 // |
|
229 // wholeScriptCheck() |
|
230 // |
|
231 // Input text is already normalized to NFD |
|
232 // Return the set of scripts, each of which can represent something that is |
|
233 // confusable with the input text. The script of the input text |
|
234 // is included; input consisting of characters from a single script will |
|
235 // always produce a result consisting of a set containing that script. |
|
236 // |
|
237 //--------------------------------------------------------------------------------------- |
|
238 void SpoofImpl::wholeScriptCheck( |
|
239 const UnicodeString &text, ScriptSet *result, UErrorCode &status) const { |
|
240 |
|
241 UTrie2 *table = |
|
242 (fChecks & USPOOF_ANY_CASE) ? fSpoofData->fAnyCaseTrie : fSpoofData->fLowerCaseTrie; |
|
243 result->setAll(); |
|
244 int32_t length = text.length(); |
|
245 for (int32_t inputIdx=0; inputIdx < length;) { |
|
246 UChar32 c = text.char32At(inputIdx); |
|
247 inputIdx += U16_LENGTH(c); |
|
248 uint32_t index = utrie2_get32(table, c); |
|
249 if (index == 0) { |
|
250 // No confusables in another script for this char. |
|
251 // TODO: we should change the data to have sets with just the single script |
|
252 // bit for the script of this char. Gets rid of this special case. |
|
253 // Until then, grab the script from the char and intersect it with the set. |
|
254 UScriptCode cpScript = uscript_getScript(c, &status); |
|
255 U_ASSERT(cpScript > USCRIPT_INHERITED); |
|
256 result->intersect(cpScript, status); |
|
257 } else if (index == 1) { |
|
258 // Script == Common or Inherited. Nothing to do. |
|
259 } else { |
|
260 result->intersect(fSpoofData->fScriptSets[index]); |
|
261 } |
|
262 } |
|
263 } |
|
264 |
|
265 |
|
266 void SpoofImpl::setAllowedLocales(const char *localesList, UErrorCode &status) { |
|
267 UnicodeSet allowedChars; |
|
268 UnicodeSet *tmpSet = NULL; |
|
269 const char *locStart = localesList; |
|
270 const char *locEnd = NULL; |
|
271 const char *localesListEnd = localesList + uprv_strlen(localesList); |
|
272 int32_t localeListCount = 0; // Number of locales provided by caller. |
|
273 |
|
274 // Loop runs once per locale from the localesList, a comma separated list of locales. |
|
275 do { |
|
276 locEnd = uprv_strchr(locStart, ','); |
|
277 if (locEnd == NULL) { |
|
278 locEnd = localesListEnd; |
|
279 } |
|
280 while (*locStart == ' ') { |
|
281 locStart++; |
|
282 } |
|
283 const char *trimmedEnd = locEnd-1; |
|
284 while (trimmedEnd > locStart && *trimmedEnd == ' ') { |
|
285 trimmedEnd--; |
|
286 } |
|
287 if (trimmedEnd <= locStart) { |
|
288 break; |
|
289 } |
|
290 const char *locale = uprv_strndup(locStart, (int32_t)(trimmedEnd + 1 - locStart)); |
|
291 localeListCount++; |
|
292 |
|
293 // We have one locale from the locales list. |
|
294 // Add the script chars for this locale to the accumulating set of allowed chars. |
|
295 // If the locale is no good, we will be notified back via status. |
|
296 addScriptChars(locale, &allowedChars, status); |
|
297 uprv_free((void *)locale); |
|
298 if (U_FAILURE(status)) { |
|
299 break; |
|
300 } |
|
301 locStart = locEnd + 1; |
|
302 } while (locStart < localesListEnd); |
|
303 |
|
304 // If our caller provided an empty list of locales, we disable the allowed characters checking |
|
305 if (localeListCount == 0) { |
|
306 uprv_free((void *)fAllowedLocales); |
|
307 fAllowedLocales = uprv_strdup(""); |
|
308 tmpSet = new UnicodeSet(0, 0x10ffff); |
|
309 if (fAllowedLocales == NULL || tmpSet == NULL) { |
|
310 status = U_MEMORY_ALLOCATION_ERROR; |
|
311 return; |
|
312 } |
|
313 tmpSet->freeze(); |
|
314 delete fAllowedCharsSet; |
|
315 fAllowedCharsSet = tmpSet; |
|
316 fChecks &= ~USPOOF_CHAR_LIMIT; |
|
317 return; |
|
318 } |
|
319 |
|
320 |
|
321 // Add all common and inherited characters to the set of allowed chars. |
|
322 UnicodeSet tempSet; |
|
323 tempSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status); |
|
324 allowedChars.addAll(tempSet); |
|
325 tempSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status); |
|
326 allowedChars.addAll(tempSet); |
|
327 |
|
328 // If anything went wrong, we bail out without changing |
|
329 // the state of the spoof checker. |
|
330 if (U_FAILURE(status)) { |
|
331 return; |
|
332 } |
|
333 |
|
334 // Store the updated spoof checker state. |
|
335 tmpSet = static_cast<UnicodeSet *>(allowedChars.clone()); |
|
336 const char *tmpLocalesList = uprv_strdup(localesList); |
|
337 if (tmpSet == NULL || tmpLocalesList == NULL) { |
|
338 status = U_MEMORY_ALLOCATION_ERROR; |
|
339 return; |
|
340 } |
|
341 uprv_free((void *)fAllowedLocales); |
|
342 fAllowedLocales = tmpLocalesList; |
|
343 tmpSet->freeze(); |
|
344 delete fAllowedCharsSet; |
|
345 fAllowedCharsSet = tmpSet; |
|
346 fChecks |= USPOOF_CHAR_LIMIT; |
|
347 } |
|
348 |
|
349 |
|
350 const char * SpoofImpl::getAllowedLocales(UErrorCode &/*status*/) { |
|
351 return fAllowedLocales; |
|
352 } |
|
353 |
|
354 |
|
355 // Given a locale (a language), add all the characters from all of the scripts used with that language |
|
356 // to the allowedChars UnicodeSet |
|
357 |
|
358 void SpoofImpl::addScriptChars(const char *locale, UnicodeSet *allowedChars, UErrorCode &status) { |
|
359 UScriptCode scripts[30]; |
|
360 |
|
361 int32_t numScripts = uscript_getCode(locale, scripts, sizeof(scripts)/sizeof(UScriptCode), &status); |
|
362 if (U_FAILURE(status)) { |
|
363 return; |
|
364 } |
|
365 if (status == U_USING_DEFAULT_WARNING) { |
|
366 status = U_ILLEGAL_ARGUMENT_ERROR; |
|
367 return; |
|
368 } |
|
369 UnicodeSet tmpSet; |
|
370 int32_t i; |
|
371 for (i=0; i<numScripts; i++) { |
|
372 tmpSet.applyIntPropertyValue(UCHAR_SCRIPT, scripts[i], status); |
|
373 allowedChars->addAll(tmpSet); |
|
374 } |
|
375 } |
|
376 |
|
377 |
|
378 // Convert a text format hex number. Utility function used by builder code. Static. |
|
379 // Input: UChar *string text. Output: a UChar32 |
|
380 // Input has been pre-checked, and will have no non-hex chars. |
|
381 // The number must fall in the code point range of 0..0x10ffff |
|
382 // Static Function. |
|
383 UChar32 SpoofImpl::ScanHex(const UChar *s, int32_t start, int32_t limit, UErrorCode &status) { |
|
384 if (U_FAILURE(status)) { |
|
385 return 0; |
|
386 } |
|
387 U_ASSERT(limit-start > 0); |
|
388 uint32_t val = 0; |
|
389 int i; |
|
390 for (i=start; i<limit; i++) { |
|
391 int digitVal = s[i] - 0x30; |
|
392 if (digitVal>9) { |
|
393 digitVal = 0xa + (s[i] - 0x41); // Upper Case 'A' |
|
394 } |
|
395 if (digitVal>15) { |
|
396 digitVal = 0xa + (s[i] - 0x61); // Lower Case 'a' |
|
397 } |
|
398 U_ASSERT(digitVal <= 0xf); |
|
399 val <<= 4; |
|
400 val += digitVal; |
|
401 } |
|
402 if (val > 0x10ffff) { |
|
403 status = U_PARSE_ERROR; |
|
404 val = 0; |
|
405 } |
|
406 return (UChar32)val; |
|
407 } |
|
408 |
|
409 // IdentifierInfo Cache. IdentifierInfo objects are somewhat expensive to create. |
|
410 // Maintain a one-element cache, which is sufficient to avoid repeatedly |
|
411 // creating new ones unless we get multi-thread concurrency in spoof |
|
412 // check operations, which should be statistically uncommon. |
|
413 |
|
414 // These functions are used in place of new & delete of an IdentifierInfo. |
|
415 // They will recycle the IdentifierInfo when possible. |
|
416 // They are logically const, and used within const functions that must be thread safe. |
|
417 IdentifierInfo *SpoofImpl::getIdentifierInfo(UErrorCode &status) const { |
|
418 IdentifierInfo *returnIdInfo = NULL; |
|
419 if (U_FAILURE(status)) { |
|
420 return returnIdInfo; |
|
421 } |
|
422 SpoofImpl *nonConstThis = const_cast<SpoofImpl *>(this); |
|
423 { |
|
424 Mutex m; |
|
425 returnIdInfo = nonConstThis->fCachedIdentifierInfo; |
|
426 nonConstThis->fCachedIdentifierInfo = NULL; |
|
427 } |
|
428 if (returnIdInfo == NULL) { |
|
429 returnIdInfo = new IdentifierInfo(status); |
|
430 if (U_SUCCESS(status) && returnIdInfo == NULL) { |
|
431 status = U_MEMORY_ALLOCATION_ERROR; |
|
432 } |
|
433 if (U_FAILURE(status) && returnIdInfo != NULL) { |
|
434 delete returnIdInfo; |
|
435 returnIdInfo = NULL; |
|
436 } |
|
437 } |
|
438 return returnIdInfo; |
|
439 } |
|
440 |
|
441 |
|
442 void SpoofImpl::releaseIdentifierInfo(IdentifierInfo *idInfo) const { |
|
443 if (idInfo != NULL) { |
|
444 SpoofImpl *nonConstThis = const_cast<SpoofImpl *>(this); |
|
445 { |
|
446 Mutex m; |
|
447 if (nonConstThis->fCachedIdentifierInfo == NULL) { |
|
448 nonConstThis->fCachedIdentifierInfo = idInfo; |
|
449 idInfo = NULL; |
|
450 } |
|
451 } |
|
452 delete idInfo; |
|
453 } |
|
454 } |
|
455 |
|
456 |
|
457 |
|
458 |
|
459 //---------------------------------------------------------------------------------------------- |
|
460 // |
|
461 // class SpoofData Implementation |
|
462 // |
|
463 //---------------------------------------------------------------------------------------------- |
|
464 |
|
465 |
|
466 UBool SpoofData::validateDataVersion(const SpoofDataHeader *rawData, UErrorCode &status) { |
|
467 if (U_FAILURE(status) || |
|
468 rawData == NULL || |
|
469 rawData->fMagic != USPOOF_MAGIC || |
|
470 rawData->fFormatVersion[0] > 1 || |
|
471 rawData->fFormatVersion[1] > 0) { |
|
472 status = U_INVALID_FORMAT_ERROR; |
|
473 return FALSE; |
|
474 } |
|
475 return TRUE; |
|
476 } |
|
477 |
|
478 // |
|
479 // SpoofData::getDefault() - return a wrapper around the spoof data that is |
|
480 // baked into the default ICU data. |
|
481 // |
|
482 SpoofData *SpoofData::getDefault(UErrorCode &status) { |
|
483 // TODO: Cache it. Lazy create, keep until cleanup. |
|
484 |
|
485 UDataMemory *udm = udata_open(NULL, "cfu", "confusables", &status); |
|
486 if (U_FAILURE(status)) { |
|
487 return NULL; |
|
488 } |
|
489 SpoofData *This = new SpoofData(udm, status); |
|
490 if (U_FAILURE(status)) { |
|
491 delete This; |
|
492 return NULL; |
|
493 } |
|
494 if (This == NULL) { |
|
495 status = U_MEMORY_ALLOCATION_ERROR; |
|
496 } |
|
497 return This; |
|
498 } |
|
499 |
|
500 |
|
501 SpoofData::SpoofData(UDataMemory *udm, UErrorCode &status) |
|
502 { |
|
503 reset(); |
|
504 if (U_FAILURE(status)) { |
|
505 return; |
|
506 } |
|
507 fRawData = reinterpret_cast<SpoofDataHeader *> |
|
508 ((char *)(udm->pHeader) + udm->pHeader->dataHeader.headerSize); |
|
509 fUDM = udm; |
|
510 validateDataVersion(fRawData, status); |
|
511 initPtrs(status); |
|
512 } |
|
513 |
|
514 |
|
515 SpoofData::SpoofData(const void *data, int32_t length, UErrorCode &status) |
|
516 { |
|
517 reset(); |
|
518 if (U_FAILURE(status)) { |
|
519 return; |
|
520 } |
|
521 if ((size_t)length < sizeof(SpoofDataHeader)) { |
|
522 status = U_INVALID_FORMAT_ERROR; |
|
523 return; |
|
524 } |
|
525 void *ncData = const_cast<void *>(data); |
|
526 fRawData = static_cast<SpoofDataHeader *>(ncData); |
|
527 if (length < fRawData->fLength) { |
|
528 status = U_INVALID_FORMAT_ERROR; |
|
529 return; |
|
530 } |
|
531 validateDataVersion(fRawData, status); |
|
532 initPtrs(status); |
|
533 } |
|
534 |
|
535 |
|
536 // Spoof Data constructor for use from data builder. |
|
537 // Initializes a new, empty data area that will be populated later. |
|
538 SpoofData::SpoofData(UErrorCode &status) { |
|
539 reset(); |
|
540 if (U_FAILURE(status)) { |
|
541 return; |
|
542 } |
|
543 fDataOwned = true; |
|
544 fRefCount = 1; |
|
545 |
|
546 // The spoof header should already be sized to be a multiple of 16 bytes. |
|
547 // Just in case it's not, round it up. |
|
548 uint32_t initialSize = (sizeof(SpoofDataHeader) + 15) & ~15; |
|
549 U_ASSERT(initialSize == sizeof(SpoofDataHeader)); |
|
550 |
|
551 fRawData = static_cast<SpoofDataHeader *>(uprv_malloc(initialSize)); |
|
552 fMemLimit = initialSize; |
|
553 if (fRawData == NULL) { |
|
554 status = U_MEMORY_ALLOCATION_ERROR; |
|
555 return; |
|
556 } |
|
557 uprv_memset(fRawData, 0, initialSize); |
|
558 |
|
559 fRawData->fMagic = USPOOF_MAGIC; |
|
560 fRawData->fFormatVersion[0] = 1; |
|
561 fRawData->fFormatVersion[1] = 0; |
|
562 fRawData->fFormatVersion[2] = 0; |
|
563 fRawData->fFormatVersion[3] = 0; |
|
564 initPtrs(status); |
|
565 } |
|
566 |
|
567 // reset() - initialize all fields. |
|
568 // Should be updated if any new fields are added. |
|
569 // Called by constructors to put things in a known initial state. |
|
570 void SpoofData::reset() { |
|
571 fRawData = NULL; |
|
572 fDataOwned = FALSE; |
|
573 fUDM = NULL; |
|
574 fMemLimit = 0; |
|
575 fRefCount = 1; |
|
576 fCFUKeys = NULL; |
|
577 fCFUValues = NULL; |
|
578 fCFUStringLengths = NULL; |
|
579 fCFUStrings = NULL; |
|
580 fAnyCaseTrie = NULL; |
|
581 fLowerCaseTrie = NULL; |
|
582 fScriptSets = NULL; |
|
583 } |
|
584 |
|
585 |
|
586 // SpoofData::initPtrs() |
|
587 // Initialize the pointers to the various sections of the raw data. |
|
588 // |
|
589 // This function is used both during the Trie building process (multiple |
|
590 // times, as the individual data sections are added), and |
|
591 // during the opening of a Spoof Checker from prebuilt data. |
|
592 // |
|
593 // The pointers for non-existent data sections (identified by an offset of 0) |
|
594 // are set to NULL. |
|
595 // |
|
596 // Note: During building the data, adding each new data section |
|
597 // reallocs the raw data area, which likely relocates it, which |
|
598 // in turn requires reinitializing all of the pointers into it, hence |
|
599 // multiple calls to this function during building. |
|
600 // |
|
601 void SpoofData::initPtrs(UErrorCode &status) { |
|
602 fCFUKeys = NULL; |
|
603 fCFUValues = NULL; |
|
604 fCFUStringLengths = NULL; |
|
605 fCFUStrings = NULL; |
|
606 if (U_FAILURE(status)) { |
|
607 return; |
|
608 } |
|
609 if (fRawData->fCFUKeys != 0) { |
|
610 fCFUKeys = (int32_t *)((char *)fRawData + fRawData->fCFUKeys); |
|
611 } |
|
612 if (fRawData->fCFUStringIndex != 0) { |
|
613 fCFUValues = (uint16_t *)((char *)fRawData + fRawData->fCFUStringIndex); |
|
614 } |
|
615 if (fRawData->fCFUStringLengths != 0) { |
|
616 fCFUStringLengths = (SpoofStringLengthsElement *)((char *)fRawData + fRawData->fCFUStringLengths); |
|
617 } |
|
618 if (fRawData->fCFUStringTable != 0) { |
|
619 fCFUStrings = (UChar *)((char *)fRawData + fRawData->fCFUStringTable); |
|
620 } |
|
621 |
|
622 if (fAnyCaseTrie == NULL && fRawData->fAnyCaseTrie != 0) { |
|
623 fAnyCaseTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS, |
|
624 (char *)fRawData + fRawData->fAnyCaseTrie, fRawData->fAnyCaseTrieLength, NULL, &status); |
|
625 } |
|
626 if (fLowerCaseTrie == NULL && fRawData->fLowerCaseTrie != 0) { |
|
627 fLowerCaseTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS, |
|
628 (char *)fRawData + fRawData->fLowerCaseTrie, fRawData->fLowerCaseTrieLength, NULL, &status); |
|
629 } |
|
630 |
|
631 if (fRawData->fScriptSets != 0) { |
|
632 fScriptSets = (ScriptSet *)((char *)fRawData + fRawData->fScriptSets); |
|
633 } |
|
634 } |
|
635 |
|
636 |
|
637 SpoofData::~SpoofData() { |
|
638 utrie2_close(fAnyCaseTrie); |
|
639 fAnyCaseTrie = NULL; |
|
640 utrie2_close(fLowerCaseTrie); |
|
641 fLowerCaseTrie = NULL; |
|
642 if (fDataOwned) { |
|
643 uprv_free(fRawData); |
|
644 } |
|
645 fRawData = NULL; |
|
646 if (fUDM != NULL) { |
|
647 udata_close(fUDM); |
|
648 } |
|
649 fUDM = NULL; |
|
650 } |
|
651 |
|
652 |
|
653 void SpoofData::removeReference() { |
|
654 if (umtx_atomic_dec(&fRefCount) == 0) { |
|
655 delete this; |
|
656 } |
|
657 } |
|
658 |
|
659 |
|
660 SpoofData *SpoofData::addReference() { |
|
661 umtx_atomic_inc(&fRefCount); |
|
662 return this; |
|
663 } |
|
664 |
|
665 |
|
666 void *SpoofData::reserveSpace(int32_t numBytes, UErrorCode &status) { |
|
667 if (U_FAILURE(status)) { |
|
668 return NULL; |
|
669 } |
|
670 if (!fDataOwned) { |
|
671 U_ASSERT(FALSE); |
|
672 status = U_INTERNAL_PROGRAM_ERROR; |
|
673 return NULL; |
|
674 } |
|
675 |
|
676 numBytes = (numBytes + 15) & ~15; // Round up to a multiple of 16 |
|
677 uint32_t returnOffset = fMemLimit; |
|
678 fMemLimit += numBytes; |
|
679 fRawData = static_cast<SpoofDataHeader *>(uprv_realloc(fRawData, fMemLimit)); |
|
680 fRawData->fLength = fMemLimit; |
|
681 uprv_memset((char *)fRawData + returnOffset, 0, numBytes); |
|
682 initPtrs(status); |
|
683 return (char *)fRawData + returnOffset; |
|
684 } |
|
685 |
|
686 |
|
687 U_NAMESPACE_END |
|
688 |
|
689 U_NAMESPACE_USE |
|
690 |
|
691 //----------------------------------------------------------------------------- |
|
692 // |
|
693 // uspoof_swap - byte swap and char encoding swap of spoof data |
|
694 // |
|
695 //----------------------------------------------------------------------------- |
|
696 U_CAPI int32_t U_EXPORT2 |
|
697 uspoof_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData, |
|
698 UErrorCode *status) { |
|
699 |
|
700 if (status == NULL || U_FAILURE(*status)) { |
|
701 return 0; |
|
702 } |
|
703 if(ds==NULL || inData==NULL || length<-1 || (length>0 && outData==NULL)) { |
|
704 *status=U_ILLEGAL_ARGUMENT_ERROR; |
|
705 return 0; |
|
706 } |
|
707 |
|
708 // |
|
709 // Check that the data header is for spoof data. |
|
710 // (Header contents are defined in gencfu.cpp) |
|
711 // |
|
712 const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData+4); |
|
713 if(!( pInfo->dataFormat[0]==0x43 && /* dataFormat="Cfu " */ |
|
714 pInfo->dataFormat[1]==0x66 && |
|
715 pInfo->dataFormat[2]==0x75 && |
|
716 pInfo->dataFormat[3]==0x20 && |
|
717 pInfo->formatVersion[0]==1 )) { |
|
718 udata_printError(ds, "uspoof_swap(): data format %02x.%02x.%02x.%02x " |
|
719 "(format version %02x %02x %02x %02x) is not recognized\n", |
|
720 pInfo->dataFormat[0], pInfo->dataFormat[1], |
|
721 pInfo->dataFormat[2], pInfo->dataFormat[3], |
|
722 pInfo->formatVersion[0], pInfo->formatVersion[1], |
|
723 pInfo->formatVersion[2], pInfo->formatVersion[3]); |
|
724 *status=U_UNSUPPORTED_ERROR; |
|
725 return 0; |
|
726 } |
|
727 |
|
728 // |
|
729 // Swap the data header. (This is the generic ICU Data Header, not the uspoof Specific |
|
730 // header). This swap also conveniently gets us |
|
731 // the size of the ICU d.h., which lets us locate the start |
|
732 // of the uspoof specific data. |
|
733 // |
|
734 int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status); |
|
735 |
|
736 |
|
737 // |
|
738 // Get the Spoof Data Header, and check that it appears to be OK. |
|
739 // |
|
740 // |
|
741 const uint8_t *inBytes =(const uint8_t *)inData+headerSize; |
|
742 SpoofDataHeader *spoofDH = (SpoofDataHeader *)inBytes; |
|
743 if (ds->readUInt32(spoofDH->fMagic) != USPOOF_MAGIC || |
|
744 ds->readUInt32(spoofDH->fLength) < sizeof(SpoofDataHeader)) |
|
745 { |
|
746 udata_printError(ds, "uspoof_swap(): Spoof Data header is invalid.\n"); |
|
747 *status=U_UNSUPPORTED_ERROR; |
|
748 return 0; |
|
749 } |
|
750 |
|
751 // |
|
752 // Prefight operation? Just return the size |
|
753 // |
|
754 int32_t spoofDataLength = ds->readUInt32(spoofDH->fLength); |
|
755 int32_t totalSize = headerSize + spoofDataLength; |
|
756 if (length < 0) { |
|
757 return totalSize; |
|
758 } |
|
759 |
|
760 // |
|
761 // Check that length passed in is consistent with length from Spoof data header. |
|
762 // |
|
763 if (length < totalSize) { |
|
764 udata_printError(ds, "uspoof_swap(): too few bytes (%d after ICU Data header) for spoof data.\n", |
|
765 spoofDataLength); |
|
766 *status=U_INDEX_OUTOFBOUNDS_ERROR; |
|
767 return 0; |
|
768 } |
|
769 |
|
770 |
|
771 // |
|
772 // Swap the Data. Do the data itself first, then the Spoof Data Header, because |
|
773 // we need to reference the header to locate the data, and an |
|
774 // inplace swap of the header leaves it unusable. |
|
775 // |
|
776 uint8_t *outBytes = (uint8_t *)outData + headerSize; |
|
777 SpoofDataHeader *outputDH = (SpoofDataHeader *)outBytes; |
|
778 |
|
779 int32_t sectionStart; |
|
780 int32_t sectionLength; |
|
781 |
|
782 // |
|
783 // If not swapping in place, zero out the output buffer before starting. |
|
784 // Gaps may exist between the individual sections, and these must be zeroed in |
|
785 // the output buffer. The simplest way to do that is to just zero the whole thing. |
|
786 // |
|
787 if (inBytes != outBytes) { |
|
788 uprv_memset(outBytes, 0, spoofDataLength); |
|
789 } |
|
790 |
|
791 // Confusables Keys Section (fCFUKeys) |
|
792 sectionStart = ds->readUInt32(spoofDH->fCFUKeys); |
|
793 sectionLength = ds->readUInt32(spoofDH->fCFUKeysSize) * 4; |
|
794 ds->swapArray32(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status); |
|
795 |
|
796 // String Index Section |
|
797 sectionStart = ds->readUInt32(spoofDH->fCFUStringIndex); |
|
798 sectionLength = ds->readUInt32(spoofDH->fCFUStringIndexSize) * 2; |
|
799 ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status); |
|
800 |
|
801 // String Table Section |
|
802 sectionStart = ds->readUInt32(spoofDH->fCFUStringTable); |
|
803 sectionLength = ds->readUInt32(spoofDH->fCFUStringTableLen) * 2; |
|
804 ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status); |
|
805 |
|
806 // String Lengths Section |
|
807 sectionStart = ds->readUInt32(spoofDH->fCFUStringLengths); |
|
808 sectionLength = ds->readUInt32(spoofDH->fCFUStringLengthsSize) * 4; |
|
809 ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status); |
|
810 |
|
811 // Any Case Trie |
|
812 sectionStart = ds->readUInt32(spoofDH->fAnyCaseTrie); |
|
813 sectionLength = ds->readUInt32(spoofDH->fAnyCaseTrieLength); |
|
814 utrie2_swap(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status); |
|
815 |
|
816 // Lower Case Trie |
|
817 sectionStart = ds->readUInt32(spoofDH->fLowerCaseTrie); |
|
818 sectionLength = ds->readUInt32(spoofDH->fLowerCaseTrieLength); |
|
819 utrie2_swap(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status); |
|
820 |
|
821 // Script Sets. The data is an array of int32_t |
|
822 sectionStart = ds->readUInt32(spoofDH->fScriptSets); |
|
823 sectionLength = ds->readUInt32(spoofDH->fScriptSetsLength) * sizeof(ScriptSet); |
|
824 ds->swapArray32(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status); |
|
825 |
|
826 // And, last, swap the header itself. |
|
827 // int32_t fMagic // swap this |
|
828 // uint8_t fFormatVersion[4] // Do not swap this, just copy |
|
829 // int32_t fLength and all the rest // Swap the rest, all is 32 bit stuff. |
|
830 // |
|
831 uint32_t magic = ds->readUInt32(spoofDH->fMagic); |
|
832 ds->writeUInt32((uint32_t *)&outputDH->fMagic, magic); |
|
833 |
|
834 if (outputDH->fFormatVersion != spoofDH->fFormatVersion) { |
|
835 uprv_memcpy(outputDH->fFormatVersion, spoofDH->fFormatVersion, sizeof(spoofDH->fFormatVersion)); |
|
836 } |
|
837 // swap starting at fLength |
|
838 ds->swapArray32(ds, &spoofDH->fLength, sizeof(SpoofDataHeader)-8 /* minus magic and fFormatVersion[4] */, &outputDH->fLength, status); |
|
839 |
|
840 return totalSize; |
|
841 } |
|
842 |
|
843 #endif |
|
844 |
|
845 |