intl/icu/source/i18n/uspoof_impl.cpp

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

michael@0 1 /*
michael@0 2 **********************************************************************
michael@0 3 * Copyright (C) 2008-2013, International Business Machines
michael@0 4 * Corporation and others. All Rights Reserved.
michael@0 5 **********************************************************************
michael@0 6 */
michael@0 7
michael@0 8 #include "unicode/utypes.h"
michael@0 9 #include "unicode/uspoof.h"
michael@0 10 #include "unicode/uchar.h"
michael@0 11 #include "unicode/uniset.h"
michael@0 12 #include "unicode/utf16.h"
michael@0 13 #include "utrie2.h"
michael@0 14 #include "cmemory.h"
michael@0 15 #include "cstring.h"
michael@0 16 #include "identifier_info.h"
michael@0 17 #include "scriptset.h"
michael@0 18 #include "udatamem.h"
michael@0 19 #include "umutex.h"
michael@0 20 #include "udataswp.h"
michael@0 21 #include "uassert.h"
michael@0 22 #include "uspoof_impl.h"
michael@0 23
michael@0 24 #if !UCONFIG_NO_NORMALIZATION
michael@0 25
michael@0 26
michael@0 27 U_NAMESPACE_BEGIN
michael@0 28
michael@0 29 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(SpoofImpl)
michael@0 30
michael@0 31 SpoofImpl::SpoofImpl(SpoofData *data, UErrorCode &status) :
michael@0 32 fMagic(0), fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) ,
michael@0 33 fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) {
michael@0 34 if (U_FAILURE(status)) {
michael@0 35 return;
michael@0 36 }
michael@0 37 fSpoofData = data;
michael@0 38 fRestrictionLevel = USPOOF_HIGHLY_RESTRICTIVE;
michael@0 39
michael@0 40 UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff);
michael@0 41 allowedCharsSet->freeze();
michael@0 42 fAllowedCharsSet = allowedCharsSet;
michael@0 43 fAllowedLocales = uprv_strdup("");
michael@0 44 if (fAllowedCharsSet == NULL || fAllowedLocales == NULL) {
michael@0 45 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 46 return;
michael@0 47 }
michael@0 48 fMagic = USPOOF_MAGIC;
michael@0 49 }
michael@0 50
michael@0 51
michael@0 52 SpoofImpl::SpoofImpl() :
michael@0 53 fMagic(USPOOF_MAGIC), fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) ,
michael@0 54 fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) {
michael@0 55 UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff);
michael@0 56 allowedCharsSet->freeze();
michael@0 57 fAllowedCharsSet = allowedCharsSet;
michael@0 58 fAllowedLocales = uprv_strdup("");
michael@0 59 fRestrictionLevel = USPOOF_HIGHLY_RESTRICTIVE;
michael@0 60 }
michael@0 61
michael@0 62
michael@0 63 // Copy Constructor, used by the user level clone() function.
michael@0 64 SpoofImpl::SpoofImpl(const SpoofImpl &src, UErrorCode &status) :
michael@0 65 fMagic(0), fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) ,
michael@0 66 fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) {
michael@0 67 if (U_FAILURE(status)) {
michael@0 68 return;
michael@0 69 }
michael@0 70 fMagic = src.fMagic;
michael@0 71 fChecks = src.fChecks;
michael@0 72 if (src.fSpoofData != NULL) {
michael@0 73 fSpoofData = src.fSpoofData->addReference();
michael@0 74 }
michael@0 75 fAllowedCharsSet = static_cast<const UnicodeSet *>(src.fAllowedCharsSet->clone());
michael@0 76 if (fAllowedCharsSet == NULL) {
michael@0 77 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 78 }
michael@0 79 fAllowedLocales = uprv_strdup(src.fAllowedLocales);
michael@0 80 fRestrictionLevel = src.fRestrictionLevel;
michael@0 81 }
michael@0 82
michael@0 83 SpoofImpl::~SpoofImpl() {
michael@0 84 fMagic = 0; // head off application errors by preventing use of
michael@0 85 // of deleted objects.
michael@0 86 if (fSpoofData != NULL) {
michael@0 87 fSpoofData->removeReference(); // Will delete if refCount goes to zero.
michael@0 88 }
michael@0 89 delete fAllowedCharsSet;
michael@0 90 uprv_free((void *)fAllowedLocales);
michael@0 91 delete fCachedIdentifierInfo;
michael@0 92 }
michael@0 93
michael@0 94 //
michael@0 95 // Incoming parameter check on Status and the SpoofChecker object
michael@0 96 // received from the C API.
michael@0 97 //
michael@0 98 const SpoofImpl *SpoofImpl::validateThis(const USpoofChecker *sc, UErrorCode &status) {
michael@0 99 if (U_FAILURE(status)) {
michael@0 100 return NULL;
michael@0 101 }
michael@0 102 if (sc == NULL) {
michael@0 103 status = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 104 return NULL;
michael@0 105 }
michael@0 106 SpoofImpl *This = (SpoofImpl *)sc;
michael@0 107 if (This->fMagic != USPOOF_MAGIC ||
michael@0 108 This->fSpoofData == NULL) {
michael@0 109 status = U_INVALID_FORMAT_ERROR;
michael@0 110 return NULL;
michael@0 111 }
michael@0 112 if (!SpoofData::validateDataVersion(This->fSpoofData->fRawData, status)) {
michael@0 113 return NULL;
michael@0 114 }
michael@0 115 return This;
michael@0 116 }
michael@0 117
michael@0 118 SpoofImpl *SpoofImpl::validateThis(USpoofChecker *sc, UErrorCode &status) {
michael@0 119 return const_cast<SpoofImpl *>
michael@0 120 (SpoofImpl::validateThis(const_cast<const USpoofChecker *>(sc), status));
michael@0 121 }
michael@0 122
michael@0 123
michael@0 124
michael@0 125 //--------------------------------------------------------------------------------------
michael@0 126 //
michael@0 127 // confusableLookup() This is the heart of the confusable skeleton generation
michael@0 128 // implementation.
michael@0 129 //
michael@0 130 // Given a source character, produce the corresponding
michael@0 131 // replacement character(s), appending them to the dest string.
michael@0 132 //
michael@0 133 //---------------------------------------------------------------------------------------
michael@0 134 int32_t SpoofImpl::confusableLookup(UChar32 inChar, int32_t tableMask, UnicodeString &dest) const {
michael@0 135
michael@0 136 // Binary search the spoof data key table for the inChar
michael@0 137 int32_t *low = fSpoofData->fCFUKeys;
michael@0 138 int32_t *mid = NULL;
michael@0 139 int32_t *limit = low + fSpoofData->fRawData->fCFUKeysSize;
michael@0 140 UChar32 midc;
michael@0 141 do {
michael@0 142 int32_t delta = ((int32_t)(limit-low))/2;
michael@0 143 mid = low + delta;
michael@0 144 midc = *mid & 0x1fffff;
michael@0 145 if (inChar == midc) {
michael@0 146 goto foundChar;
michael@0 147 } else if (inChar < midc) {
michael@0 148 limit = mid;
michael@0 149 } else {
michael@0 150 low = mid;
michael@0 151 }
michael@0 152 } while (low < limit-1);
michael@0 153 mid = low;
michael@0 154 midc = *mid & 0x1fffff;
michael@0 155 if (inChar != midc) {
michael@0 156 // Char not found. It maps to itself.
michael@0 157 int i = 0;
michael@0 158 dest.append(inChar);
michael@0 159 return i;
michael@0 160 }
michael@0 161 foundChar:
michael@0 162 int32_t keyFlags = *mid & 0xff000000;
michael@0 163 if ((keyFlags & tableMask) == 0) {
michael@0 164 // We found the right key char, but the entry doesn't pertain to the
michael@0 165 // table we need. See if there is an adjacent key that does
michael@0 166 if (keyFlags & USPOOF_KEY_MULTIPLE_VALUES) {
michael@0 167 int32_t *altMid;
michael@0 168 for (altMid = mid-1; (*altMid&0x00ffffff) == inChar; altMid--) {
michael@0 169 keyFlags = *altMid & 0xff000000;
michael@0 170 if (keyFlags & tableMask) {
michael@0 171 mid = altMid;
michael@0 172 goto foundKey;
michael@0 173 }
michael@0 174 }
michael@0 175 for (altMid = mid+1; (*altMid&0x00ffffff) == inChar; altMid++) {
michael@0 176 keyFlags = *altMid & 0xff000000;
michael@0 177 if (keyFlags & tableMask) {
michael@0 178 mid = altMid;
michael@0 179 goto foundKey;
michael@0 180 }
michael@0 181 }
michael@0 182 }
michael@0 183 // No key entry for this char & table.
michael@0 184 // The input char maps to itself.
michael@0 185 int i = 0;
michael@0 186 dest.append(inChar);
michael@0 187 return i;
michael@0 188 }
michael@0 189
michael@0 190 foundKey:
michael@0 191 int32_t stringLen = USPOOF_KEY_LENGTH_FIELD(keyFlags) + 1;
michael@0 192 int32_t keyTableIndex = (int32_t)(mid - fSpoofData->fCFUKeys);
michael@0 193
michael@0 194 // Value is either a UChar (for strings of length 1) or
michael@0 195 // an index into the string table (for longer strings)
michael@0 196 uint16_t value = fSpoofData->fCFUValues[keyTableIndex];
michael@0 197 if (stringLen == 1) {
michael@0 198 dest.append((UChar)value);
michael@0 199 return 1;
michael@0 200 }
michael@0 201
michael@0 202 // String length of 4 from the above lookup is used for all strings of length >= 4.
michael@0 203 // For these, get the real length from the string lengths table,
michael@0 204 // which maps string table indexes to lengths.
michael@0 205 // All strings of the same length are stored contiguously in the string table.
michael@0 206 // 'value' from the lookup above is the starting index for the desired string.
michael@0 207
michael@0 208 int32_t ix;
michael@0 209 if (stringLen == 4) {
michael@0 210 int32_t stringLengthsLimit = fSpoofData->fRawData->fCFUStringLengthsSize;
michael@0 211 for (ix = 0; ix < stringLengthsLimit; ix++) {
michael@0 212 if (fSpoofData->fCFUStringLengths[ix].fLastString >= value) {
michael@0 213 stringLen = fSpoofData->fCFUStringLengths[ix].fStrLength;
michael@0 214 break;
michael@0 215 }
michael@0 216 }
michael@0 217 U_ASSERT(ix < stringLengthsLimit);
michael@0 218 }
michael@0 219
michael@0 220 U_ASSERT(value + stringLen <= fSpoofData->fRawData->fCFUStringTableLen);
michael@0 221 UChar *src = &fSpoofData->fCFUStrings[value];
michael@0 222 dest.append(src, stringLen);
michael@0 223 return stringLen;
michael@0 224 }
michael@0 225
michael@0 226
michael@0 227 //---------------------------------------------------------------------------------------
michael@0 228 //
michael@0 229 // wholeScriptCheck()
michael@0 230 //
michael@0 231 // Input text is already normalized to NFD
michael@0 232 // Return the set of scripts, each of which can represent something that is
michael@0 233 // confusable with the input text. The script of the input text
michael@0 234 // is included; input consisting of characters from a single script will
michael@0 235 // always produce a result consisting of a set containing that script.
michael@0 236 //
michael@0 237 //---------------------------------------------------------------------------------------
michael@0 238 void SpoofImpl::wholeScriptCheck(
michael@0 239 const UnicodeString &text, ScriptSet *result, UErrorCode &status) const {
michael@0 240
michael@0 241 UTrie2 *table =
michael@0 242 (fChecks & USPOOF_ANY_CASE) ? fSpoofData->fAnyCaseTrie : fSpoofData->fLowerCaseTrie;
michael@0 243 result->setAll();
michael@0 244 int32_t length = text.length();
michael@0 245 for (int32_t inputIdx=0; inputIdx < length;) {
michael@0 246 UChar32 c = text.char32At(inputIdx);
michael@0 247 inputIdx += U16_LENGTH(c);
michael@0 248 uint32_t index = utrie2_get32(table, c);
michael@0 249 if (index == 0) {
michael@0 250 // No confusables in another script for this char.
michael@0 251 // TODO: we should change the data to have sets with just the single script
michael@0 252 // bit for the script of this char. Gets rid of this special case.
michael@0 253 // Until then, grab the script from the char and intersect it with the set.
michael@0 254 UScriptCode cpScript = uscript_getScript(c, &status);
michael@0 255 U_ASSERT(cpScript > USCRIPT_INHERITED);
michael@0 256 result->intersect(cpScript, status);
michael@0 257 } else if (index == 1) {
michael@0 258 // Script == Common or Inherited. Nothing to do.
michael@0 259 } else {
michael@0 260 result->intersect(fSpoofData->fScriptSets[index]);
michael@0 261 }
michael@0 262 }
michael@0 263 }
michael@0 264
michael@0 265
michael@0 266 void SpoofImpl::setAllowedLocales(const char *localesList, UErrorCode &status) {
michael@0 267 UnicodeSet allowedChars;
michael@0 268 UnicodeSet *tmpSet = NULL;
michael@0 269 const char *locStart = localesList;
michael@0 270 const char *locEnd = NULL;
michael@0 271 const char *localesListEnd = localesList + uprv_strlen(localesList);
michael@0 272 int32_t localeListCount = 0; // Number of locales provided by caller.
michael@0 273
michael@0 274 // Loop runs once per locale from the localesList, a comma separated list of locales.
michael@0 275 do {
michael@0 276 locEnd = uprv_strchr(locStart, ',');
michael@0 277 if (locEnd == NULL) {
michael@0 278 locEnd = localesListEnd;
michael@0 279 }
michael@0 280 while (*locStart == ' ') {
michael@0 281 locStart++;
michael@0 282 }
michael@0 283 const char *trimmedEnd = locEnd-1;
michael@0 284 while (trimmedEnd > locStart && *trimmedEnd == ' ') {
michael@0 285 trimmedEnd--;
michael@0 286 }
michael@0 287 if (trimmedEnd <= locStart) {
michael@0 288 break;
michael@0 289 }
michael@0 290 const char *locale = uprv_strndup(locStart, (int32_t)(trimmedEnd + 1 - locStart));
michael@0 291 localeListCount++;
michael@0 292
michael@0 293 // We have one locale from the locales list.
michael@0 294 // Add the script chars for this locale to the accumulating set of allowed chars.
michael@0 295 // If the locale is no good, we will be notified back via status.
michael@0 296 addScriptChars(locale, &allowedChars, status);
michael@0 297 uprv_free((void *)locale);
michael@0 298 if (U_FAILURE(status)) {
michael@0 299 break;
michael@0 300 }
michael@0 301 locStart = locEnd + 1;
michael@0 302 } while (locStart < localesListEnd);
michael@0 303
michael@0 304 // If our caller provided an empty list of locales, we disable the allowed characters checking
michael@0 305 if (localeListCount == 0) {
michael@0 306 uprv_free((void *)fAllowedLocales);
michael@0 307 fAllowedLocales = uprv_strdup("");
michael@0 308 tmpSet = new UnicodeSet(0, 0x10ffff);
michael@0 309 if (fAllowedLocales == NULL || tmpSet == NULL) {
michael@0 310 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 311 return;
michael@0 312 }
michael@0 313 tmpSet->freeze();
michael@0 314 delete fAllowedCharsSet;
michael@0 315 fAllowedCharsSet = tmpSet;
michael@0 316 fChecks &= ~USPOOF_CHAR_LIMIT;
michael@0 317 return;
michael@0 318 }
michael@0 319
michael@0 320
michael@0 321 // Add all common and inherited characters to the set of allowed chars.
michael@0 322 UnicodeSet tempSet;
michael@0 323 tempSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status);
michael@0 324 allowedChars.addAll(tempSet);
michael@0 325 tempSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status);
michael@0 326 allowedChars.addAll(tempSet);
michael@0 327
michael@0 328 // If anything went wrong, we bail out without changing
michael@0 329 // the state of the spoof checker.
michael@0 330 if (U_FAILURE(status)) {
michael@0 331 return;
michael@0 332 }
michael@0 333
michael@0 334 // Store the updated spoof checker state.
michael@0 335 tmpSet = static_cast<UnicodeSet *>(allowedChars.clone());
michael@0 336 const char *tmpLocalesList = uprv_strdup(localesList);
michael@0 337 if (tmpSet == NULL || tmpLocalesList == NULL) {
michael@0 338 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 339 return;
michael@0 340 }
michael@0 341 uprv_free((void *)fAllowedLocales);
michael@0 342 fAllowedLocales = tmpLocalesList;
michael@0 343 tmpSet->freeze();
michael@0 344 delete fAllowedCharsSet;
michael@0 345 fAllowedCharsSet = tmpSet;
michael@0 346 fChecks |= USPOOF_CHAR_LIMIT;
michael@0 347 }
michael@0 348
michael@0 349
michael@0 350 const char * SpoofImpl::getAllowedLocales(UErrorCode &/*status*/) {
michael@0 351 return fAllowedLocales;
michael@0 352 }
michael@0 353
michael@0 354
michael@0 355 // Given a locale (a language), add all the characters from all of the scripts used with that language
michael@0 356 // to the allowedChars UnicodeSet
michael@0 357
michael@0 358 void SpoofImpl::addScriptChars(const char *locale, UnicodeSet *allowedChars, UErrorCode &status) {
michael@0 359 UScriptCode scripts[30];
michael@0 360
michael@0 361 int32_t numScripts = uscript_getCode(locale, scripts, sizeof(scripts)/sizeof(UScriptCode), &status);
michael@0 362 if (U_FAILURE(status)) {
michael@0 363 return;
michael@0 364 }
michael@0 365 if (status == U_USING_DEFAULT_WARNING) {
michael@0 366 status = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 367 return;
michael@0 368 }
michael@0 369 UnicodeSet tmpSet;
michael@0 370 int32_t i;
michael@0 371 for (i=0; i<numScripts; i++) {
michael@0 372 tmpSet.applyIntPropertyValue(UCHAR_SCRIPT, scripts[i], status);
michael@0 373 allowedChars->addAll(tmpSet);
michael@0 374 }
michael@0 375 }
michael@0 376
michael@0 377
michael@0 378 // Convert a text format hex number. Utility function used by builder code. Static.
michael@0 379 // Input: UChar *string text. Output: a UChar32
michael@0 380 // Input has been pre-checked, and will have no non-hex chars.
michael@0 381 // The number must fall in the code point range of 0..0x10ffff
michael@0 382 // Static Function.
michael@0 383 UChar32 SpoofImpl::ScanHex(const UChar *s, int32_t start, int32_t limit, UErrorCode &status) {
michael@0 384 if (U_FAILURE(status)) {
michael@0 385 return 0;
michael@0 386 }
michael@0 387 U_ASSERT(limit-start > 0);
michael@0 388 uint32_t val = 0;
michael@0 389 int i;
michael@0 390 for (i=start; i<limit; i++) {
michael@0 391 int digitVal = s[i] - 0x30;
michael@0 392 if (digitVal>9) {
michael@0 393 digitVal = 0xa + (s[i] - 0x41); // Upper Case 'A'
michael@0 394 }
michael@0 395 if (digitVal>15) {
michael@0 396 digitVal = 0xa + (s[i] - 0x61); // Lower Case 'a'
michael@0 397 }
michael@0 398 U_ASSERT(digitVal <= 0xf);
michael@0 399 val <<= 4;
michael@0 400 val += digitVal;
michael@0 401 }
michael@0 402 if (val > 0x10ffff) {
michael@0 403 status = U_PARSE_ERROR;
michael@0 404 val = 0;
michael@0 405 }
michael@0 406 return (UChar32)val;
michael@0 407 }
michael@0 408
michael@0 409 // IdentifierInfo Cache. IdentifierInfo objects are somewhat expensive to create.
michael@0 410 // Maintain a one-element cache, which is sufficient to avoid repeatedly
michael@0 411 // creating new ones unless we get multi-thread concurrency in spoof
michael@0 412 // check operations, which should be statistically uncommon.
michael@0 413
michael@0 414 // These functions are used in place of new & delete of an IdentifierInfo.
michael@0 415 // They will recycle the IdentifierInfo when possible.
michael@0 416 // They are logically const, and used within const functions that must be thread safe.
michael@0 417 IdentifierInfo *SpoofImpl::getIdentifierInfo(UErrorCode &status) const {
michael@0 418 IdentifierInfo *returnIdInfo = NULL;
michael@0 419 if (U_FAILURE(status)) {
michael@0 420 return returnIdInfo;
michael@0 421 }
michael@0 422 SpoofImpl *nonConstThis = const_cast<SpoofImpl *>(this);
michael@0 423 {
michael@0 424 Mutex m;
michael@0 425 returnIdInfo = nonConstThis->fCachedIdentifierInfo;
michael@0 426 nonConstThis->fCachedIdentifierInfo = NULL;
michael@0 427 }
michael@0 428 if (returnIdInfo == NULL) {
michael@0 429 returnIdInfo = new IdentifierInfo(status);
michael@0 430 if (U_SUCCESS(status) && returnIdInfo == NULL) {
michael@0 431 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 432 }
michael@0 433 if (U_FAILURE(status) && returnIdInfo != NULL) {
michael@0 434 delete returnIdInfo;
michael@0 435 returnIdInfo = NULL;
michael@0 436 }
michael@0 437 }
michael@0 438 return returnIdInfo;
michael@0 439 }
michael@0 440
michael@0 441
michael@0 442 void SpoofImpl::releaseIdentifierInfo(IdentifierInfo *idInfo) const {
michael@0 443 if (idInfo != NULL) {
michael@0 444 SpoofImpl *nonConstThis = const_cast<SpoofImpl *>(this);
michael@0 445 {
michael@0 446 Mutex m;
michael@0 447 if (nonConstThis->fCachedIdentifierInfo == NULL) {
michael@0 448 nonConstThis->fCachedIdentifierInfo = idInfo;
michael@0 449 idInfo = NULL;
michael@0 450 }
michael@0 451 }
michael@0 452 delete idInfo;
michael@0 453 }
michael@0 454 }
michael@0 455
michael@0 456
michael@0 457
michael@0 458
michael@0 459 //----------------------------------------------------------------------------------------------
michael@0 460 //
michael@0 461 // class SpoofData Implementation
michael@0 462 //
michael@0 463 //----------------------------------------------------------------------------------------------
michael@0 464
michael@0 465
michael@0 466 UBool SpoofData::validateDataVersion(const SpoofDataHeader *rawData, UErrorCode &status) {
michael@0 467 if (U_FAILURE(status) ||
michael@0 468 rawData == NULL ||
michael@0 469 rawData->fMagic != USPOOF_MAGIC ||
michael@0 470 rawData->fFormatVersion[0] > 1 ||
michael@0 471 rawData->fFormatVersion[1] > 0) {
michael@0 472 status = U_INVALID_FORMAT_ERROR;
michael@0 473 return FALSE;
michael@0 474 }
michael@0 475 return TRUE;
michael@0 476 }
michael@0 477
michael@0 478 //
michael@0 479 // SpoofData::getDefault() - return a wrapper around the spoof data that is
michael@0 480 // baked into the default ICU data.
michael@0 481 //
michael@0 482 SpoofData *SpoofData::getDefault(UErrorCode &status) {
michael@0 483 // TODO: Cache it. Lazy create, keep until cleanup.
michael@0 484
michael@0 485 UDataMemory *udm = udata_open(NULL, "cfu", "confusables", &status);
michael@0 486 if (U_FAILURE(status)) {
michael@0 487 return NULL;
michael@0 488 }
michael@0 489 SpoofData *This = new SpoofData(udm, status);
michael@0 490 if (U_FAILURE(status)) {
michael@0 491 delete This;
michael@0 492 return NULL;
michael@0 493 }
michael@0 494 if (This == NULL) {
michael@0 495 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 496 }
michael@0 497 return This;
michael@0 498 }
michael@0 499
michael@0 500
michael@0 501 SpoofData::SpoofData(UDataMemory *udm, UErrorCode &status)
michael@0 502 {
michael@0 503 reset();
michael@0 504 if (U_FAILURE(status)) {
michael@0 505 return;
michael@0 506 }
michael@0 507 fRawData = reinterpret_cast<SpoofDataHeader *>
michael@0 508 ((char *)(udm->pHeader) + udm->pHeader->dataHeader.headerSize);
michael@0 509 fUDM = udm;
michael@0 510 validateDataVersion(fRawData, status);
michael@0 511 initPtrs(status);
michael@0 512 }
michael@0 513
michael@0 514
michael@0 515 SpoofData::SpoofData(const void *data, int32_t length, UErrorCode &status)
michael@0 516 {
michael@0 517 reset();
michael@0 518 if (U_FAILURE(status)) {
michael@0 519 return;
michael@0 520 }
michael@0 521 if ((size_t)length < sizeof(SpoofDataHeader)) {
michael@0 522 status = U_INVALID_FORMAT_ERROR;
michael@0 523 return;
michael@0 524 }
michael@0 525 void *ncData = const_cast<void *>(data);
michael@0 526 fRawData = static_cast<SpoofDataHeader *>(ncData);
michael@0 527 if (length < fRawData->fLength) {
michael@0 528 status = U_INVALID_FORMAT_ERROR;
michael@0 529 return;
michael@0 530 }
michael@0 531 validateDataVersion(fRawData, status);
michael@0 532 initPtrs(status);
michael@0 533 }
michael@0 534
michael@0 535
michael@0 536 // Spoof Data constructor for use from data builder.
michael@0 537 // Initializes a new, empty data area that will be populated later.
michael@0 538 SpoofData::SpoofData(UErrorCode &status) {
michael@0 539 reset();
michael@0 540 if (U_FAILURE(status)) {
michael@0 541 return;
michael@0 542 }
michael@0 543 fDataOwned = true;
michael@0 544 fRefCount = 1;
michael@0 545
michael@0 546 // The spoof header should already be sized to be a multiple of 16 bytes.
michael@0 547 // Just in case it's not, round it up.
michael@0 548 uint32_t initialSize = (sizeof(SpoofDataHeader) + 15) & ~15;
michael@0 549 U_ASSERT(initialSize == sizeof(SpoofDataHeader));
michael@0 550
michael@0 551 fRawData = static_cast<SpoofDataHeader *>(uprv_malloc(initialSize));
michael@0 552 fMemLimit = initialSize;
michael@0 553 if (fRawData == NULL) {
michael@0 554 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 555 return;
michael@0 556 }
michael@0 557 uprv_memset(fRawData, 0, initialSize);
michael@0 558
michael@0 559 fRawData->fMagic = USPOOF_MAGIC;
michael@0 560 fRawData->fFormatVersion[0] = 1;
michael@0 561 fRawData->fFormatVersion[1] = 0;
michael@0 562 fRawData->fFormatVersion[2] = 0;
michael@0 563 fRawData->fFormatVersion[3] = 0;
michael@0 564 initPtrs(status);
michael@0 565 }
michael@0 566
michael@0 567 // reset() - initialize all fields.
michael@0 568 // Should be updated if any new fields are added.
michael@0 569 // Called by constructors to put things in a known initial state.
michael@0 570 void SpoofData::reset() {
michael@0 571 fRawData = NULL;
michael@0 572 fDataOwned = FALSE;
michael@0 573 fUDM = NULL;
michael@0 574 fMemLimit = 0;
michael@0 575 fRefCount = 1;
michael@0 576 fCFUKeys = NULL;
michael@0 577 fCFUValues = NULL;
michael@0 578 fCFUStringLengths = NULL;
michael@0 579 fCFUStrings = NULL;
michael@0 580 fAnyCaseTrie = NULL;
michael@0 581 fLowerCaseTrie = NULL;
michael@0 582 fScriptSets = NULL;
michael@0 583 }
michael@0 584
michael@0 585
michael@0 586 // SpoofData::initPtrs()
michael@0 587 // Initialize the pointers to the various sections of the raw data.
michael@0 588 //
michael@0 589 // This function is used both during the Trie building process (multiple
michael@0 590 // times, as the individual data sections are added), and
michael@0 591 // during the opening of a Spoof Checker from prebuilt data.
michael@0 592 //
michael@0 593 // The pointers for non-existent data sections (identified by an offset of 0)
michael@0 594 // are set to NULL.
michael@0 595 //
michael@0 596 // Note: During building the data, adding each new data section
michael@0 597 // reallocs the raw data area, which likely relocates it, which
michael@0 598 // in turn requires reinitializing all of the pointers into it, hence
michael@0 599 // multiple calls to this function during building.
michael@0 600 //
michael@0 601 void SpoofData::initPtrs(UErrorCode &status) {
michael@0 602 fCFUKeys = NULL;
michael@0 603 fCFUValues = NULL;
michael@0 604 fCFUStringLengths = NULL;
michael@0 605 fCFUStrings = NULL;
michael@0 606 if (U_FAILURE(status)) {
michael@0 607 return;
michael@0 608 }
michael@0 609 if (fRawData->fCFUKeys != 0) {
michael@0 610 fCFUKeys = (int32_t *)((char *)fRawData + fRawData->fCFUKeys);
michael@0 611 }
michael@0 612 if (fRawData->fCFUStringIndex != 0) {
michael@0 613 fCFUValues = (uint16_t *)((char *)fRawData + fRawData->fCFUStringIndex);
michael@0 614 }
michael@0 615 if (fRawData->fCFUStringLengths != 0) {
michael@0 616 fCFUStringLengths = (SpoofStringLengthsElement *)((char *)fRawData + fRawData->fCFUStringLengths);
michael@0 617 }
michael@0 618 if (fRawData->fCFUStringTable != 0) {
michael@0 619 fCFUStrings = (UChar *)((char *)fRawData + fRawData->fCFUStringTable);
michael@0 620 }
michael@0 621
michael@0 622 if (fAnyCaseTrie == NULL && fRawData->fAnyCaseTrie != 0) {
michael@0 623 fAnyCaseTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,
michael@0 624 (char *)fRawData + fRawData->fAnyCaseTrie, fRawData->fAnyCaseTrieLength, NULL, &status);
michael@0 625 }
michael@0 626 if (fLowerCaseTrie == NULL && fRawData->fLowerCaseTrie != 0) {
michael@0 627 fLowerCaseTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,
michael@0 628 (char *)fRawData + fRawData->fLowerCaseTrie, fRawData->fLowerCaseTrieLength, NULL, &status);
michael@0 629 }
michael@0 630
michael@0 631 if (fRawData->fScriptSets != 0) {
michael@0 632 fScriptSets = (ScriptSet *)((char *)fRawData + fRawData->fScriptSets);
michael@0 633 }
michael@0 634 }
michael@0 635
michael@0 636
michael@0 637 SpoofData::~SpoofData() {
michael@0 638 utrie2_close(fAnyCaseTrie);
michael@0 639 fAnyCaseTrie = NULL;
michael@0 640 utrie2_close(fLowerCaseTrie);
michael@0 641 fLowerCaseTrie = NULL;
michael@0 642 if (fDataOwned) {
michael@0 643 uprv_free(fRawData);
michael@0 644 }
michael@0 645 fRawData = NULL;
michael@0 646 if (fUDM != NULL) {
michael@0 647 udata_close(fUDM);
michael@0 648 }
michael@0 649 fUDM = NULL;
michael@0 650 }
michael@0 651
michael@0 652
michael@0 653 void SpoofData::removeReference() {
michael@0 654 if (umtx_atomic_dec(&fRefCount) == 0) {
michael@0 655 delete this;
michael@0 656 }
michael@0 657 }
michael@0 658
michael@0 659
michael@0 660 SpoofData *SpoofData::addReference() {
michael@0 661 umtx_atomic_inc(&fRefCount);
michael@0 662 return this;
michael@0 663 }
michael@0 664
michael@0 665
michael@0 666 void *SpoofData::reserveSpace(int32_t numBytes, UErrorCode &status) {
michael@0 667 if (U_FAILURE(status)) {
michael@0 668 return NULL;
michael@0 669 }
michael@0 670 if (!fDataOwned) {
michael@0 671 U_ASSERT(FALSE);
michael@0 672 status = U_INTERNAL_PROGRAM_ERROR;
michael@0 673 return NULL;
michael@0 674 }
michael@0 675
michael@0 676 numBytes = (numBytes + 15) & ~15; // Round up to a multiple of 16
michael@0 677 uint32_t returnOffset = fMemLimit;
michael@0 678 fMemLimit += numBytes;
michael@0 679 fRawData = static_cast<SpoofDataHeader *>(uprv_realloc(fRawData, fMemLimit));
michael@0 680 fRawData->fLength = fMemLimit;
michael@0 681 uprv_memset((char *)fRawData + returnOffset, 0, numBytes);
michael@0 682 initPtrs(status);
michael@0 683 return (char *)fRawData + returnOffset;
michael@0 684 }
michael@0 685
michael@0 686
michael@0 687 U_NAMESPACE_END
michael@0 688
michael@0 689 U_NAMESPACE_USE
michael@0 690
michael@0 691 //-----------------------------------------------------------------------------
michael@0 692 //
michael@0 693 // uspoof_swap - byte swap and char encoding swap of spoof data
michael@0 694 //
michael@0 695 //-----------------------------------------------------------------------------
michael@0 696 U_CAPI int32_t U_EXPORT2
michael@0 697 uspoof_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData,
michael@0 698 UErrorCode *status) {
michael@0 699
michael@0 700 if (status == NULL || U_FAILURE(*status)) {
michael@0 701 return 0;
michael@0 702 }
michael@0 703 if(ds==NULL || inData==NULL || length<-1 || (length>0 && outData==NULL)) {
michael@0 704 *status=U_ILLEGAL_ARGUMENT_ERROR;
michael@0 705 return 0;
michael@0 706 }
michael@0 707
michael@0 708 //
michael@0 709 // Check that the data header is for spoof data.
michael@0 710 // (Header contents are defined in gencfu.cpp)
michael@0 711 //
michael@0 712 const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData+4);
michael@0 713 if(!( pInfo->dataFormat[0]==0x43 && /* dataFormat="Cfu " */
michael@0 714 pInfo->dataFormat[1]==0x66 &&
michael@0 715 pInfo->dataFormat[2]==0x75 &&
michael@0 716 pInfo->dataFormat[3]==0x20 &&
michael@0 717 pInfo->formatVersion[0]==1 )) {
michael@0 718 udata_printError(ds, "uspoof_swap(): data format %02x.%02x.%02x.%02x "
michael@0 719 "(format version %02x %02x %02x %02x) is not recognized\n",
michael@0 720 pInfo->dataFormat[0], pInfo->dataFormat[1],
michael@0 721 pInfo->dataFormat[2], pInfo->dataFormat[3],
michael@0 722 pInfo->formatVersion[0], pInfo->formatVersion[1],
michael@0 723 pInfo->formatVersion[2], pInfo->formatVersion[3]);
michael@0 724 *status=U_UNSUPPORTED_ERROR;
michael@0 725 return 0;
michael@0 726 }
michael@0 727
michael@0 728 //
michael@0 729 // Swap the data header. (This is the generic ICU Data Header, not the uspoof Specific
michael@0 730 // header). This swap also conveniently gets us
michael@0 731 // the size of the ICU d.h., which lets us locate the start
michael@0 732 // of the uspoof specific data.
michael@0 733 //
michael@0 734 int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status);
michael@0 735
michael@0 736
michael@0 737 //
michael@0 738 // Get the Spoof Data Header, and check that it appears to be OK.
michael@0 739 //
michael@0 740 //
michael@0 741 const uint8_t *inBytes =(const uint8_t *)inData+headerSize;
michael@0 742 SpoofDataHeader *spoofDH = (SpoofDataHeader *)inBytes;
michael@0 743 if (ds->readUInt32(spoofDH->fMagic) != USPOOF_MAGIC ||
michael@0 744 ds->readUInt32(spoofDH->fLength) < sizeof(SpoofDataHeader))
michael@0 745 {
michael@0 746 udata_printError(ds, "uspoof_swap(): Spoof Data header is invalid.\n");
michael@0 747 *status=U_UNSUPPORTED_ERROR;
michael@0 748 return 0;
michael@0 749 }
michael@0 750
michael@0 751 //
michael@0 752 // Prefight operation? Just return the size
michael@0 753 //
michael@0 754 int32_t spoofDataLength = ds->readUInt32(spoofDH->fLength);
michael@0 755 int32_t totalSize = headerSize + spoofDataLength;
michael@0 756 if (length < 0) {
michael@0 757 return totalSize;
michael@0 758 }
michael@0 759
michael@0 760 //
michael@0 761 // Check that length passed in is consistent with length from Spoof data header.
michael@0 762 //
michael@0 763 if (length < totalSize) {
michael@0 764 udata_printError(ds, "uspoof_swap(): too few bytes (%d after ICU Data header) for spoof data.\n",
michael@0 765 spoofDataLength);
michael@0 766 *status=U_INDEX_OUTOFBOUNDS_ERROR;
michael@0 767 return 0;
michael@0 768 }
michael@0 769
michael@0 770
michael@0 771 //
michael@0 772 // Swap the Data. Do the data itself first, then the Spoof Data Header, because
michael@0 773 // we need to reference the header to locate the data, and an
michael@0 774 // inplace swap of the header leaves it unusable.
michael@0 775 //
michael@0 776 uint8_t *outBytes = (uint8_t *)outData + headerSize;
michael@0 777 SpoofDataHeader *outputDH = (SpoofDataHeader *)outBytes;
michael@0 778
michael@0 779 int32_t sectionStart;
michael@0 780 int32_t sectionLength;
michael@0 781
michael@0 782 //
michael@0 783 // If not swapping in place, zero out the output buffer before starting.
michael@0 784 // Gaps may exist between the individual sections, and these must be zeroed in
michael@0 785 // the output buffer. The simplest way to do that is to just zero the whole thing.
michael@0 786 //
michael@0 787 if (inBytes != outBytes) {
michael@0 788 uprv_memset(outBytes, 0, spoofDataLength);
michael@0 789 }
michael@0 790
michael@0 791 // Confusables Keys Section (fCFUKeys)
michael@0 792 sectionStart = ds->readUInt32(spoofDH->fCFUKeys);
michael@0 793 sectionLength = ds->readUInt32(spoofDH->fCFUKeysSize) * 4;
michael@0 794 ds->swapArray32(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
michael@0 795
michael@0 796 // String Index Section
michael@0 797 sectionStart = ds->readUInt32(spoofDH->fCFUStringIndex);
michael@0 798 sectionLength = ds->readUInt32(spoofDH->fCFUStringIndexSize) * 2;
michael@0 799 ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
michael@0 800
michael@0 801 // String Table Section
michael@0 802 sectionStart = ds->readUInt32(spoofDH->fCFUStringTable);
michael@0 803 sectionLength = ds->readUInt32(spoofDH->fCFUStringTableLen) * 2;
michael@0 804 ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
michael@0 805
michael@0 806 // String Lengths Section
michael@0 807 sectionStart = ds->readUInt32(spoofDH->fCFUStringLengths);
michael@0 808 sectionLength = ds->readUInt32(spoofDH->fCFUStringLengthsSize) * 4;
michael@0 809 ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
michael@0 810
michael@0 811 // Any Case Trie
michael@0 812 sectionStart = ds->readUInt32(spoofDH->fAnyCaseTrie);
michael@0 813 sectionLength = ds->readUInt32(spoofDH->fAnyCaseTrieLength);
michael@0 814 utrie2_swap(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
michael@0 815
michael@0 816 // Lower Case Trie
michael@0 817 sectionStart = ds->readUInt32(spoofDH->fLowerCaseTrie);
michael@0 818 sectionLength = ds->readUInt32(spoofDH->fLowerCaseTrieLength);
michael@0 819 utrie2_swap(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
michael@0 820
michael@0 821 // Script Sets. The data is an array of int32_t
michael@0 822 sectionStart = ds->readUInt32(spoofDH->fScriptSets);
michael@0 823 sectionLength = ds->readUInt32(spoofDH->fScriptSetsLength) * sizeof(ScriptSet);
michael@0 824 ds->swapArray32(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
michael@0 825
michael@0 826 // And, last, swap the header itself.
michael@0 827 // int32_t fMagic // swap this
michael@0 828 // uint8_t fFormatVersion[4] // Do not swap this, just copy
michael@0 829 // int32_t fLength and all the rest // Swap the rest, all is 32 bit stuff.
michael@0 830 //
michael@0 831 uint32_t magic = ds->readUInt32(spoofDH->fMagic);
michael@0 832 ds->writeUInt32((uint32_t *)&outputDH->fMagic, magic);
michael@0 833
michael@0 834 if (outputDH->fFormatVersion != spoofDH->fFormatVersion) {
michael@0 835 uprv_memcpy(outputDH->fFormatVersion, spoofDH->fFormatVersion, sizeof(spoofDH->fFormatVersion));
michael@0 836 }
michael@0 837 // swap starting at fLength
michael@0 838 ds->swapArray32(ds, &spoofDH->fLength, sizeof(SpoofDataHeader)-8 /* minus magic and fFormatVersion[4] */, &outputDH->fLength, status);
michael@0 839
michael@0 840 return totalSize;
michael@0 841 }
michael@0 842
michael@0 843 #endif
michael@0 844
michael@0 845

mercurial