1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/i18n/uspoof_impl.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,845 @@ 1.4 +/* 1.5 +********************************************************************** 1.6 +* Copyright (C) 2008-2013, International Business Machines 1.7 +* Corporation and others. All Rights Reserved. 1.8 +********************************************************************** 1.9 +*/ 1.10 + 1.11 +#include "unicode/utypes.h" 1.12 +#include "unicode/uspoof.h" 1.13 +#include "unicode/uchar.h" 1.14 +#include "unicode/uniset.h" 1.15 +#include "unicode/utf16.h" 1.16 +#include "utrie2.h" 1.17 +#include "cmemory.h" 1.18 +#include "cstring.h" 1.19 +#include "identifier_info.h" 1.20 +#include "scriptset.h" 1.21 +#include "udatamem.h" 1.22 +#include "umutex.h" 1.23 +#include "udataswp.h" 1.24 +#include "uassert.h" 1.25 +#include "uspoof_impl.h" 1.26 + 1.27 +#if !UCONFIG_NO_NORMALIZATION 1.28 + 1.29 + 1.30 +U_NAMESPACE_BEGIN 1.31 + 1.32 +UOBJECT_DEFINE_RTTI_IMPLEMENTATION(SpoofImpl) 1.33 + 1.34 +SpoofImpl::SpoofImpl(SpoofData *data, UErrorCode &status) : 1.35 + fMagic(0), fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) , 1.36 + fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) { 1.37 + if (U_FAILURE(status)) { 1.38 + return; 1.39 + } 1.40 + fSpoofData = data; 1.41 + fRestrictionLevel = USPOOF_HIGHLY_RESTRICTIVE; 1.42 + 1.43 + UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff); 1.44 + allowedCharsSet->freeze(); 1.45 + fAllowedCharsSet = allowedCharsSet; 1.46 + fAllowedLocales = uprv_strdup(""); 1.47 + if (fAllowedCharsSet == NULL || fAllowedLocales == NULL) { 1.48 + status = U_MEMORY_ALLOCATION_ERROR; 1.49 + return; 1.50 + } 1.51 + fMagic = USPOOF_MAGIC; 1.52 +} 1.53 + 1.54 + 1.55 +SpoofImpl::SpoofImpl() : 1.56 + fMagic(USPOOF_MAGIC), fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) , 1.57 + fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) { 1.58 + UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff); 1.59 + allowedCharsSet->freeze(); 1.60 + fAllowedCharsSet = allowedCharsSet; 1.61 + fAllowedLocales = uprv_strdup(""); 1.62 + fRestrictionLevel = USPOOF_HIGHLY_RESTRICTIVE; 1.63 +} 1.64 + 1.65 + 1.66 +// Copy Constructor, used by the user level clone() function. 1.67 +SpoofImpl::SpoofImpl(const SpoofImpl &src, UErrorCode &status) : 1.68 + fMagic(0), fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) , 1.69 + fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) { 1.70 + if (U_FAILURE(status)) { 1.71 + return; 1.72 + } 1.73 + fMagic = src.fMagic; 1.74 + fChecks = src.fChecks; 1.75 + if (src.fSpoofData != NULL) { 1.76 + fSpoofData = src.fSpoofData->addReference(); 1.77 + } 1.78 + fAllowedCharsSet = static_cast<const UnicodeSet *>(src.fAllowedCharsSet->clone()); 1.79 + if (fAllowedCharsSet == NULL) { 1.80 + status = U_MEMORY_ALLOCATION_ERROR; 1.81 + } 1.82 + fAllowedLocales = uprv_strdup(src.fAllowedLocales); 1.83 + fRestrictionLevel = src.fRestrictionLevel; 1.84 +} 1.85 + 1.86 +SpoofImpl::~SpoofImpl() { 1.87 + fMagic = 0; // head off application errors by preventing use of 1.88 + // of deleted objects. 1.89 + if (fSpoofData != NULL) { 1.90 + fSpoofData->removeReference(); // Will delete if refCount goes to zero. 1.91 + } 1.92 + delete fAllowedCharsSet; 1.93 + uprv_free((void *)fAllowedLocales); 1.94 + delete fCachedIdentifierInfo; 1.95 +} 1.96 + 1.97 +// 1.98 +// Incoming parameter check on Status and the SpoofChecker object 1.99 +// received from the C API. 1.100 +// 1.101 +const SpoofImpl *SpoofImpl::validateThis(const USpoofChecker *sc, UErrorCode &status) { 1.102 + if (U_FAILURE(status)) { 1.103 + return NULL; 1.104 + } 1.105 + if (sc == NULL) { 1.106 + status = U_ILLEGAL_ARGUMENT_ERROR; 1.107 + return NULL; 1.108 + } 1.109 + SpoofImpl *This = (SpoofImpl *)sc; 1.110 + if (This->fMagic != USPOOF_MAGIC || 1.111 + This->fSpoofData == NULL) { 1.112 + status = U_INVALID_FORMAT_ERROR; 1.113 + return NULL; 1.114 + } 1.115 + if (!SpoofData::validateDataVersion(This->fSpoofData->fRawData, status)) { 1.116 + return NULL; 1.117 + } 1.118 + return This; 1.119 +} 1.120 + 1.121 +SpoofImpl *SpoofImpl::validateThis(USpoofChecker *sc, UErrorCode &status) { 1.122 + return const_cast<SpoofImpl *> 1.123 + (SpoofImpl::validateThis(const_cast<const USpoofChecker *>(sc), status)); 1.124 +} 1.125 + 1.126 + 1.127 + 1.128 +//-------------------------------------------------------------------------------------- 1.129 +// 1.130 +// confusableLookup() This is the heart of the confusable skeleton generation 1.131 +// implementation. 1.132 +// 1.133 +// Given a source character, produce the corresponding 1.134 +// replacement character(s), appending them to the dest string. 1.135 +// 1.136 +//--------------------------------------------------------------------------------------- 1.137 +int32_t SpoofImpl::confusableLookup(UChar32 inChar, int32_t tableMask, UnicodeString &dest) const { 1.138 + 1.139 + // Binary search the spoof data key table for the inChar 1.140 + int32_t *low = fSpoofData->fCFUKeys; 1.141 + int32_t *mid = NULL; 1.142 + int32_t *limit = low + fSpoofData->fRawData->fCFUKeysSize; 1.143 + UChar32 midc; 1.144 + do { 1.145 + int32_t delta = ((int32_t)(limit-low))/2; 1.146 + mid = low + delta; 1.147 + midc = *mid & 0x1fffff; 1.148 + if (inChar == midc) { 1.149 + goto foundChar; 1.150 + } else if (inChar < midc) { 1.151 + limit = mid; 1.152 + } else { 1.153 + low = mid; 1.154 + } 1.155 + } while (low < limit-1); 1.156 + mid = low; 1.157 + midc = *mid & 0x1fffff; 1.158 + if (inChar != midc) { 1.159 + // Char not found. It maps to itself. 1.160 + int i = 0; 1.161 + dest.append(inChar); 1.162 + return i; 1.163 + } 1.164 + foundChar: 1.165 + int32_t keyFlags = *mid & 0xff000000; 1.166 + if ((keyFlags & tableMask) == 0) { 1.167 + // We found the right key char, but the entry doesn't pertain to the 1.168 + // table we need. See if there is an adjacent key that does 1.169 + if (keyFlags & USPOOF_KEY_MULTIPLE_VALUES) { 1.170 + int32_t *altMid; 1.171 + for (altMid = mid-1; (*altMid&0x00ffffff) == inChar; altMid--) { 1.172 + keyFlags = *altMid & 0xff000000; 1.173 + if (keyFlags & tableMask) { 1.174 + mid = altMid; 1.175 + goto foundKey; 1.176 + } 1.177 + } 1.178 + for (altMid = mid+1; (*altMid&0x00ffffff) == inChar; altMid++) { 1.179 + keyFlags = *altMid & 0xff000000; 1.180 + if (keyFlags & tableMask) { 1.181 + mid = altMid; 1.182 + goto foundKey; 1.183 + } 1.184 + } 1.185 + } 1.186 + // No key entry for this char & table. 1.187 + // The input char maps to itself. 1.188 + int i = 0; 1.189 + dest.append(inChar); 1.190 + return i; 1.191 + } 1.192 + 1.193 + foundKey: 1.194 + int32_t stringLen = USPOOF_KEY_LENGTH_FIELD(keyFlags) + 1; 1.195 + int32_t keyTableIndex = (int32_t)(mid - fSpoofData->fCFUKeys); 1.196 + 1.197 + // Value is either a UChar (for strings of length 1) or 1.198 + // an index into the string table (for longer strings) 1.199 + uint16_t value = fSpoofData->fCFUValues[keyTableIndex]; 1.200 + if (stringLen == 1) { 1.201 + dest.append((UChar)value); 1.202 + return 1; 1.203 + } 1.204 + 1.205 + // String length of 4 from the above lookup is used for all strings of length >= 4. 1.206 + // For these, get the real length from the string lengths table, 1.207 + // which maps string table indexes to lengths. 1.208 + // All strings of the same length are stored contiguously in the string table. 1.209 + // 'value' from the lookup above is the starting index for the desired string. 1.210 + 1.211 + int32_t ix; 1.212 + if (stringLen == 4) { 1.213 + int32_t stringLengthsLimit = fSpoofData->fRawData->fCFUStringLengthsSize; 1.214 + for (ix = 0; ix < stringLengthsLimit; ix++) { 1.215 + if (fSpoofData->fCFUStringLengths[ix].fLastString >= value) { 1.216 + stringLen = fSpoofData->fCFUStringLengths[ix].fStrLength; 1.217 + break; 1.218 + } 1.219 + } 1.220 + U_ASSERT(ix < stringLengthsLimit); 1.221 + } 1.222 + 1.223 + U_ASSERT(value + stringLen <= fSpoofData->fRawData->fCFUStringTableLen); 1.224 + UChar *src = &fSpoofData->fCFUStrings[value]; 1.225 + dest.append(src, stringLen); 1.226 + return stringLen; 1.227 +} 1.228 + 1.229 + 1.230 +//--------------------------------------------------------------------------------------- 1.231 +// 1.232 +// wholeScriptCheck() 1.233 +// 1.234 +// Input text is already normalized to NFD 1.235 +// Return the set of scripts, each of which can represent something that is 1.236 +// confusable with the input text. The script of the input text 1.237 +// is included; input consisting of characters from a single script will 1.238 +// always produce a result consisting of a set containing that script. 1.239 +// 1.240 +//--------------------------------------------------------------------------------------- 1.241 +void SpoofImpl::wholeScriptCheck( 1.242 + const UnicodeString &text, ScriptSet *result, UErrorCode &status) const { 1.243 + 1.244 + UTrie2 *table = 1.245 + (fChecks & USPOOF_ANY_CASE) ? fSpoofData->fAnyCaseTrie : fSpoofData->fLowerCaseTrie; 1.246 + result->setAll(); 1.247 + int32_t length = text.length(); 1.248 + for (int32_t inputIdx=0; inputIdx < length;) { 1.249 + UChar32 c = text.char32At(inputIdx); 1.250 + inputIdx += U16_LENGTH(c); 1.251 + uint32_t index = utrie2_get32(table, c); 1.252 + if (index == 0) { 1.253 + // No confusables in another script for this char. 1.254 + // TODO: we should change the data to have sets with just the single script 1.255 + // bit for the script of this char. Gets rid of this special case. 1.256 + // Until then, grab the script from the char and intersect it with the set. 1.257 + UScriptCode cpScript = uscript_getScript(c, &status); 1.258 + U_ASSERT(cpScript > USCRIPT_INHERITED); 1.259 + result->intersect(cpScript, status); 1.260 + } else if (index == 1) { 1.261 + // Script == Common or Inherited. Nothing to do. 1.262 + } else { 1.263 + result->intersect(fSpoofData->fScriptSets[index]); 1.264 + } 1.265 + } 1.266 +} 1.267 + 1.268 + 1.269 +void SpoofImpl::setAllowedLocales(const char *localesList, UErrorCode &status) { 1.270 + UnicodeSet allowedChars; 1.271 + UnicodeSet *tmpSet = NULL; 1.272 + const char *locStart = localesList; 1.273 + const char *locEnd = NULL; 1.274 + const char *localesListEnd = localesList + uprv_strlen(localesList); 1.275 + int32_t localeListCount = 0; // Number of locales provided by caller. 1.276 + 1.277 + // Loop runs once per locale from the localesList, a comma separated list of locales. 1.278 + do { 1.279 + locEnd = uprv_strchr(locStart, ','); 1.280 + if (locEnd == NULL) { 1.281 + locEnd = localesListEnd; 1.282 + } 1.283 + while (*locStart == ' ') { 1.284 + locStart++; 1.285 + } 1.286 + const char *trimmedEnd = locEnd-1; 1.287 + while (trimmedEnd > locStart && *trimmedEnd == ' ') { 1.288 + trimmedEnd--; 1.289 + } 1.290 + if (trimmedEnd <= locStart) { 1.291 + break; 1.292 + } 1.293 + const char *locale = uprv_strndup(locStart, (int32_t)(trimmedEnd + 1 - locStart)); 1.294 + localeListCount++; 1.295 + 1.296 + // We have one locale from the locales list. 1.297 + // Add the script chars for this locale to the accumulating set of allowed chars. 1.298 + // If the locale is no good, we will be notified back via status. 1.299 + addScriptChars(locale, &allowedChars, status); 1.300 + uprv_free((void *)locale); 1.301 + if (U_FAILURE(status)) { 1.302 + break; 1.303 + } 1.304 + locStart = locEnd + 1; 1.305 + } while (locStart < localesListEnd); 1.306 + 1.307 + // If our caller provided an empty list of locales, we disable the allowed characters checking 1.308 + if (localeListCount == 0) { 1.309 + uprv_free((void *)fAllowedLocales); 1.310 + fAllowedLocales = uprv_strdup(""); 1.311 + tmpSet = new UnicodeSet(0, 0x10ffff); 1.312 + if (fAllowedLocales == NULL || tmpSet == NULL) { 1.313 + status = U_MEMORY_ALLOCATION_ERROR; 1.314 + return; 1.315 + } 1.316 + tmpSet->freeze(); 1.317 + delete fAllowedCharsSet; 1.318 + fAllowedCharsSet = tmpSet; 1.319 + fChecks &= ~USPOOF_CHAR_LIMIT; 1.320 + return; 1.321 + } 1.322 + 1.323 + 1.324 + // Add all common and inherited characters to the set of allowed chars. 1.325 + UnicodeSet tempSet; 1.326 + tempSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status); 1.327 + allowedChars.addAll(tempSet); 1.328 + tempSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status); 1.329 + allowedChars.addAll(tempSet); 1.330 + 1.331 + // If anything went wrong, we bail out without changing 1.332 + // the state of the spoof checker. 1.333 + if (U_FAILURE(status)) { 1.334 + return; 1.335 + } 1.336 + 1.337 + // Store the updated spoof checker state. 1.338 + tmpSet = static_cast<UnicodeSet *>(allowedChars.clone()); 1.339 + const char *tmpLocalesList = uprv_strdup(localesList); 1.340 + if (tmpSet == NULL || tmpLocalesList == NULL) { 1.341 + status = U_MEMORY_ALLOCATION_ERROR; 1.342 + return; 1.343 + } 1.344 + uprv_free((void *)fAllowedLocales); 1.345 + fAllowedLocales = tmpLocalesList; 1.346 + tmpSet->freeze(); 1.347 + delete fAllowedCharsSet; 1.348 + fAllowedCharsSet = tmpSet; 1.349 + fChecks |= USPOOF_CHAR_LIMIT; 1.350 +} 1.351 + 1.352 + 1.353 +const char * SpoofImpl::getAllowedLocales(UErrorCode &/*status*/) { 1.354 + return fAllowedLocales; 1.355 +} 1.356 + 1.357 + 1.358 +// Given a locale (a language), add all the characters from all of the scripts used with that language 1.359 +// to the allowedChars UnicodeSet 1.360 + 1.361 +void SpoofImpl::addScriptChars(const char *locale, UnicodeSet *allowedChars, UErrorCode &status) { 1.362 + UScriptCode scripts[30]; 1.363 + 1.364 + int32_t numScripts = uscript_getCode(locale, scripts, sizeof(scripts)/sizeof(UScriptCode), &status); 1.365 + if (U_FAILURE(status)) { 1.366 + return; 1.367 + } 1.368 + if (status == U_USING_DEFAULT_WARNING) { 1.369 + status = U_ILLEGAL_ARGUMENT_ERROR; 1.370 + return; 1.371 + } 1.372 + UnicodeSet tmpSet; 1.373 + int32_t i; 1.374 + for (i=0; i<numScripts; i++) { 1.375 + tmpSet.applyIntPropertyValue(UCHAR_SCRIPT, scripts[i], status); 1.376 + allowedChars->addAll(tmpSet); 1.377 + } 1.378 +} 1.379 + 1.380 + 1.381 +// Convert a text format hex number. Utility function used by builder code. Static. 1.382 +// Input: UChar *string text. Output: a UChar32 1.383 +// Input has been pre-checked, and will have no non-hex chars. 1.384 +// The number must fall in the code point range of 0..0x10ffff 1.385 +// Static Function. 1.386 +UChar32 SpoofImpl::ScanHex(const UChar *s, int32_t start, int32_t limit, UErrorCode &status) { 1.387 + if (U_FAILURE(status)) { 1.388 + return 0; 1.389 + } 1.390 + U_ASSERT(limit-start > 0); 1.391 + uint32_t val = 0; 1.392 + int i; 1.393 + for (i=start; i<limit; i++) { 1.394 + int digitVal = s[i] - 0x30; 1.395 + if (digitVal>9) { 1.396 + digitVal = 0xa + (s[i] - 0x41); // Upper Case 'A' 1.397 + } 1.398 + if (digitVal>15) { 1.399 + digitVal = 0xa + (s[i] - 0x61); // Lower Case 'a' 1.400 + } 1.401 + U_ASSERT(digitVal <= 0xf); 1.402 + val <<= 4; 1.403 + val += digitVal; 1.404 + } 1.405 + if (val > 0x10ffff) { 1.406 + status = U_PARSE_ERROR; 1.407 + val = 0; 1.408 + } 1.409 + return (UChar32)val; 1.410 +} 1.411 + 1.412 +// IdentifierInfo Cache. IdentifierInfo objects are somewhat expensive to create. 1.413 +// Maintain a one-element cache, which is sufficient to avoid repeatedly 1.414 +// creating new ones unless we get multi-thread concurrency in spoof 1.415 +// check operations, which should be statistically uncommon. 1.416 + 1.417 +// These functions are used in place of new & delete of an IdentifierInfo. 1.418 +// They will recycle the IdentifierInfo when possible. 1.419 +// They are logically const, and used within const functions that must be thread safe. 1.420 +IdentifierInfo *SpoofImpl::getIdentifierInfo(UErrorCode &status) const { 1.421 + IdentifierInfo *returnIdInfo = NULL; 1.422 + if (U_FAILURE(status)) { 1.423 + return returnIdInfo; 1.424 + } 1.425 + SpoofImpl *nonConstThis = const_cast<SpoofImpl *>(this); 1.426 + { 1.427 + Mutex m; 1.428 + returnIdInfo = nonConstThis->fCachedIdentifierInfo; 1.429 + nonConstThis->fCachedIdentifierInfo = NULL; 1.430 + } 1.431 + if (returnIdInfo == NULL) { 1.432 + returnIdInfo = new IdentifierInfo(status); 1.433 + if (U_SUCCESS(status) && returnIdInfo == NULL) { 1.434 + status = U_MEMORY_ALLOCATION_ERROR; 1.435 + } 1.436 + if (U_FAILURE(status) && returnIdInfo != NULL) { 1.437 + delete returnIdInfo; 1.438 + returnIdInfo = NULL; 1.439 + } 1.440 + } 1.441 + return returnIdInfo; 1.442 +} 1.443 + 1.444 + 1.445 +void SpoofImpl::releaseIdentifierInfo(IdentifierInfo *idInfo) const { 1.446 + if (idInfo != NULL) { 1.447 + SpoofImpl *nonConstThis = const_cast<SpoofImpl *>(this); 1.448 + { 1.449 + Mutex m; 1.450 + if (nonConstThis->fCachedIdentifierInfo == NULL) { 1.451 + nonConstThis->fCachedIdentifierInfo = idInfo; 1.452 + idInfo = NULL; 1.453 + } 1.454 + } 1.455 + delete idInfo; 1.456 + } 1.457 +} 1.458 + 1.459 + 1.460 + 1.461 + 1.462 +//---------------------------------------------------------------------------------------------- 1.463 +// 1.464 +// class SpoofData Implementation 1.465 +// 1.466 +//---------------------------------------------------------------------------------------------- 1.467 + 1.468 + 1.469 +UBool SpoofData::validateDataVersion(const SpoofDataHeader *rawData, UErrorCode &status) { 1.470 + if (U_FAILURE(status) || 1.471 + rawData == NULL || 1.472 + rawData->fMagic != USPOOF_MAGIC || 1.473 + rawData->fFormatVersion[0] > 1 || 1.474 + rawData->fFormatVersion[1] > 0) { 1.475 + status = U_INVALID_FORMAT_ERROR; 1.476 + return FALSE; 1.477 + } 1.478 + return TRUE; 1.479 +} 1.480 + 1.481 +// 1.482 +// SpoofData::getDefault() - return a wrapper around the spoof data that is 1.483 +// baked into the default ICU data. 1.484 +// 1.485 +SpoofData *SpoofData::getDefault(UErrorCode &status) { 1.486 + // TODO: Cache it. Lazy create, keep until cleanup. 1.487 + 1.488 + UDataMemory *udm = udata_open(NULL, "cfu", "confusables", &status); 1.489 + if (U_FAILURE(status)) { 1.490 + return NULL; 1.491 + } 1.492 + SpoofData *This = new SpoofData(udm, status); 1.493 + if (U_FAILURE(status)) { 1.494 + delete This; 1.495 + return NULL; 1.496 + } 1.497 + if (This == NULL) { 1.498 + status = U_MEMORY_ALLOCATION_ERROR; 1.499 + } 1.500 + return This; 1.501 +} 1.502 + 1.503 + 1.504 +SpoofData::SpoofData(UDataMemory *udm, UErrorCode &status) 1.505 +{ 1.506 + reset(); 1.507 + if (U_FAILURE(status)) { 1.508 + return; 1.509 + } 1.510 + fRawData = reinterpret_cast<SpoofDataHeader *> 1.511 + ((char *)(udm->pHeader) + udm->pHeader->dataHeader.headerSize); 1.512 + fUDM = udm; 1.513 + validateDataVersion(fRawData, status); 1.514 + initPtrs(status); 1.515 +} 1.516 + 1.517 + 1.518 +SpoofData::SpoofData(const void *data, int32_t length, UErrorCode &status) 1.519 +{ 1.520 + reset(); 1.521 + if (U_FAILURE(status)) { 1.522 + return; 1.523 + } 1.524 + if ((size_t)length < sizeof(SpoofDataHeader)) { 1.525 + status = U_INVALID_FORMAT_ERROR; 1.526 + return; 1.527 + } 1.528 + void *ncData = const_cast<void *>(data); 1.529 + fRawData = static_cast<SpoofDataHeader *>(ncData); 1.530 + if (length < fRawData->fLength) { 1.531 + status = U_INVALID_FORMAT_ERROR; 1.532 + return; 1.533 + } 1.534 + validateDataVersion(fRawData, status); 1.535 + initPtrs(status); 1.536 +} 1.537 + 1.538 + 1.539 +// Spoof Data constructor for use from data builder. 1.540 +// Initializes a new, empty data area that will be populated later. 1.541 +SpoofData::SpoofData(UErrorCode &status) { 1.542 + reset(); 1.543 + if (U_FAILURE(status)) { 1.544 + return; 1.545 + } 1.546 + fDataOwned = true; 1.547 + fRefCount = 1; 1.548 + 1.549 + // The spoof header should already be sized to be a multiple of 16 bytes. 1.550 + // Just in case it's not, round it up. 1.551 + uint32_t initialSize = (sizeof(SpoofDataHeader) + 15) & ~15; 1.552 + U_ASSERT(initialSize == sizeof(SpoofDataHeader)); 1.553 + 1.554 + fRawData = static_cast<SpoofDataHeader *>(uprv_malloc(initialSize)); 1.555 + fMemLimit = initialSize; 1.556 + if (fRawData == NULL) { 1.557 + status = U_MEMORY_ALLOCATION_ERROR; 1.558 + return; 1.559 + } 1.560 + uprv_memset(fRawData, 0, initialSize); 1.561 + 1.562 + fRawData->fMagic = USPOOF_MAGIC; 1.563 + fRawData->fFormatVersion[0] = 1; 1.564 + fRawData->fFormatVersion[1] = 0; 1.565 + fRawData->fFormatVersion[2] = 0; 1.566 + fRawData->fFormatVersion[3] = 0; 1.567 + initPtrs(status); 1.568 +} 1.569 + 1.570 +// reset() - initialize all fields. 1.571 +// Should be updated if any new fields are added. 1.572 +// Called by constructors to put things in a known initial state. 1.573 +void SpoofData::reset() { 1.574 + fRawData = NULL; 1.575 + fDataOwned = FALSE; 1.576 + fUDM = NULL; 1.577 + fMemLimit = 0; 1.578 + fRefCount = 1; 1.579 + fCFUKeys = NULL; 1.580 + fCFUValues = NULL; 1.581 + fCFUStringLengths = NULL; 1.582 + fCFUStrings = NULL; 1.583 + fAnyCaseTrie = NULL; 1.584 + fLowerCaseTrie = NULL; 1.585 + fScriptSets = NULL; 1.586 +} 1.587 + 1.588 + 1.589 +// SpoofData::initPtrs() 1.590 +// Initialize the pointers to the various sections of the raw data. 1.591 +// 1.592 +// This function is used both during the Trie building process (multiple 1.593 +// times, as the individual data sections are added), and 1.594 +// during the opening of a Spoof Checker from prebuilt data. 1.595 +// 1.596 +// The pointers for non-existent data sections (identified by an offset of 0) 1.597 +// are set to NULL. 1.598 +// 1.599 +// Note: During building the data, adding each new data section 1.600 +// reallocs the raw data area, which likely relocates it, which 1.601 +// in turn requires reinitializing all of the pointers into it, hence 1.602 +// multiple calls to this function during building. 1.603 +// 1.604 +void SpoofData::initPtrs(UErrorCode &status) { 1.605 + fCFUKeys = NULL; 1.606 + fCFUValues = NULL; 1.607 + fCFUStringLengths = NULL; 1.608 + fCFUStrings = NULL; 1.609 + if (U_FAILURE(status)) { 1.610 + return; 1.611 + } 1.612 + if (fRawData->fCFUKeys != 0) { 1.613 + fCFUKeys = (int32_t *)((char *)fRawData + fRawData->fCFUKeys); 1.614 + } 1.615 + if (fRawData->fCFUStringIndex != 0) { 1.616 + fCFUValues = (uint16_t *)((char *)fRawData + fRawData->fCFUStringIndex); 1.617 + } 1.618 + if (fRawData->fCFUStringLengths != 0) { 1.619 + fCFUStringLengths = (SpoofStringLengthsElement *)((char *)fRawData + fRawData->fCFUStringLengths); 1.620 + } 1.621 + if (fRawData->fCFUStringTable != 0) { 1.622 + fCFUStrings = (UChar *)((char *)fRawData + fRawData->fCFUStringTable); 1.623 + } 1.624 + 1.625 + if (fAnyCaseTrie == NULL && fRawData->fAnyCaseTrie != 0) { 1.626 + fAnyCaseTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS, 1.627 + (char *)fRawData + fRawData->fAnyCaseTrie, fRawData->fAnyCaseTrieLength, NULL, &status); 1.628 + } 1.629 + if (fLowerCaseTrie == NULL && fRawData->fLowerCaseTrie != 0) { 1.630 + fLowerCaseTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS, 1.631 + (char *)fRawData + fRawData->fLowerCaseTrie, fRawData->fLowerCaseTrieLength, NULL, &status); 1.632 + } 1.633 + 1.634 + if (fRawData->fScriptSets != 0) { 1.635 + fScriptSets = (ScriptSet *)((char *)fRawData + fRawData->fScriptSets); 1.636 + } 1.637 +} 1.638 + 1.639 + 1.640 +SpoofData::~SpoofData() { 1.641 + utrie2_close(fAnyCaseTrie); 1.642 + fAnyCaseTrie = NULL; 1.643 + utrie2_close(fLowerCaseTrie); 1.644 + fLowerCaseTrie = NULL; 1.645 + if (fDataOwned) { 1.646 + uprv_free(fRawData); 1.647 + } 1.648 + fRawData = NULL; 1.649 + if (fUDM != NULL) { 1.650 + udata_close(fUDM); 1.651 + } 1.652 + fUDM = NULL; 1.653 +} 1.654 + 1.655 + 1.656 +void SpoofData::removeReference() { 1.657 + if (umtx_atomic_dec(&fRefCount) == 0) { 1.658 + delete this; 1.659 + } 1.660 +} 1.661 + 1.662 + 1.663 +SpoofData *SpoofData::addReference() { 1.664 + umtx_atomic_inc(&fRefCount); 1.665 + return this; 1.666 +} 1.667 + 1.668 + 1.669 +void *SpoofData::reserveSpace(int32_t numBytes, UErrorCode &status) { 1.670 + if (U_FAILURE(status)) { 1.671 + return NULL; 1.672 + } 1.673 + if (!fDataOwned) { 1.674 + U_ASSERT(FALSE); 1.675 + status = U_INTERNAL_PROGRAM_ERROR; 1.676 + return NULL; 1.677 + } 1.678 + 1.679 + numBytes = (numBytes + 15) & ~15; // Round up to a multiple of 16 1.680 + uint32_t returnOffset = fMemLimit; 1.681 + fMemLimit += numBytes; 1.682 + fRawData = static_cast<SpoofDataHeader *>(uprv_realloc(fRawData, fMemLimit)); 1.683 + fRawData->fLength = fMemLimit; 1.684 + uprv_memset((char *)fRawData + returnOffset, 0, numBytes); 1.685 + initPtrs(status); 1.686 + return (char *)fRawData + returnOffset; 1.687 +} 1.688 + 1.689 + 1.690 +U_NAMESPACE_END 1.691 + 1.692 +U_NAMESPACE_USE 1.693 + 1.694 +//----------------------------------------------------------------------------- 1.695 +// 1.696 +// uspoof_swap - byte swap and char encoding swap of spoof data 1.697 +// 1.698 +//----------------------------------------------------------------------------- 1.699 +U_CAPI int32_t U_EXPORT2 1.700 +uspoof_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData, 1.701 + UErrorCode *status) { 1.702 + 1.703 + if (status == NULL || U_FAILURE(*status)) { 1.704 + return 0; 1.705 + } 1.706 + if(ds==NULL || inData==NULL || length<-1 || (length>0 && outData==NULL)) { 1.707 + *status=U_ILLEGAL_ARGUMENT_ERROR; 1.708 + return 0; 1.709 + } 1.710 + 1.711 + // 1.712 + // Check that the data header is for spoof data. 1.713 + // (Header contents are defined in gencfu.cpp) 1.714 + // 1.715 + const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData+4); 1.716 + if(!( pInfo->dataFormat[0]==0x43 && /* dataFormat="Cfu " */ 1.717 + pInfo->dataFormat[1]==0x66 && 1.718 + pInfo->dataFormat[2]==0x75 && 1.719 + pInfo->dataFormat[3]==0x20 && 1.720 + pInfo->formatVersion[0]==1 )) { 1.721 + udata_printError(ds, "uspoof_swap(): data format %02x.%02x.%02x.%02x " 1.722 + "(format version %02x %02x %02x %02x) is not recognized\n", 1.723 + pInfo->dataFormat[0], pInfo->dataFormat[1], 1.724 + pInfo->dataFormat[2], pInfo->dataFormat[3], 1.725 + pInfo->formatVersion[0], pInfo->formatVersion[1], 1.726 + pInfo->formatVersion[2], pInfo->formatVersion[3]); 1.727 + *status=U_UNSUPPORTED_ERROR; 1.728 + return 0; 1.729 + } 1.730 + 1.731 + // 1.732 + // Swap the data header. (This is the generic ICU Data Header, not the uspoof Specific 1.733 + // header). This swap also conveniently gets us 1.734 + // the size of the ICU d.h., which lets us locate the start 1.735 + // of the uspoof specific data. 1.736 + // 1.737 + int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status); 1.738 + 1.739 + 1.740 + // 1.741 + // Get the Spoof Data Header, and check that it appears to be OK. 1.742 + // 1.743 + // 1.744 + const uint8_t *inBytes =(const uint8_t *)inData+headerSize; 1.745 + SpoofDataHeader *spoofDH = (SpoofDataHeader *)inBytes; 1.746 + if (ds->readUInt32(spoofDH->fMagic) != USPOOF_MAGIC || 1.747 + ds->readUInt32(spoofDH->fLength) < sizeof(SpoofDataHeader)) 1.748 + { 1.749 + udata_printError(ds, "uspoof_swap(): Spoof Data header is invalid.\n"); 1.750 + *status=U_UNSUPPORTED_ERROR; 1.751 + return 0; 1.752 + } 1.753 + 1.754 + // 1.755 + // Prefight operation? Just return the size 1.756 + // 1.757 + int32_t spoofDataLength = ds->readUInt32(spoofDH->fLength); 1.758 + int32_t totalSize = headerSize + spoofDataLength; 1.759 + if (length < 0) { 1.760 + return totalSize; 1.761 + } 1.762 + 1.763 + // 1.764 + // Check that length passed in is consistent with length from Spoof data header. 1.765 + // 1.766 + if (length < totalSize) { 1.767 + udata_printError(ds, "uspoof_swap(): too few bytes (%d after ICU Data header) for spoof data.\n", 1.768 + spoofDataLength); 1.769 + *status=U_INDEX_OUTOFBOUNDS_ERROR; 1.770 + return 0; 1.771 + } 1.772 + 1.773 + 1.774 + // 1.775 + // Swap the Data. Do the data itself first, then the Spoof Data Header, because 1.776 + // we need to reference the header to locate the data, and an 1.777 + // inplace swap of the header leaves it unusable. 1.778 + // 1.779 + uint8_t *outBytes = (uint8_t *)outData + headerSize; 1.780 + SpoofDataHeader *outputDH = (SpoofDataHeader *)outBytes; 1.781 + 1.782 + int32_t sectionStart; 1.783 + int32_t sectionLength; 1.784 + 1.785 + // 1.786 + // If not swapping in place, zero out the output buffer before starting. 1.787 + // Gaps may exist between the individual sections, and these must be zeroed in 1.788 + // the output buffer. The simplest way to do that is to just zero the whole thing. 1.789 + // 1.790 + if (inBytes != outBytes) { 1.791 + uprv_memset(outBytes, 0, spoofDataLength); 1.792 + } 1.793 + 1.794 + // Confusables Keys Section (fCFUKeys) 1.795 + sectionStart = ds->readUInt32(spoofDH->fCFUKeys); 1.796 + sectionLength = ds->readUInt32(spoofDH->fCFUKeysSize) * 4; 1.797 + ds->swapArray32(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status); 1.798 + 1.799 + // String Index Section 1.800 + sectionStart = ds->readUInt32(spoofDH->fCFUStringIndex); 1.801 + sectionLength = ds->readUInt32(spoofDH->fCFUStringIndexSize) * 2; 1.802 + ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status); 1.803 + 1.804 + // String Table Section 1.805 + sectionStart = ds->readUInt32(spoofDH->fCFUStringTable); 1.806 + sectionLength = ds->readUInt32(spoofDH->fCFUStringTableLen) * 2; 1.807 + ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status); 1.808 + 1.809 + // String Lengths Section 1.810 + sectionStart = ds->readUInt32(spoofDH->fCFUStringLengths); 1.811 + sectionLength = ds->readUInt32(spoofDH->fCFUStringLengthsSize) * 4; 1.812 + ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status); 1.813 + 1.814 + // Any Case Trie 1.815 + sectionStart = ds->readUInt32(spoofDH->fAnyCaseTrie); 1.816 + sectionLength = ds->readUInt32(spoofDH->fAnyCaseTrieLength); 1.817 + utrie2_swap(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status); 1.818 + 1.819 + // Lower Case Trie 1.820 + sectionStart = ds->readUInt32(spoofDH->fLowerCaseTrie); 1.821 + sectionLength = ds->readUInt32(spoofDH->fLowerCaseTrieLength); 1.822 + utrie2_swap(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status); 1.823 + 1.824 + // Script Sets. The data is an array of int32_t 1.825 + sectionStart = ds->readUInt32(spoofDH->fScriptSets); 1.826 + sectionLength = ds->readUInt32(spoofDH->fScriptSetsLength) * sizeof(ScriptSet); 1.827 + ds->swapArray32(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status); 1.828 + 1.829 + // And, last, swap the header itself. 1.830 + // int32_t fMagic // swap this 1.831 + // uint8_t fFormatVersion[4] // Do not swap this, just copy 1.832 + // int32_t fLength and all the rest // Swap the rest, all is 32 bit stuff. 1.833 + // 1.834 + uint32_t magic = ds->readUInt32(spoofDH->fMagic); 1.835 + ds->writeUInt32((uint32_t *)&outputDH->fMagic, magic); 1.836 + 1.837 + if (outputDH->fFormatVersion != spoofDH->fFormatVersion) { 1.838 + uprv_memcpy(outputDH->fFormatVersion, spoofDH->fFormatVersion, sizeof(spoofDH->fFormatVersion)); 1.839 + } 1.840 + // swap starting at fLength 1.841 + ds->swapArray32(ds, &spoofDH->fLength, sizeof(SpoofDataHeader)-8 /* minus magic and fFormatVersion[4] */, &outputDH->fLength, status); 1.842 + 1.843 + return totalSize; 1.844 +} 1.845 + 1.846 +#endif 1.847 + 1.848 +