michael@0: /* michael@0: ********************************************************************** michael@0: * Copyright (C) 2008-2013, International Business Machines michael@0: * Corporation and others. All Rights Reserved. michael@0: ********************************************************************** michael@0: */ michael@0: michael@0: #include "unicode/utypes.h" michael@0: #include "unicode/uspoof.h" michael@0: #include "unicode/uchar.h" michael@0: #include "unicode/uniset.h" michael@0: #include "unicode/utf16.h" michael@0: #include "utrie2.h" michael@0: #include "cmemory.h" michael@0: #include "cstring.h" michael@0: #include "identifier_info.h" michael@0: #include "scriptset.h" michael@0: #include "udatamem.h" michael@0: #include "umutex.h" michael@0: #include "udataswp.h" michael@0: #include "uassert.h" michael@0: #include "uspoof_impl.h" michael@0: michael@0: #if !UCONFIG_NO_NORMALIZATION michael@0: michael@0: michael@0: U_NAMESPACE_BEGIN michael@0: michael@0: UOBJECT_DEFINE_RTTI_IMPLEMENTATION(SpoofImpl) michael@0: michael@0: SpoofImpl::SpoofImpl(SpoofData *data, UErrorCode &status) : michael@0: fMagic(0), fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) , michael@0: fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) { michael@0: if (U_FAILURE(status)) { michael@0: return; michael@0: } michael@0: fSpoofData = data; michael@0: fRestrictionLevel = USPOOF_HIGHLY_RESTRICTIVE; michael@0: michael@0: UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff); michael@0: allowedCharsSet->freeze(); michael@0: fAllowedCharsSet = allowedCharsSet; michael@0: fAllowedLocales = uprv_strdup(""); michael@0: if (fAllowedCharsSet == NULL || fAllowedLocales == NULL) { michael@0: status = U_MEMORY_ALLOCATION_ERROR; michael@0: return; michael@0: } michael@0: fMagic = USPOOF_MAGIC; michael@0: } michael@0: michael@0: michael@0: SpoofImpl::SpoofImpl() : michael@0: fMagic(USPOOF_MAGIC), fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) , michael@0: fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) { michael@0: UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff); michael@0: allowedCharsSet->freeze(); michael@0: fAllowedCharsSet = allowedCharsSet; michael@0: fAllowedLocales = uprv_strdup(""); michael@0: fRestrictionLevel = USPOOF_HIGHLY_RESTRICTIVE; michael@0: } michael@0: michael@0: michael@0: // Copy Constructor, used by the user level clone() function. michael@0: SpoofImpl::SpoofImpl(const SpoofImpl &src, UErrorCode &status) : michael@0: fMagic(0), fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) , michael@0: fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) { michael@0: if (U_FAILURE(status)) { michael@0: return; michael@0: } michael@0: fMagic = src.fMagic; michael@0: fChecks = src.fChecks; michael@0: if (src.fSpoofData != NULL) { michael@0: fSpoofData = src.fSpoofData->addReference(); michael@0: } michael@0: fAllowedCharsSet = static_cast(src.fAllowedCharsSet->clone()); michael@0: if (fAllowedCharsSet == NULL) { michael@0: status = U_MEMORY_ALLOCATION_ERROR; michael@0: } michael@0: fAllowedLocales = uprv_strdup(src.fAllowedLocales); michael@0: fRestrictionLevel = src.fRestrictionLevel; michael@0: } michael@0: michael@0: SpoofImpl::~SpoofImpl() { michael@0: fMagic = 0; // head off application errors by preventing use of michael@0: // of deleted objects. michael@0: if (fSpoofData != NULL) { michael@0: fSpoofData->removeReference(); // Will delete if refCount goes to zero. michael@0: } michael@0: delete fAllowedCharsSet; michael@0: uprv_free((void *)fAllowedLocales); michael@0: delete fCachedIdentifierInfo; michael@0: } michael@0: michael@0: // michael@0: // Incoming parameter check on Status and the SpoofChecker object michael@0: // received from the C API. michael@0: // michael@0: const SpoofImpl *SpoofImpl::validateThis(const USpoofChecker *sc, UErrorCode &status) { michael@0: if (U_FAILURE(status)) { michael@0: return NULL; michael@0: } michael@0: if (sc == NULL) { michael@0: status = U_ILLEGAL_ARGUMENT_ERROR; michael@0: return NULL; michael@0: } michael@0: SpoofImpl *This = (SpoofImpl *)sc; michael@0: if (This->fMagic != USPOOF_MAGIC || michael@0: This->fSpoofData == NULL) { michael@0: status = U_INVALID_FORMAT_ERROR; michael@0: return NULL; michael@0: } michael@0: if (!SpoofData::validateDataVersion(This->fSpoofData->fRawData, status)) { michael@0: return NULL; michael@0: } michael@0: return This; michael@0: } michael@0: michael@0: SpoofImpl *SpoofImpl::validateThis(USpoofChecker *sc, UErrorCode &status) { michael@0: return const_cast michael@0: (SpoofImpl::validateThis(const_cast(sc), status)); michael@0: } michael@0: michael@0: michael@0: michael@0: //-------------------------------------------------------------------------------------- michael@0: // michael@0: // confusableLookup() This is the heart of the confusable skeleton generation michael@0: // implementation. michael@0: // michael@0: // Given a source character, produce the corresponding michael@0: // replacement character(s), appending them to the dest string. michael@0: // michael@0: //--------------------------------------------------------------------------------------- michael@0: int32_t SpoofImpl::confusableLookup(UChar32 inChar, int32_t tableMask, UnicodeString &dest) const { michael@0: michael@0: // Binary search the spoof data key table for the inChar michael@0: int32_t *low = fSpoofData->fCFUKeys; michael@0: int32_t *mid = NULL; michael@0: int32_t *limit = low + fSpoofData->fRawData->fCFUKeysSize; michael@0: UChar32 midc; michael@0: do { michael@0: int32_t delta = ((int32_t)(limit-low))/2; michael@0: mid = low + delta; michael@0: midc = *mid & 0x1fffff; michael@0: if (inChar == midc) { michael@0: goto foundChar; michael@0: } else if (inChar < midc) { michael@0: limit = mid; michael@0: } else { michael@0: low = mid; michael@0: } michael@0: } while (low < limit-1); michael@0: mid = low; michael@0: midc = *mid & 0x1fffff; michael@0: if (inChar != midc) { michael@0: // Char not found. It maps to itself. michael@0: int i = 0; michael@0: dest.append(inChar); michael@0: return i; michael@0: } michael@0: foundChar: michael@0: int32_t keyFlags = *mid & 0xff000000; michael@0: if ((keyFlags & tableMask) == 0) { michael@0: // We found the right key char, but the entry doesn't pertain to the michael@0: // table we need. See if there is an adjacent key that does michael@0: if (keyFlags & USPOOF_KEY_MULTIPLE_VALUES) { michael@0: int32_t *altMid; michael@0: for (altMid = mid-1; (*altMid&0x00ffffff) == inChar; altMid--) { michael@0: keyFlags = *altMid & 0xff000000; michael@0: if (keyFlags & tableMask) { michael@0: mid = altMid; michael@0: goto foundKey; michael@0: } michael@0: } michael@0: for (altMid = mid+1; (*altMid&0x00ffffff) == inChar; altMid++) { michael@0: keyFlags = *altMid & 0xff000000; michael@0: if (keyFlags & tableMask) { michael@0: mid = altMid; michael@0: goto foundKey; michael@0: } michael@0: } michael@0: } michael@0: // No key entry for this char & table. michael@0: // The input char maps to itself. michael@0: int i = 0; michael@0: dest.append(inChar); michael@0: return i; michael@0: } michael@0: michael@0: foundKey: michael@0: int32_t stringLen = USPOOF_KEY_LENGTH_FIELD(keyFlags) + 1; michael@0: int32_t keyTableIndex = (int32_t)(mid - fSpoofData->fCFUKeys); michael@0: michael@0: // Value is either a UChar (for strings of length 1) or michael@0: // an index into the string table (for longer strings) michael@0: uint16_t value = fSpoofData->fCFUValues[keyTableIndex]; michael@0: if (stringLen == 1) { michael@0: dest.append((UChar)value); michael@0: return 1; michael@0: } michael@0: michael@0: // String length of 4 from the above lookup is used for all strings of length >= 4. michael@0: // For these, get the real length from the string lengths table, michael@0: // which maps string table indexes to lengths. michael@0: // All strings of the same length are stored contiguously in the string table. michael@0: // 'value' from the lookup above is the starting index for the desired string. michael@0: michael@0: int32_t ix; michael@0: if (stringLen == 4) { michael@0: int32_t stringLengthsLimit = fSpoofData->fRawData->fCFUStringLengthsSize; michael@0: for (ix = 0; ix < stringLengthsLimit; ix++) { michael@0: if (fSpoofData->fCFUStringLengths[ix].fLastString >= value) { michael@0: stringLen = fSpoofData->fCFUStringLengths[ix].fStrLength; michael@0: break; michael@0: } michael@0: } michael@0: U_ASSERT(ix < stringLengthsLimit); michael@0: } michael@0: michael@0: U_ASSERT(value + stringLen <= fSpoofData->fRawData->fCFUStringTableLen); michael@0: UChar *src = &fSpoofData->fCFUStrings[value]; michael@0: dest.append(src, stringLen); michael@0: return stringLen; michael@0: } michael@0: michael@0: michael@0: //--------------------------------------------------------------------------------------- michael@0: // michael@0: // wholeScriptCheck() michael@0: // michael@0: // Input text is already normalized to NFD michael@0: // Return the set of scripts, each of which can represent something that is michael@0: // confusable with the input text. The script of the input text michael@0: // is included; input consisting of characters from a single script will michael@0: // always produce a result consisting of a set containing that script. michael@0: // michael@0: //--------------------------------------------------------------------------------------- michael@0: void SpoofImpl::wholeScriptCheck( michael@0: const UnicodeString &text, ScriptSet *result, UErrorCode &status) const { michael@0: michael@0: UTrie2 *table = michael@0: (fChecks & USPOOF_ANY_CASE) ? fSpoofData->fAnyCaseTrie : fSpoofData->fLowerCaseTrie; michael@0: result->setAll(); michael@0: int32_t length = text.length(); michael@0: for (int32_t inputIdx=0; inputIdx < length;) { michael@0: UChar32 c = text.char32At(inputIdx); michael@0: inputIdx += U16_LENGTH(c); michael@0: uint32_t index = utrie2_get32(table, c); michael@0: if (index == 0) { michael@0: // No confusables in another script for this char. michael@0: // TODO: we should change the data to have sets with just the single script michael@0: // bit for the script of this char. Gets rid of this special case. michael@0: // Until then, grab the script from the char and intersect it with the set. michael@0: UScriptCode cpScript = uscript_getScript(c, &status); michael@0: U_ASSERT(cpScript > USCRIPT_INHERITED); michael@0: result->intersect(cpScript, status); michael@0: } else if (index == 1) { michael@0: // Script == Common or Inherited. Nothing to do. michael@0: } else { michael@0: result->intersect(fSpoofData->fScriptSets[index]); michael@0: } michael@0: } michael@0: } michael@0: michael@0: michael@0: void SpoofImpl::setAllowedLocales(const char *localesList, UErrorCode &status) { michael@0: UnicodeSet allowedChars; michael@0: UnicodeSet *tmpSet = NULL; michael@0: const char *locStart = localesList; michael@0: const char *locEnd = NULL; michael@0: const char *localesListEnd = localesList + uprv_strlen(localesList); michael@0: int32_t localeListCount = 0; // Number of locales provided by caller. michael@0: michael@0: // Loop runs once per locale from the localesList, a comma separated list of locales. michael@0: do { michael@0: locEnd = uprv_strchr(locStart, ','); michael@0: if (locEnd == NULL) { michael@0: locEnd = localesListEnd; michael@0: } michael@0: while (*locStart == ' ') { michael@0: locStart++; michael@0: } michael@0: const char *trimmedEnd = locEnd-1; michael@0: while (trimmedEnd > locStart && *trimmedEnd == ' ') { michael@0: trimmedEnd--; michael@0: } michael@0: if (trimmedEnd <= locStart) { michael@0: break; michael@0: } michael@0: const char *locale = uprv_strndup(locStart, (int32_t)(trimmedEnd + 1 - locStart)); michael@0: localeListCount++; michael@0: michael@0: // We have one locale from the locales list. michael@0: // Add the script chars for this locale to the accumulating set of allowed chars. michael@0: // If the locale is no good, we will be notified back via status. michael@0: addScriptChars(locale, &allowedChars, status); michael@0: uprv_free((void *)locale); michael@0: if (U_FAILURE(status)) { michael@0: break; michael@0: } michael@0: locStart = locEnd + 1; michael@0: } while (locStart < localesListEnd); michael@0: michael@0: // If our caller provided an empty list of locales, we disable the allowed characters checking michael@0: if (localeListCount == 0) { michael@0: uprv_free((void *)fAllowedLocales); michael@0: fAllowedLocales = uprv_strdup(""); michael@0: tmpSet = new UnicodeSet(0, 0x10ffff); michael@0: if (fAllowedLocales == NULL || tmpSet == NULL) { michael@0: status = U_MEMORY_ALLOCATION_ERROR; michael@0: return; michael@0: } michael@0: tmpSet->freeze(); michael@0: delete fAllowedCharsSet; michael@0: fAllowedCharsSet = tmpSet; michael@0: fChecks &= ~USPOOF_CHAR_LIMIT; michael@0: return; michael@0: } michael@0: michael@0: michael@0: // Add all common and inherited characters to the set of allowed chars. michael@0: UnicodeSet tempSet; michael@0: tempSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status); michael@0: allowedChars.addAll(tempSet); michael@0: tempSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status); michael@0: allowedChars.addAll(tempSet); michael@0: michael@0: // If anything went wrong, we bail out without changing michael@0: // the state of the spoof checker. michael@0: if (U_FAILURE(status)) { michael@0: return; michael@0: } michael@0: michael@0: // Store the updated spoof checker state. michael@0: tmpSet = static_cast(allowedChars.clone()); michael@0: const char *tmpLocalesList = uprv_strdup(localesList); michael@0: if (tmpSet == NULL || tmpLocalesList == NULL) { michael@0: status = U_MEMORY_ALLOCATION_ERROR; michael@0: return; michael@0: } michael@0: uprv_free((void *)fAllowedLocales); michael@0: fAllowedLocales = tmpLocalesList; michael@0: tmpSet->freeze(); michael@0: delete fAllowedCharsSet; michael@0: fAllowedCharsSet = tmpSet; michael@0: fChecks |= USPOOF_CHAR_LIMIT; michael@0: } michael@0: michael@0: michael@0: const char * SpoofImpl::getAllowedLocales(UErrorCode &/*status*/) { michael@0: return fAllowedLocales; michael@0: } michael@0: michael@0: michael@0: // Given a locale (a language), add all the characters from all of the scripts used with that language michael@0: // to the allowedChars UnicodeSet michael@0: michael@0: void SpoofImpl::addScriptChars(const char *locale, UnicodeSet *allowedChars, UErrorCode &status) { michael@0: UScriptCode scripts[30]; michael@0: michael@0: int32_t numScripts = uscript_getCode(locale, scripts, sizeof(scripts)/sizeof(UScriptCode), &status); michael@0: if (U_FAILURE(status)) { michael@0: return; michael@0: } michael@0: if (status == U_USING_DEFAULT_WARNING) { michael@0: status = U_ILLEGAL_ARGUMENT_ERROR; michael@0: return; michael@0: } michael@0: UnicodeSet tmpSet; michael@0: int32_t i; michael@0: for (i=0; iaddAll(tmpSet); michael@0: } michael@0: } michael@0: michael@0: michael@0: // Convert a text format hex number. Utility function used by builder code. Static. michael@0: // Input: UChar *string text. Output: a UChar32 michael@0: // Input has been pre-checked, and will have no non-hex chars. michael@0: // The number must fall in the code point range of 0..0x10ffff michael@0: // Static Function. michael@0: UChar32 SpoofImpl::ScanHex(const UChar *s, int32_t start, int32_t limit, UErrorCode &status) { michael@0: if (U_FAILURE(status)) { michael@0: return 0; michael@0: } michael@0: U_ASSERT(limit-start > 0); michael@0: uint32_t val = 0; michael@0: int i; michael@0: for (i=start; i9) { michael@0: digitVal = 0xa + (s[i] - 0x41); // Upper Case 'A' michael@0: } michael@0: if (digitVal>15) { michael@0: digitVal = 0xa + (s[i] - 0x61); // Lower Case 'a' michael@0: } michael@0: U_ASSERT(digitVal <= 0xf); michael@0: val <<= 4; michael@0: val += digitVal; michael@0: } michael@0: if (val > 0x10ffff) { michael@0: status = U_PARSE_ERROR; michael@0: val = 0; michael@0: } michael@0: return (UChar32)val; michael@0: } michael@0: michael@0: // IdentifierInfo Cache. IdentifierInfo objects are somewhat expensive to create. michael@0: // Maintain a one-element cache, which is sufficient to avoid repeatedly michael@0: // creating new ones unless we get multi-thread concurrency in spoof michael@0: // check operations, which should be statistically uncommon. michael@0: michael@0: // These functions are used in place of new & delete of an IdentifierInfo. michael@0: // They will recycle the IdentifierInfo when possible. michael@0: // They are logically const, and used within const functions that must be thread safe. michael@0: IdentifierInfo *SpoofImpl::getIdentifierInfo(UErrorCode &status) const { michael@0: IdentifierInfo *returnIdInfo = NULL; michael@0: if (U_FAILURE(status)) { michael@0: return returnIdInfo; michael@0: } michael@0: SpoofImpl *nonConstThis = const_cast(this); michael@0: { michael@0: Mutex m; michael@0: returnIdInfo = nonConstThis->fCachedIdentifierInfo; michael@0: nonConstThis->fCachedIdentifierInfo = NULL; michael@0: } michael@0: if (returnIdInfo == NULL) { michael@0: returnIdInfo = new IdentifierInfo(status); michael@0: if (U_SUCCESS(status) && returnIdInfo == NULL) { michael@0: status = U_MEMORY_ALLOCATION_ERROR; michael@0: } michael@0: if (U_FAILURE(status) && returnIdInfo != NULL) { michael@0: delete returnIdInfo; michael@0: returnIdInfo = NULL; michael@0: } michael@0: } michael@0: return returnIdInfo; michael@0: } michael@0: michael@0: michael@0: void SpoofImpl::releaseIdentifierInfo(IdentifierInfo *idInfo) const { michael@0: if (idInfo != NULL) { michael@0: SpoofImpl *nonConstThis = const_cast(this); michael@0: { michael@0: Mutex m; michael@0: if (nonConstThis->fCachedIdentifierInfo == NULL) { michael@0: nonConstThis->fCachedIdentifierInfo = idInfo; michael@0: idInfo = NULL; michael@0: } michael@0: } michael@0: delete idInfo; michael@0: } michael@0: } michael@0: michael@0: michael@0: michael@0: michael@0: //---------------------------------------------------------------------------------------------- michael@0: // michael@0: // class SpoofData Implementation michael@0: // michael@0: //---------------------------------------------------------------------------------------------- michael@0: michael@0: michael@0: UBool SpoofData::validateDataVersion(const SpoofDataHeader *rawData, UErrorCode &status) { michael@0: if (U_FAILURE(status) || michael@0: rawData == NULL || michael@0: rawData->fMagic != USPOOF_MAGIC || michael@0: rawData->fFormatVersion[0] > 1 || michael@0: rawData->fFormatVersion[1] > 0) { michael@0: status = U_INVALID_FORMAT_ERROR; michael@0: return FALSE; michael@0: } michael@0: return TRUE; michael@0: } michael@0: michael@0: // michael@0: // SpoofData::getDefault() - return a wrapper around the spoof data that is michael@0: // baked into the default ICU data. michael@0: // michael@0: SpoofData *SpoofData::getDefault(UErrorCode &status) { michael@0: // TODO: Cache it. Lazy create, keep until cleanup. michael@0: michael@0: UDataMemory *udm = udata_open(NULL, "cfu", "confusables", &status); michael@0: if (U_FAILURE(status)) { michael@0: return NULL; michael@0: } michael@0: SpoofData *This = new SpoofData(udm, status); michael@0: if (U_FAILURE(status)) { michael@0: delete This; michael@0: return NULL; michael@0: } michael@0: if (This == NULL) { michael@0: status = U_MEMORY_ALLOCATION_ERROR; michael@0: } michael@0: return This; michael@0: } michael@0: michael@0: michael@0: SpoofData::SpoofData(UDataMemory *udm, UErrorCode &status) michael@0: { michael@0: reset(); michael@0: if (U_FAILURE(status)) { michael@0: return; michael@0: } michael@0: fRawData = reinterpret_cast michael@0: ((char *)(udm->pHeader) + udm->pHeader->dataHeader.headerSize); michael@0: fUDM = udm; michael@0: validateDataVersion(fRawData, status); michael@0: initPtrs(status); michael@0: } michael@0: michael@0: michael@0: SpoofData::SpoofData(const void *data, int32_t length, UErrorCode &status) michael@0: { michael@0: reset(); michael@0: if (U_FAILURE(status)) { michael@0: return; michael@0: } michael@0: if ((size_t)length < sizeof(SpoofDataHeader)) { michael@0: status = U_INVALID_FORMAT_ERROR; michael@0: return; michael@0: } michael@0: void *ncData = const_cast(data); michael@0: fRawData = static_cast(ncData); michael@0: if (length < fRawData->fLength) { michael@0: status = U_INVALID_FORMAT_ERROR; michael@0: return; michael@0: } michael@0: validateDataVersion(fRawData, status); michael@0: initPtrs(status); michael@0: } michael@0: michael@0: michael@0: // Spoof Data constructor for use from data builder. michael@0: // Initializes a new, empty data area that will be populated later. michael@0: SpoofData::SpoofData(UErrorCode &status) { michael@0: reset(); michael@0: if (U_FAILURE(status)) { michael@0: return; michael@0: } michael@0: fDataOwned = true; michael@0: fRefCount = 1; michael@0: michael@0: // The spoof header should already be sized to be a multiple of 16 bytes. michael@0: // Just in case it's not, round it up. michael@0: uint32_t initialSize = (sizeof(SpoofDataHeader) + 15) & ~15; michael@0: U_ASSERT(initialSize == sizeof(SpoofDataHeader)); michael@0: michael@0: fRawData = static_cast(uprv_malloc(initialSize)); michael@0: fMemLimit = initialSize; michael@0: if (fRawData == NULL) { michael@0: status = U_MEMORY_ALLOCATION_ERROR; michael@0: return; michael@0: } michael@0: uprv_memset(fRawData, 0, initialSize); michael@0: michael@0: fRawData->fMagic = USPOOF_MAGIC; michael@0: fRawData->fFormatVersion[0] = 1; michael@0: fRawData->fFormatVersion[1] = 0; michael@0: fRawData->fFormatVersion[2] = 0; michael@0: fRawData->fFormatVersion[3] = 0; michael@0: initPtrs(status); michael@0: } michael@0: michael@0: // reset() - initialize all fields. michael@0: // Should be updated if any new fields are added. michael@0: // Called by constructors to put things in a known initial state. michael@0: void SpoofData::reset() { michael@0: fRawData = NULL; michael@0: fDataOwned = FALSE; michael@0: fUDM = NULL; michael@0: fMemLimit = 0; michael@0: fRefCount = 1; michael@0: fCFUKeys = NULL; michael@0: fCFUValues = NULL; michael@0: fCFUStringLengths = NULL; michael@0: fCFUStrings = NULL; michael@0: fAnyCaseTrie = NULL; michael@0: fLowerCaseTrie = NULL; michael@0: fScriptSets = NULL; michael@0: } michael@0: michael@0: michael@0: // SpoofData::initPtrs() michael@0: // Initialize the pointers to the various sections of the raw data. michael@0: // michael@0: // This function is used both during the Trie building process (multiple michael@0: // times, as the individual data sections are added), and michael@0: // during the opening of a Spoof Checker from prebuilt data. michael@0: // michael@0: // The pointers for non-existent data sections (identified by an offset of 0) michael@0: // are set to NULL. michael@0: // michael@0: // Note: During building the data, adding each new data section michael@0: // reallocs the raw data area, which likely relocates it, which michael@0: // in turn requires reinitializing all of the pointers into it, hence michael@0: // multiple calls to this function during building. michael@0: // michael@0: void SpoofData::initPtrs(UErrorCode &status) { michael@0: fCFUKeys = NULL; michael@0: fCFUValues = NULL; michael@0: fCFUStringLengths = NULL; michael@0: fCFUStrings = NULL; michael@0: if (U_FAILURE(status)) { michael@0: return; michael@0: } michael@0: if (fRawData->fCFUKeys != 0) { michael@0: fCFUKeys = (int32_t *)((char *)fRawData + fRawData->fCFUKeys); michael@0: } michael@0: if (fRawData->fCFUStringIndex != 0) { michael@0: fCFUValues = (uint16_t *)((char *)fRawData + fRawData->fCFUStringIndex); michael@0: } michael@0: if (fRawData->fCFUStringLengths != 0) { michael@0: fCFUStringLengths = (SpoofStringLengthsElement *)((char *)fRawData + fRawData->fCFUStringLengths); michael@0: } michael@0: if (fRawData->fCFUStringTable != 0) { michael@0: fCFUStrings = (UChar *)((char *)fRawData + fRawData->fCFUStringTable); michael@0: } michael@0: michael@0: if (fAnyCaseTrie == NULL && fRawData->fAnyCaseTrie != 0) { michael@0: fAnyCaseTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS, michael@0: (char *)fRawData + fRawData->fAnyCaseTrie, fRawData->fAnyCaseTrieLength, NULL, &status); michael@0: } michael@0: if (fLowerCaseTrie == NULL && fRawData->fLowerCaseTrie != 0) { michael@0: fLowerCaseTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS, michael@0: (char *)fRawData + fRawData->fLowerCaseTrie, fRawData->fLowerCaseTrieLength, NULL, &status); michael@0: } michael@0: michael@0: if (fRawData->fScriptSets != 0) { michael@0: fScriptSets = (ScriptSet *)((char *)fRawData + fRawData->fScriptSets); michael@0: } michael@0: } michael@0: michael@0: michael@0: SpoofData::~SpoofData() { michael@0: utrie2_close(fAnyCaseTrie); michael@0: fAnyCaseTrie = NULL; michael@0: utrie2_close(fLowerCaseTrie); michael@0: fLowerCaseTrie = NULL; michael@0: if (fDataOwned) { michael@0: uprv_free(fRawData); michael@0: } michael@0: fRawData = NULL; michael@0: if (fUDM != NULL) { michael@0: udata_close(fUDM); michael@0: } michael@0: fUDM = NULL; michael@0: } michael@0: michael@0: michael@0: void SpoofData::removeReference() { michael@0: if (umtx_atomic_dec(&fRefCount) == 0) { michael@0: delete this; michael@0: } michael@0: } michael@0: michael@0: michael@0: SpoofData *SpoofData::addReference() { michael@0: umtx_atomic_inc(&fRefCount); michael@0: return this; michael@0: } michael@0: michael@0: michael@0: void *SpoofData::reserveSpace(int32_t numBytes, UErrorCode &status) { michael@0: if (U_FAILURE(status)) { michael@0: return NULL; michael@0: } michael@0: if (!fDataOwned) { michael@0: U_ASSERT(FALSE); michael@0: status = U_INTERNAL_PROGRAM_ERROR; michael@0: return NULL; michael@0: } michael@0: michael@0: numBytes = (numBytes + 15) & ~15; // Round up to a multiple of 16 michael@0: uint32_t returnOffset = fMemLimit; michael@0: fMemLimit += numBytes; michael@0: fRawData = static_cast(uprv_realloc(fRawData, fMemLimit)); michael@0: fRawData->fLength = fMemLimit; michael@0: uprv_memset((char *)fRawData + returnOffset, 0, numBytes); michael@0: initPtrs(status); michael@0: return (char *)fRawData + returnOffset; michael@0: } michael@0: michael@0: michael@0: U_NAMESPACE_END michael@0: michael@0: U_NAMESPACE_USE michael@0: michael@0: //----------------------------------------------------------------------------- michael@0: // michael@0: // uspoof_swap - byte swap and char encoding swap of spoof data michael@0: // michael@0: //----------------------------------------------------------------------------- michael@0: U_CAPI int32_t U_EXPORT2 michael@0: uspoof_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData, michael@0: UErrorCode *status) { michael@0: michael@0: if (status == NULL || U_FAILURE(*status)) { michael@0: return 0; michael@0: } michael@0: if(ds==NULL || inData==NULL || length<-1 || (length>0 && outData==NULL)) { michael@0: *status=U_ILLEGAL_ARGUMENT_ERROR; michael@0: return 0; michael@0: } michael@0: michael@0: // michael@0: // Check that the data header is for spoof data. michael@0: // (Header contents are defined in gencfu.cpp) michael@0: // michael@0: const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData+4); michael@0: if(!( pInfo->dataFormat[0]==0x43 && /* dataFormat="Cfu " */ michael@0: pInfo->dataFormat[1]==0x66 && michael@0: pInfo->dataFormat[2]==0x75 && michael@0: pInfo->dataFormat[3]==0x20 && michael@0: pInfo->formatVersion[0]==1 )) { michael@0: udata_printError(ds, "uspoof_swap(): data format %02x.%02x.%02x.%02x " michael@0: "(format version %02x %02x %02x %02x) is not recognized\n", michael@0: pInfo->dataFormat[0], pInfo->dataFormat[1], michael@0: pInfo->dataFormat[2], pInfo->dataFormat[3], michael@0: pInfo->formatVersion[0], pInfo->formatVersion[1], michael@0: pInfo->formatVersion[2], pInfo->formatVersion[3]); michael@0: *status=U_UNSUPPORTED_ERROR; michael@0: return 0; michael@0: } michael@0: michael@0: // michael@0: // Swap the data header. (This is the generic ICU Data Header, not the uspoof Specific michael@0: // header). This swap also conveniently gets us michael@0: // the size of the ICU d.h., which lets us locate the start michael@0: // of the uspoof specific data. michael@0: // michael@0: int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status); michael@0: michael@0: michael@0: // michael@0: // Get the Spoof Data Header, and check that it appears to be OK. michael@0: // michael@0: // michael@0: const uint8_t *inBytes =(const uint8_t *)inData+headerSize; michael@0: SpoofDataHeader *spoofDH = (SpoofDataHeader *)inBytes; michael@0: if (ds->readUInt32(spoofDH->fMagic) != USPOOF_MAGIC || michael@0: ds->readUInt32(spoofDH->fLength) < sizeof(SpoofDataHeader)) michael@0: { michael@0: udata_printError(ds, "uspoof_swap(): Spoof Data header is invalid.\n"); michael@0: *status=U_UNSUPPORTED_ERROR; michael@0: return 0; michael@0: } michael@0: michael@0: // michael@0: // Prefight operation? Just return the size michael@0: // michael@0: int32_t spoofDataLength = ds->readUInt32(spoofDH->fLength); michael@0: int32_t totalSize = headerSize + spoofDataLength; michael@0: if (length < 0) { michael@0: return totalSize; michael@0: } michael@0: michael@0: // michael@0: // Check that length passed in is consistent with length from Spoof data header. michael@0: // michael@0: if (length < totalSize) { michael@0: udata_printError(ds, "uspoof_swap(): too few bytes (%d after ICU Data header) for spoof data.\n", michael@0: spoofDataLength); michael@0: *status=U_INDEX_OUTOFBOUNDS_ERROR; michael@0: return 0; michael@0: } michael@0: michael@0: michael@0: // michael@0: // Swap the Data. Do the data itself first, then the Spoof Data Header, because michael@0: // we need to reference the header to locate the data, and an michael@0: // inplace swap of the header leaves it unusable. michael@0: // michael@0: uint8_t *outBytes = (uint8_t *)outData + headerSize; michael@0: SpoofDataHeader *outputDH = (SpoofDataHeader *)outBytes; michael@0: michael@0: int32_t sectionStart; michael@0: int32_t sectionLength; michael@0: michael@0: // michael@0: // If not swapping in place, zero out the output buffer before starting. michael@0: // Gaps may exist between the individual sections, and these must be zeroed in michael@0: // the output buffer. The simplest way to do that is to just zero the whole thing. michael@0: // michael@0: if (inBytes != outBytes) { michael@0: uprv_memset(outBytes, 0, spoofDataLength); michael@0: } michael@0: michael@0: // Confusables Keys Section (fCFUKeys) michael@0: sectionStart = ds->readUInt32(spoofDH->fCFUKeys); michael@0: sectionLength = ds->readUInt32(spoofDH->fCFUKeysSize) * 4; michael@0: ds->swapArray32(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status); michael@0: michael@0: // String Index Section michael@0: sectionStart = ds->readUInt32(spoofDH->fCFUStringIndex); michael@0: sectionLength = ds->readUInt32(spoofDH->fCFUStringIndexSize) * 2; michael@0: ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status); michael@0: michael@0: // String Table Section michael@0: sectionStart = ds->readUInt32(spoofDH->fCFUStringTable); michael@0: sectionLength = ds->readUInt32(spoofDH->fCFUStringTableLen) * 2; michael@0: ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status); michael@0: michael@0: // String Lengths Section michael@0: sectionStart = ds->readUInt32(spoofDH->fCFUStringLengths); michael@0: sectionLength = ds->readUInt32(spoofDH->fCFUStringLengthsSize) * 4; michael@0: ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status); michael@0: michael@0: // Any Case Trie michael@0: sectionStart = ds->readUInt32(spoofDH->fAnyCaseTrie); michael@0: sectionLength = ds->readUInt32(spoofDH->fAnyCaseTrieLength); michael@0: utrie2_swap(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status); michael@0: michael@0: // Lower Case Trie michael@0: sectionStart = ds->readUInt32(spoofDH->fLowerCaseTrie); michael@0: sectionLength = ds->readUInt32(spoofDH->fLowerCaseTrieLength); michael@0: utrie2_swap(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status); michael@0: michael@0: // Script Sets. The data is an array of int32_t michael@0: sectionStart = ds->readUInt32(spoofDH->fScriptSets); michael@0: sectionLength = ds->readUInt32(spoofDH->fScriptSetsLength) * sizeof(ScriptSet); michael@0: ds->swapArray32(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status); michael@0: michael@0: // And, last, swap the header itself. michael@0: // int32_t fMagic // swap this michael@0: // uint8_t fFormatVersion[4] // Do not swap this, just copy michael@0: // int32_t fLength and all the rest // Swap the rest, all is 32 bit stuff. michael@0: // michael@0: uint32_t magic = ds->readUInt32(spoofDH->fMagic); michael@0: ds->writeUInt32((uint32_t *)&outputDH->fMagic, magic); michael@0: michael@0: if (outputDH->fFormatVersion != spoofDH->fFormatVersion) { michael@0: uprv_memcpy(outputDH->fFormatVersion, spoofDH->fFormatVersion, sizeof(spoofDH->fFormatVersion)); michael@0: } michael@0: // swap starting at fLength michael@0: ds->swapArray32(ds, &spoofDH->fLength, sizeof(SpoofDataHeader)-8 /* minus magic and fFormatVersion[4] */, &outputDH->fLength, status); michael@0: michael@0: return totalSize; michael@0: } michael@0: michael@0: #endif michael@0: michael@0: