michael@0: /* michael@0: ****************************************************************************** michael@0: * michael@0: * Copyright (C) 2008-2013, International Business Machines michael@0: * Corporation and others. All Rights Reserved. michael@0: * michael@0: ****************************************************************************** michael@0: * file name: uspoof_conf.cpp michael@0: * encoding: US-ASCII michael@0: * tab size: 8 (not used) michael@0: * indentation:4 michael@0: * michael@0: * created on: 2009Jan05 (refactoring earlier files) michael@0: * created by: Andy Heninger michael@0: * michael@0: * Internal classes for compililing confusable data into its binary (runtime) form. michael@0: */ michael@0: michael@0: #include "unicode/utypes.h" michael@0: #include "unicode/uspoof.h" michael@0: #if !UCONFIG_NO_REGULAR_EXPRESSIONS michael@0: #if !UCONFIG_NO_NORMALIZATION michael@0: michael@0: #include "unicode/unorm.h" michael@0: #include "unicode/uregex.h" michael@0: #include "unicode/ustring.h" michael@0: #include "cmemory.h" michael@0: #include "uspoof_impl.h" michael@0: #include "uhash.h" michael@0: #include "uvector.h" michael@0: #include "uassert.h" michael@0: #include "uarrsort.h" michael@0: #include "uspoof_conf.h" michael@0: michael@0: U_NAMESPACE_USE michael@0: michael@0: michael@0: //--------------------------------------------------------------------- michael@0: // michael@0: // buildConfusableData Compile the source confusable data, as defined by michael@0: // the Unicode data file confusables.txt, into the binary michael@0: // structures used by the confusable detector. michael@0: // michael@0: // The binary structures are described in uspoof_impl.h michael@0: // michael@0: // 1. parse the data, building 4 hash tables, one each for the SL, SA, ML and MA michael@0: // tables. Each maps from a UChar32 to a String. michael@0: // michael@0: // 2. Sort all of the strings encountered by length, since they will need to michael@0: // be stored in that order in the final string table. michael@0: // michael@0: // 3. Build a list of keys (UChar32s) from the four mapping tables. Sort the michael@0: // list because that will be the ordering of our runtime table. michael@0: // michael@0: // 4. Generate the run time string table. This is generated before the key & value michael@0: // tables because we need the string indexes when building those tables. michael@0: // michael@0: // 5. Build the run-time key and value tables. These are parallel tables, and are built michael@0: // at the same time michael@0: // michael@0: michael@0: SPUString::SPUString(UnicodeString *s) { michael@0: fStr = s; michael@0: fStrTableIndex = 0; michael@0: } michael@0: michael@0: michael@0: SPUString::~SPUString() { michael@0: delete fStr; michael@0: } michael@0: michael@0: michael@0: SPUStringPool::SPUStringPool(UErrorCode &status) : fVec(NULL), fHash(NULL) { michael@0: fVec = new UVector(status); michael@0: fHash = uhash_open(uhash_hashUnicodeString, // key hash function michael@0: uhash_compareUnicodeString, // Key Comparator michael@0: NULL, // Value Comparator michael@0: &status); michael@0: } michael@0: michael@0: michael@0: SPUStringPool::~SPUStringPool() { michael@0: int i; michael@0: for (i=fVec->size()-1; i>=0; i--) { michael@0: SPUString *s = static_cast(fVec->elementAt(i)); michael@0: delete s; michael@0: } michael@0: delete fVec; michael@0: uhash_close(fHash); michael@0: } michael@0: michael@0: michael@0: int32_t SPUStringPool::size() { michael@0: return fVec->size(); michael@0: } michael@0: michael@0: SPUString *SPUStringPool::getByIndex(int32_t index) { michael@0: SPUString *retString = (SPUString *)fVec->elementAt(index); michael@0: return retString; michael@0: } michael@0: michael@0: michael@0: // Comparison function for ordering strings in the string pool. michael@0: // Compare by length first, then, within a group of the same length, michael@0: // by code point order. michael@0: // Conforms to the type signature for a USortComparator in uvector.h michael@0: michael@0: static int8_t U_CALLCONV SPUStringCompare(UHashTok left, UHashTok right) { michael@0: const SPUString *sL = const_cast( michael@0: static_cast(left.pointer)); michael@0: const SPUString *sR = const_cast( michael@0: static_cast(right.pointer)); michael@0: int32_t lenL = sL->fStr->length(); michael@0: int32_t lenR = sR->fStr->length(); michael@0: if (lenL < lenR) { michael@0: return -1; michael@0: } else if (lenL > lenR) { michael@0: return 1; michael@0: } else { michael@0: return sL->fStr->compare(*(sR->fStr)); michael@0: } michael@0: } michael@0: michael@0: void SPUStringPool::sort(UErrorCode &status) { michael@0: fVec->sort(SPUStringCompare, status); michael@0: } michael@0: michael@0: michael@0: SPUString *SPUStringPool::addString(UnicodeString *src, UErrorCode &status) { michael@0: SPUString *hashedString = static_cast(uhash_get(fHash, src)); michael@0: if (hashedString != NULL) { michael@0: delete src; michael@0: } else { michael@0: hashedString = new SPUString(src); michael@0: uhash_put(fHash, src, hashedString, &status); michael@0: fVec->addElement(hashedString, status); michael@0: } michael@0: return hashedString; michael@0: } michael@0: michael@0: michael@0: michael@0: ConfusabledataBuilder::ConfusabledataBuilder(SpoofImpl *spImpl, UErrorCode &status) : michael@0: fSpoofImpl(spImpl), michael@0: fInput(NULL), michael@0: fSLTable(NULL), michael@0: fSATable(NULL), michael@0: fMLTable(NULL), michael@0: fMATable(NULL), michael@0: fKeySet(NULL), michael@0: fKeyVec(NULL), michael@0: fValueVec(NULL), michael@0: fStringTable(NULL), michael@0: fStringLengthsTable(NULL), michael@0: stringPool(NULL), michael@0: fParseLine(NULL), michael@0: fParseHexNum(NULL), michael@0: fLineNum(0) michael@0: { michael@0: if (U_FAILURE(status)) { michael@0: return; michael@0: } michael@0: fSLTable = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &status); michael@0: fSATable = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &status); michael@0: fMLTable = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &status); michael@0: fMATable = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &status); michael@0: fKeySet = new UnicodeSet(); michael@0: fKeyVec = new UVector(status); michael@0: fValueVec = new UVector(status); michael@0: stringPool = new SPUStringPool(status); michael@0: } michael@0: michael@0: michael@0: ConfusabledataBuilder::~ConfusabledataBuilder() { michael@0: uprv_free(fInput); michael@0: uregex_close(fParseLine); michael@0: uregex_close(fParseHexNum); michael@0: uhash_close(fSLTable); michael@0: uhash_close(fSATable); michael@0: uhash_close(fMLTable); michael@0: uhash_close(fMATable); michael@0: delete fKeySet; michael@0: delete fKeyVec; michael@0: delete fStringTable; michael@0: delete fStringLengthsTable; michael@0: delete fValueVec; michael@0: delete stringPool; michael@0: } michael@0: michael@0: michael@0: void ConfusabledataBuilder::buildConfusableData(SpoofImpl * spImpl, const char * confusables, michael@0: int32_t confusablesLen, int32_t *errorType, UParseError *pe, UErrorCode &status) { michael@0: michael@0: if (U_FAILURE(status)) { michael@0: return; michael@0: } michael@0: ConfusabledataBuilder builder(spImpl, status); michael@0: builder.build(confusables, confusablesLen, status); michael@0: if (U_FAILURE(status) && errorType != NULL) { michael@0: *errorType = USPOOF_SINGLE_SCRIPT_CONFUSABLE; michael@0: pe->line = builder.fLineNum; michael@0: } michael@0: } michael@0: michael@0: michael@0: void ConfusabledataBuilder::build(const char * confusables, int32_t confusablesLen, michael@0: UErrorCode &status) { michael@0: michael@0: // Convert the user input data from UTF-8 to UChar (UTF-16) michael@0: int32_t inputLen = 0; michael@0: if (U_FAILURE(status)) { michael@0: return; michael@0: } michael@0: u_strFromUTF8(NULL, 0, &inputLen, confusables, confusablesLen, &status); michael@0: if (status != U_BUFFER_OVERFLOW_ERROR) { michael@0: return; michael@0: } michael@0: status = U_ZERO_ERROR; michael@0: fInput = static_cast(uprv_malloc((inputLen+1) * sizeof(UChar))); michael@0: if (fInput == NULL) { michael@0: status = U_MEMORY_ALLOCATION_ERROR; michael@0: return; michael@0: } michael@0: u_strFromUTF8(fInput, inputLen+1, NULL, confusables, confusablesLen, &status); michael@0: michael@0: michael@0: // Regular Expression to parse a line from Confusables.txt. The expression will match michael@0: // any line. What was matched is determined by examining which capture groups have a match. michael@0: // Capture Group 1: the source char michael@0: // Capture Group 2: the replacement chars michael@0: // Capture Group 3-6 the table type, SL, SA, ML, or MA michael@0: // Capture Group 7: A blank or comment only line. michael@0: // Capture Group 8: A syntactically invalid line. Anything that didn't match before. michael@0: // Example Line from the confusables.txt source file: michael@0: // "1D702 ; 006E 0329 ; SL # MATHEMATICAL ITALIC SMALL ETA ... " michael@0: UnicodeString pattern( michael@0: "(?m)^[ \\t]*([0-9A-Fa-f]+)[ \\t]+;" // Match the source char michael@0: "[ \\t]*([0-9A-Fa-f]+" // Match the replacement char(s) michael@0: "(?:[ \\t]+[0-9A-Fa-f]+)*)[ \\t]*;" // (continued) michael@0: "\\s*(?:(SL)|(SA)|(ML)|(MA))" // Match the table type michael@0: "[ \\t]*(?:#.*?)?$" // Match any trailing #comment michael@0: "|^([ \\t]*(?:#.*?)?)$" // OR match empty lines or lines with only a #comment michael@0: "|^(.*?)$", -1, US_INV); // OR match any line, which catches illegal lines. michael@0: // TODO: Why are we using the regex C API here? C++ would just take UnicodeString... michael@0: fParseLine = uregex_open(pattern.getBuffer(), pattern.length(), 0, NULL, &status); michael@0: michael@0: // Regular expression for parsing a hex number out of a space-separated list of them. michael@0: // Capture group 1 gets the number, with spaces removed. michael@0: pattern = UNICODE_STRING_SIMPLE("\\s*([0-9A-F]+)"); michael@0: fParseHexNum = uregex_open(pattern.getBuffer(), pattern.length(), 0, NULL, &status); michael@0: michael@0: // Zap any Byte Order Mark at the start of input. Changing it to a space is benign michael@0: // given the syntax of the input. michael@0: if (*fInput == 0xfeff) { michael@0: *fInput = 0x20; michael@0: } michael@0: michael@0: // Parse the input, one line per iteration of this loop. michael@0: uregex_setText(fParseLine, fInput, inputLen, &status); michael@0: while (uregex_findNext(fParseLine, &status)) { michael@0: fLineNum++; michael@0: if (uregex_start(fParseLine, 7, &status) >= 0) { michael@0: // this was a blank or comment line. michael@0: continue; michael@0: } michael@0: if (uregex_start(fParseLine, 8, &status) >= 0) { michael@0: // input file syntax error. michael@0: status = U_PARSE_ERROR; michael@0: return; michael@0: } michael@0: michael@0: // We have a good input line. Extract the key character and mapping string, and michael@0: // put them into the appropriate mapping table. michael@0: UChar32 keyChar = SpoofImpl::ScanHex(fInput, uregex_start(fParseLine, 1, &status), michael@0: uregex_end(fParseLine, 1, &status), status); michael@0: michael@0: int32_t mapStringStart = uregex_start(fParseLine, 2, &status); michael@0: int32_t mapStringLength = uregex_end(fParseLine, 2, &status) - mapStringStart; michael@0: uregex_setText(fParseHexNum, &fInput[mapStringStart], mapStringLength, &status); michael@0: michael@0: UnicodeString *mapString = new UnicodeString(); michael@0: if (mapString == NULL) { michael@0: status = U_MEMORY_ALLOCATION_ERROR; michael@0: return; michael@0: } michael@0: while (uregex_findNext(fParseHexNum, &status)) { michael@0: UChar32 c = SpoofImpl::ScanHex(&fInput[mapStringStart], uregex_start(fParseHexNum, 1, &status), michael@0: uregex_end(fParseHexNum, 1, &status), status); michael@0: mapString->append(c); michael@0: } michael@0: U_ASSERT(mapString->length() >= 1); michael@0: michael@0: // Put the map (value) string into the string pool michael@0: // This a little like a Java intern() - any duplicates will be eliminated. michael@0: SPUString *smapString = stringPool->addString(mapString, status); michael@0: michael@0: // Add the UChar32 -> string mapping to the appropriate table. michael@0: UHashtable *table = uregex_start(fParseLine, 3, &status) >= 0 ? fSLTable : michael@0: uregex_start(fParseLine, 4, &status) >= 0 ? fSATable : michael@0: uregex_start(fParseLine, 5, &status) >= 0 ? fMLTable : michael@0: uregex_start(fParseLine, 6, &status) >= 0 ? fMATable : michael@0: NULL; michael@0: U_ASSERT(table != NULL); michael@0: uhash_iput(table, keyChar, smapString, &status); michael@0: fKeySet->add(keyChar); michael@0: if (U_FAILURE(status)) { michael@0: return; michael@0: } michael@0: } michael@0: michael@0: // Input data is now all parsed and collected. michael@0: // Now create the run-time binary form of the data. michael@0: // michael@0: // This is done in two steps. First the data is assembled into vectors and strings, michael@0: // for ease of construction, then the contents of these collections are dumped michael@0: // into the actual raw-bytes data storage. michael@0: michael@0: // Build up the string array, and record the index of each string therein michael@0: // in the (build time only) string pool. michael@0: // Strings of length one are not entered into the strings array. michael@0: // At the same time, build up the string lengths table, which records the michael@0: // position in the string table of the first string of each length >= 4. michael@0: // (Strings in the table are sorted by length) michael@0: stringPool->sort(status); michael@0: fStringTable = new UnicodeString(); michael@0: fStringLengthsTable = new UVector(status); michael@0: int32_t previousStringLength = 0; michael@0: int32_t previousStringIndex = 0; michael@0: int32_t poolSize = stringPool->size(); michael@0: int32_t i; michael@0: for (i=0; igetByIndex(i); michael@0: int32_t strLen = s->fStr->length(); michael@0: int32_t strIndex = fStringTable->length(); michael@0: U_ASSERT(strLen >= previousStringLength); michael@0: if (strLen == 1) { michael@0: // strings of length one do not get an entry in the string table. michael@0: // Keep the single string character itself here, which is the same michael@0: // convention that is used in the final run-time string table index. michael@0: s->fStrTableIndex = s->fStr->charAt(0); michael@0: } else { michael@0: if ((strLen > previousStringLength) && (previousStringLength >= 4)) { michael@0: fStringLengthsTable->addElement(previousStringIndex, status); michael@0: fStringLengthsTable->addElement(previousStringLength, status); michael@0: } michael@0: s->fStrTableIndex = strIndex; michael@0: fStringTable->append(*(s->fStr)); michael@0: } michael@0: previousStringLength = strLen; michael@0: previousStringIndex = strIndex; michael@0: } michael@0: // Make the final entry to the string lengths table. michael@0: // (it holds an entry for the _last_ string of each length, so adding the michael@0: // final one doesn't happen in the main loop because no longer string was encountered.) michael@0: if (previousStringLength >= 4) { michael@0: fStringLengthsTable->addElement(previousStringIndex, status); michael@0: fStringLengthsTable->addElement(previousStringLength, status); michael@0: } michael@0: michael@0: // Construct the compile-time Key and Value tables michael@0: // michael@0: // For each key code point, check which mapping tables it applies to, michael@0: // and create the final data for the key & value structures. michael@0: // michael@0: // The four logical mapping tables are conflated into one combined table. michael@0: // If multiple logical tables have the same mapping for some key, they michael@0: // share a single entry in the combined table. michael@0: // If more than one mapping exists for the same key code point, multiple michael@0: // entries will be created in the table michael@0: michael@0: for (int32_t range=0; rangegetRangeCount(); range++) { michael@0: // It is an oddity of the UnicodeSet API that simply enumerating the contained michael@0: // code points requires a nested loop. michael@0: for (UChar32 keyChar=fKeySet->getRangeStart(range); michael@0: keyChar <= fKeySet->getRangeEnd(range); keyChar++) { michael@0: addKeyEntry(keyChar, fSLTable, USPOOF_SL_TABLE_FLAG, status); michael@0: addKeyEntry(keyChar, fSATable, USPOOF_SA_TABLE_FLAG, status); michael@0: addKeyEntry(keyChar, fMLTable, USPOOF_ML_TABLE_FLAG, status); michael@0: addKeyEntry(keyChar, fMATable, USPOOF_MA_TABLE_FLAG, status); michael@0: } michael@0: } michael@0: michael@0: // Put the assembled data into the flat runtime array michael@0: outputData(status); michael@0: michael@0: // All of the intermediate allocated data belongs to the ConfusabledataBuilder michael@0: // object (this), and is deleted in the destructor. michael@0: return; michael@0: } michael@0: michael@0: // michael@0: // outputData The confusable data has been compiled and stored in intermediate michael@0: // collections and strings. Copy it from there to the final flat michael@0: // binary array. michael@0: // michael@0: // Note that as each section is added to the output data, the michael@0: // expand (reserveSpace() function will likely relocate it in memory. michael@0: // Be careful with pointers. michael@0: // michael@0: void ConfusabledataBuilder::outputData(UErrorCode &status) { michael@0: michael@0: U_ASSERT(fSpoofImpl->fSpoofData->fDataOwned == TRUE); michael@0: michael@0: // The Key Table michael@0: // While copying the keys to the runtime array, michael@0: // also sanity check that they are sorted. michael@0: michael@0: int32_t numKeys = fKeyVec->size(); michael@0: int32_t *keys = michael@0: static_cast(fSpoofImpl->fSpoofData->reserveSpace(numKeys*sizeof(int32_t), status)); michael@0: if (U_FAILURE(status)) { michael@0: return; michael@0: } michael@0: int i; michael@0: int32_t previousKey = 0; michael@0: for (i=0; ielementAti(i); michael@0: (void)previousKey; // Suppress unused variable warning on gcc. michael@0: U_ASSERT((key & 0x00ffffff) >= (previousKey & 0x00ffffff)); michael@0: U_ASSERT((key & 0xff000000) != 0); michael@0: keys[i] = key; michael@0: previousKey = key; michael@0: } michael@0: SpoofDataHeader *rawData = fSpoofImpl->fSpoofData->fRawData; michael@0: rawData->fCFUKeys = (int32_t)((char *)keys - (char *)rawData); michael@0: rawData->fCFUKeysSize = numKeys; michael@0: fSpoofImpl->fSpoofData->fCFUKeys = keys; michael@0: michael@0: michael@0: // The Value Table, parallels the key table michael@0: int32_t numValues = fValueVec->size(); michael@0: U_ASSERT(numKeys == numValues); michael@0: uint16_t *values = michael@0: static_cast(fSpoofImpl->fSpoofData->reserveSpace(numKeys*sizeof(uint16_t), status)); michael@0: if (U_FAILURE(status)) { michael@0: return; michael@0: } michael@0: for (i=0; i(fValueVec->elementAti(i)); michael@0: U_ASSERT(value < 0xffff); michael@0: values[i] = static_cast(value); michael@0: } michael@0: rawData = fSpoofImpl->fSpoofData->fRawData; michael@0: rawData->fCFUStringIndex = (int32_t)((char *)values - (char *)rawData); michael@0: rawData->fCFUStringIndexSize = numValues; michael@0: fSpoofImpl->fSpoofData->fCFUValues = values; michael@0: michael@0: // The Strings Table. michael@0: michael@0: uint32_t stringsLength = fStringTable->length(); michael@0: // Reserve an extra space so the string will be nul-terminated. This is michael@0: // only a convenience, for when debugging; it is not needed otherwise. michael@0: UChar *strings = michael@0: static_cast(fSpoofImpl->fSpoofData->reserveSpace(stringsLength*sizeof(UChar)+2, status)); michael@0: if (U_FAILURE(status)) { michael@0: return; michael@0: } michael@0: fStringTable->extract(strings, stringsLength+1, status); michael@0: rawData = fSpoofImpl->fSpoofData->fRawData; michael@0: U_ASSERT(rawData->fCFUStringTable == 0); michael@0: rawData->fCFUStringTable = (int32_t)((char *)strings - (char *)rawData); michael@0: rawData->fCFUStringTableLen = stringsLength; michael@0: fSpoofImpl->fSpoofData->fCFUStrings = strings; michael@0: michael@0: // The String Lengths Table michael@0: // While copying into the runtime array do some sanity checks on the values michael@0: // Each complete entry contains two fields, an index and an offset. michael@0: // Lengths should increase with each entry. michael@0: // Offsets should be less than the size of the string table. michael@0: int32_t lengthTableLength = fStringLengthsTable->size(); michael@0: uint16_t *stringLengths = michael@0: static_cast(fSpoofImpl->fSpoofData->reserveSpace(lengthTableLength*sizeof(uint16_t), status)); michael@0: if (U_FAILURE(status)) { michael@0: return; michael@0: } michael@0: int32_t destIndex = 0; michael@0: uint32_t previousLength = 0; michael@0: for (i=0; i(fStringLengthsTable->elementAti(i)); michael@0: uint32_t length = static_cast(fStringLengthsTable->elementAti(i+1)); michael@0: U_ASSERT(offset < stringsLength); michael@0: U_ASSERT(length < 40); michael@0: (void)previousLength; // Suppress unused variable warning on gcc. michael@0: U_ASSERT(length > previousLength); michael@0: stringLengths[destIndex++] = static_cast(offset); michael@0: stringLengths[destIndex++] = static_cast(length); michael@0: previousLength = length; michael@0: } michael@0: rawData = fSpoofImpl->fSpoofData->fRawData; michael@0: rawData->fCFUStringLengths = (int32_t)((char *)stringLengths - (char *)rawData); michael@0: // Note: StringLengthsSize in the raw data is the number of complete entries, michael@0: // each consisting of a pair of 16 bit values, hence the divide by 2. michael@0: rawData->fCFUStringLengthsSize = lengthTableLength / 2; michael@0: fSpoofImpl->fSpoofData->fCFUStringLengths = michael@0: reinterpret_cast(stringLengths); michael@0: } michael@0: michael@0: michael@0: michael@0: // addKeyEntry Construction of the confusable Key and Mapping Values tables. michael@0: // This is an intermediate point in the building process. michael@0: // We already have the mappings in the hash tables fSLTable, etc. michael@0: // This function builds corresponding run-time style table entries into michael@0: // fKeyVec and fValueVec michael@0: michael@0: void ConfusabledataBuilder::addKeyEntry( michael@0: UChar32 keyChar, // The key character michael@0: UHashtable *table, // The table, one of SATable, MATable, etc. michael@0: int32_t tableFlag, // One of USPOOF_SA_TABLE_FLAG, etc. michael@0: UErrorCode &status) { michael@0: michael@0: SPUString *targetMapping = static_cast(uhash_iget(table, keyChar)); michael@0: if (targetMapping == NULL) { michael@0: // No mapping for this key character. michael@0: // (This function is called for all four tables for each key char that michael@0: // is seen anywhere, so this no entry cases are very much expected.) michael@0: return; michael@0: } michael@0: michael@0: // Check whether there is already an entry with the correct mapping. michael@0: // If so, simply set the flag in the keyTable saying that the existing entry michael@0: // applies to the table that we're doing now. michael@0: michael@0: UBool keyHasMultipleValues = FALSE; michael@0: int32_t i; michael@0: for (i=fKeyVec->size()-1; i>=0 ; i--) { michael@0: int32_t key = fKeyVec->elementAti(i); michael@0: if ((key & 0x0ffffff) != keyChar) { michael@0: // We have now checked all existing key entries for this key char (if any) michael@0: // without finding one with the same mapping. michael@0: break; michael@0: } michael@0: UnicodeString mapping = getMapping(i); michael@0: if (mapping == *(targetMapping->fStr)) { michael@0: // The run time entry we are currently testing has the correct mapping. michael@0: // Set the flag in it indicating that it applies to the new table also. michael@0: key |= tableFlag; michael@0: fKeyVec->setElementAt(key, i); michael@0: return; michael@0: } michael@0: keyHasMultipleValues = TRUE; michael@0: } michael@0: michael@0: // Need to add a new entry to the binary data being built for this mapping. michael@0: // Includes adding entries to both the key table and the parallel values table. michael@0: michael@0: int32_t newKey = keyChar | tableFlag; michael@0: if (keyHasMultipleValues) { michael@0: newKey |= USPOOF_KEY_MULTIPLE_VALUES; michael@0: } michael@0: int32_t adjustedMappingLength = targetMapping->fStr->length() - 1; michael@0: if (adjustedMappingLength>3) { michael@0: adjustedMappingLength = 3; michael@0: } michael@0: newKey |= adjustedMappingLength << USPOOF_KEY_LENGTH_SHIFT; michael@0: michael@0: int32_t newData = targetMapping->fStrTableIndex; michael@0: michael@0: fKeyVec->addElement(newKey, status); michael@0: fValueVec->addElement(newData, status); michael@0: michael@0: // If the preceding key entry is for the same key character (but with a different mapping) michael@0: // set the multiple-values flag on it. michael@0: if (keyHasMultipleValues) { michael@0: int32_t previousKeyIndex = fKeyVec->size() - 2; michael@0: int32_t previousKey = fKeyVec->elementAti(previousKeyIndex); michael@0: previousKey |= USPOOF_KEY_MULTIPLE_VALUES; michael@0: fKeyVec->setElementAt(previousKey, previousKeyIndex); michael@0: } michael@0: } michael@0: michael@0: michael@0: michael@0: UnicodeString ConfusabledataBuilder::getMapping(int32_t index) { michael@0: int32_t key = fKeyVec->elementAti(index); michael@0: int32_t value = fValueVec->elementAti(index); michael@0: int32_t length = USPOOF_KEY_LENGTH_FIELD(key); michael@0: int32_t lastIndexWithLen; michael@0: switch (length) { michael@0: case 0: michael@0: return UnicodeString(static_cast(value)); michael@0: case 1: michael@0: case 2: michael@0: return UnicodeString(*fStringTable, value, length+1); michael@0: case 3: michael@0: length = 0; michael@0: int32_t i; michael@0: for (i=0; isize(); i+=2) { michael@0: lastIndexWithLen = fStringLengthsTable->elementAti(i); michael@0: if (value <= lastIndexWithLen) { michael@0: length = fStringLengthsTable->elementAti(i+1); michael@0: break; michael@0: } michael@0: } michael@0: U_ASSERT(length>=3); michael@0: return UnicodeString(*fStringTable, value, length); michael@0: default: michael@0: U_ASSERT(FALSE); michael@0: } michael@0: return UnicodeString(); michael@0: } michael@0: michael@0: #endif michael@0: #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS michael@0: