1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/i18n/uspoof_conf.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,605 @@ 1.4 +/* 1.5 +****************************************************************************** 1.6 +* 1.7 +* Copyright (C) 2008-2013, International Business Machines 1.8 +* Corporation and others. All Rights Reserved. 1.9 +* 1.10 +****************************************************************************** 1.11 +* file name: uspoof_conf.cpp 1.12 +* encoding: US-ASCII 1.13 +* tab size: 8 (not used) 1.14 +* indentation:4 1.15 +* 1.16 +* created on: 2009Jan05 (refactoring earlier files) 1.17 +* created by: Andy Heninger 1.18 +* 1.19 +* Internal classes for compiling confusable data into its binary (runtime) form. 1.20 +*/ 1.21 + 1.22 +#include "unicode/utypes.h" 1.23 +#include "unicode/uspoof.h" 1.24 +#if !UCONFIG_NO_REGULAR_EXPRESSIONS 1.25 +#if !UCONFIG_NO_NORMALIZATION 1.26 + 1.27 +#include "unicode/unorm.h" 1.28 +#include "unicode/uregex.h" 1.29 +#include "unicode/ustring.h" 1.30 +#include "cmemory.h" 1.31 +#include "uspoof_impl.h" 1.32 +#include "uhash.h" 1.33 +#include "uvector.h" 1.34 +#include "uassert.h" 1.35 +#include "uarrsort.h" 1.36 +#include "uspoof_conf.h" 1.37 + 1.38 +U_NAMESPACE_USE 1.39 + 1.40 + 1.41 +//--------------------------------------------------------------------- 1.42 +// 1.43 +// buildConfusableData Compile the source confusable data, as defined by 1.44 +// the Unicode data file confusables.txt, into the binary 1.45 +// structures used by the confusable detector. 1.46 +// 1.47 +// The binary structures are described in uspoof_impl.h 1.48 +// 1.49 +// 1. parse the data, building 4 hash tables, one each for the SL, SA, ML and MA 1.50 +// tables. Each maps from a UChar32 to a String. 1.51 +// 1.52 +// 2. Sort all of the strings encountered by length, since they will need to 1.53 +// be stored in that order in the final string table. 1.54 +// 3. 
Build a list of keys (UChar32s) from the four mapping tables. Sort the 1.56 +// list because that will be the ordering of our runtime table. 1.57 +// 1.58 +// 4. Generate the run time string table. This is generated before the key & value 1.59 +// tables because we need the string indexes when building those tables. 1.60 +// 1.61 +// 5. Build the run-time key and value tables. These are parallel tables, and are built 1.62 +// at the same time 1.63 +// 1.64 + 1.65 +SPUString::SPUString(UnicodeString *s) { 1.66 + fStr = s; 1.67 + fStrTableIndex = 0; 1.68 +} 1.69 + 1.70 + 1.71 +SPUString::~SPUString() { 1.72 + delete fStr; 1.73 +} 1.74 + 1.75 + 1.76 +SPUStringPool::SPUStringPool(UErrorCode &status) : fVec(NULL), fHash(NULL) { 1.77 + fVec = new UVector(status); 1.78 + fHash = uhash_open(uhash_hashUnicodeString, // key hash function 1.79 + uhash_compareUnicodeString, // Key Comparator 1.80 + NULL, // Value Comparator 1.81 + &status); 1.82 +} 1.83 + 1.84 + 1.85 +SPUStringPool::~SPUStringPool() { 1.86 + int i; 1.87 + for (i=fVec->size()-1; i>=0; i--) { 1.88 + SPUString *s = static_cast<SPUString *>(fVec->elementAt(i)); 1.89 + delete s; 1.90 + } 1.91 + delete fVec; 1.92 + uhash_close(fHash); 1.93 +} 1.94 + 1.95 + 1.96 +int32_t SPUStringPool::size() { 1.97 + return fVec->size(); 1.98 +} 1.99 + 1.100 +SPUString *SPUStringPool::getByIndex(int32_t index) { 1.101 + SPUString *retString = (SPUString *)fVec->elementAt(index); 1.102 + return retString; 1.103 +} 1.104 + 1.105 + 1.106 +// Comparison function for ordering strings in the string pool. 1.107 +// Compare by length first, then, within a group of the same length, 1.108 +// by code point order. 
1.109 +// Conforms to the type signature for a USortComparator in uvector.h 1.110 + 1.111 +static int8_t U_CALLCONV SPUStringCompare(UHashTok left, UHashTok right) { 1.112 + const SPUString *sL = const_cast<const SPUString *>( 1.113 + static_cast<SPUString *>(left.pointer)); 1.114 + const SPUString *sR = const_cast<const SPUString *>( 1.115 + static_cast<SPUString *>(right.pointer)); 1.116 + int32_t lenL = sL->fStr->length(); 1.117 + int32_t lenR = sR->fStr->length(); 1.118 + if (lenL < lenR) { 1.119 + return -1; 1.120 + } else if (lenL > lenR) { 1.121 + return 1; 1.122 + } else { 1.123 + return sL->fStr->compare(*(sR->fStr)); 1.124 + } 1.125 +} 1.126 + 1.127 +void SPUStringPool::sort(UErrorCode &status) { 1.128 + fVec->sort(SPUStringCompare, status); 1.129 +} 1.130 + 1.131 + 1.132 +SPUString *SPUStringPool::addString(UnicodeString *src, UErrorCode &status) { 1.133 + SPUString *hashedString = static_cast<SPUString *>(uhash_get(fHash, src)); 1.134 + if (hashedString != NULL) { 1.135 + delete src; 1.136 + } else { 1.137 + hashedString = new SPUString(src); 1.138 + uhash_put(fHash, src, hashedString, &status); 1.139 + fVec->addElement(hashedString, status); 1.140 + } 1.141 + return hashedString; 1.142 +} 1.143 + 1.144 + 1.145 + 1.146 +ConfusabledataBuilder::ConfusabledataBuilder(SpoofImpl *spImpl, UErrorCode &status) : 1.147 + fSpoofImpl(spImpl), 1.148 + fInput(NULL), 1.149 + fSLTable(NULL), 1.150 + fSATable(NULL), 1.151 + fMLTable(NULL), 1.152 + fMATable(NULL), 1.153 + fKeySet(NULL), 1.154 + fKeyVec(NULL), 1.155 + fValueVec(NULL), 1.156 + fStringTable(NULL), 1.157 + fStringLengthsTable(NULL), 1.158 + stringPool(NULL), 1.159 + fParseLine(NULL), 1.160 + fParseHexNum(NULL), 1.161 + fLineNum(0) 1.162 +{ 1.163 + if (U_FAILURE(status)) { 1.164 + return; 1.165 + } 1.166 + fSLTable = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &status); 1.167 + fSATable = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &status); 1.168 + fMLTable = uhash_open(uhash_hashLong, 
uhash_compareLong, NULL, &status); 1.169 + fMATable = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &status); 1.170 + fKeySet = new UnicodeSet(); 1.171 + fKeyVec = new UVector(status); 1.172 + fValueVec = new UVector(status); 1.173 + stringPool = new SPUStringPool(status); 1.174 +} 1.175 + 1.176 + 1.177 +ConfusabledataBuilder::~ConfusabledataBuilder() { 1.178 + uprv_free(fInput); 1.179 + uregex_close(fParseLine); 1.180 + uregex_close(fParseHexNum); 1.181 + uhash_close(fSLTable); 1.182 + uhash_close(fSATable); 1.183 + uhash_close(fMLTable); 1.184 + uhash_close(fMATable); 1.185 + delete fKeySet; 1.186 + delete fKeyVec; 1.187 + delete fStringTable; 1.188 + delete fStringLengthsTable; 1.189 + delete fValueVec; 1.190 + delete stringPool; 1.191 +} 1.192 + 1.193 + 1.194 +void ConfusabledataBuilder::buildConfusableData(SpoofImpl * spImpl, const char * confusables, 1.195 + int32_t confusablesLen, int32_t *errorType, UParseError *pe, UErrorCode &status) { 1.196 + 1.197 + if (U_FAILURE(status)) { 1.198 + return; 1.199 + } 1.200 + ConfusabledataBuilder builder(spImpl, status); 1.201 + builder.build(confusables, confusablesLen, status); 1.202 + if (U_FAILURE(status) && errorType != NULL) { 1.203 + *errorType = USPOOF_SINGLE_SCRIPT_CONFUSABLE; 1.204 + pe->line = builder.fLineNum; 1.205 + } 1.206 +} 1.207 + 1.208 + 1.209 +void ConfusabledataBuilder::build(const char * confusables, int32_t confusablesLen, 1.210 + UErrorCode &status) { 1.211 + 1.212 + // Convert the user input data from UTF-8 to UChar (UTF-16) 1.213 + int32_t inputLen = 0; 1.214 + if (U_FAILURE(status)) { 1.215 + return; 1.216 + } 1.217 + u_strFromUTF8(NULL, 0, &inputLen, confusables, confusablesLen, &status); 1.218 + if (status != U_BUFFER_OVERFLOW_ERROR) { 1.219 + return; 1.220 + } 1.221 + status = U_ZERO_ERROR; 1.222 + fInput = static_cast<UChar *>(uprv_malloc((inputLen+1) * sizeof(UChar))); 1.223 + if (fInput == NULL) { 1.224 + status = U_MEMORY_ALLOCATION_ERROR; 1.225 + return; 1.226 + } 1.227 + 
u_strFromUTF8(fInput, inputLen+1, NULL, confusables, confusablesLen, &status); 1.228 + 1.229 + 1.230 + // Regular Expression to parse a line from Confusables.txt. The expression will match 1.231 + // any line. What was matched is determined by examining which capture groups have a match. 1.232 + // Capture Group 1: the source char 1.233 + // Capture Group 2: the replacement chars 1.234 + // Capture Group 3-6 the table type, SL, SA, ML, or MA 1.235 + // Capture Group 7: A blank or comment only line. 1.236 + // Capture Group 8: A syntactically invalid line. Anything that didn't match before. 1.237 + // Example Line from the confusables.txt source file: 1.238 + // "1D702 ; 006E 0329 ; SL # MATHEMATICAL ITALIC SMALL ETA ... " 1.239 + UnicodeString pattern( 1.240 + "(?m)^[ \\t]*([0-9A-Fa-f]+)[ \\t]+;" // Match the source char 1.241 + "[ \\t]*([0-9A-Fa-f]+" // Match the replacement char(s) 1.242 + "(?:[ \\t]+[0-9A-Fa-f]+)*)[ \\t]*;" // (continued) 1.243 + "\\s*(?:(SL)|(SA)|(ML)|(MA))" // Match the table type 1.244 + "[ \\t]*(?:#.*?)?$" // Match any trailing #comment 1.245 + "|^([ \\t]*(?:#.*?)?)$" // OR match empty lines or lines with only a #comment 1.246 + "|^(.*?)$", -1, US_INV); // OR match any line, which catches illegal lines. 1.247 + // TODO: Why are we using the regex C API here? C++ would just take UnicodeString... 1.248 + fParseLine = uregex_open(pattern.getBuffer(), pattern.length(), 0, NULL, &status); 1.249 + 1.250 + // Regular expression for parsing a hex number out of a space-separated list of them. 1.251 + // Capture group 1 gets the number, with spaces removed. 1.252 + pattern = UNICODE_STRING_SIMPLE("\\s*([0-9A-F]+)"); 1.253 + fParseHexNum = uregex_open(pattern.getBuffer(), pattern.length(), 0, NULL, &status); 1.254 + 1.255 + // Zap any Byte Order Mark at the start of input. Changing it to a space is benign 1.256 + // given the syntax of the input. 
1.257 + if (*fInput == 0xfeff) { 1.258 + *fInput = 0x20; 1.259 + } 1.260 + 1.261 + // Parse the input, one line per iteration of this loop. 1.262 + uregex_setText(fParseLine, fInput, inputLen, &status); 1.263 + while (uregex_findNext(fParseLine, &status)) { 1.264 + fLineNum++; 1.265 + if (uregex_start(fParseLine, 7, &status) >= 0) { 1.266 + // this was a blank or comment line. 1.267 + continue; 1.268 + } 1.269 + if (uregex_start(fParseLine, 8, &status) >= 0) { 1.270 + // input file syntax error. 1.271 + status = U_PARSE_ERROR; 1.272 + return; 1.273 + } 1.274 + 1.275 + // We have a good input line. Extract the key character and mapping string, and 1.276 + // put them into the appropriate mapping table. 1.277 + UChar32 keyChar = SpoofImpl::ScanHex(fInput, uregex_start(fParseLine, 1, &status), 1.278 + uregex_end(fParseLine, 1, &status), status); 1.279 + 1.280 + int32_t mapStringStart = uregex_start(fParseLine, 2, &status); 1.281 + int32_t mapStringLength = uregex_end(fParseLine, 2, &status) - mapStringStart; 1.282 + uregex_setText(fParseHexNum, &fInput[mapStringStart], mapStringLength, &status); 1.283 + 1.284 + UnicodeString *mapString = new UnicodeString(); 1.285 + if (mapString == NULL) { 1.286 + status = U_MEMORY_ALLOCATION_ERROR; 1.287 + return; 1.288 + } 1.289 + while (uregex_findNext(fParseHexNum, &status)) { 1.290 + UChar32 c = SpoofImpl::ScanHex(&fInput[mapStringStart], uregex_start(fParseHexNum, 1, &status), 1.291 + uregex_end(fParseHexNum, 1, &status), status); 1.292 + mapString->append(c); 1.293 + } 1.294 + U_ASSERT(mapString->length() >= 1); 1.295 + 1.296 + // Put the map (value) string into the string pool 1.297 + // This a little like a Java intern() - any duplicates will be eliminated. 1.298 + SPUString *smapString = stringPool->addString(mapString, status); 1.299 + 1.300 + // Add the UChar32 -> string mapping to the appropriate table. 1.301 + UHashtable *table = uregex_start(fParseLine, 3, &status) >= 0 ? 
fSLTable : 1.302 + uregex_start(fParseLine, 4, &status) >= 0 ? fSATable : 1.303 + uregex_start(fParseLine, 5, &status) >= 0 ? fMLTable : 1.304 + uregex_start(fParseLine, 6, &status) >= 0 ? fMATable : 1.305 + NULL; 1.306 + U_ASSERT(table != NULL); 1.307 + uhash_iput(table, keyChar, smapString, &status); 1.308 + fKeySet->add(keyChar); 1.309 + if (U_FAILURE(status)) { 1.310 + return; 1.311 + } 1.312 + } 1.313 + 1.314 + // Input data is now all parsed and collected. 1.315 + // Now create the run-time binary form of the data. 1.316 + // 1.317 + // This is done in two steps. First the data is assembled into vectors and strings, 1.318 + // for ease of construction, then the contents of these collections are dumped 1.319 + // into the actual raw-bytes data storage. 1.320 + 1.321 + // Build up the string array, and record the index of each string therein 1.322 + // in the (build time only) string pool. 1.323 + // Strings of length one are not entered into the strings array. 1.324 + // At the same time, build up the string lengths table, which records the 1.325 + // position in the string table of the first string of each length >= 4. 1.326 + // (Strings in the table are sorted by length) 1.327 + stringPool->sort(status); 1.328 + fStringTable = new UnicodeString(); 1.329 + fStringLengthsTable = new UVector(status); 1.330 + int32_t previousStringLength = 0; 1.331 + int32_t previousStringIndex = 0; 1.332 + int32_t poolSize = stringPool->size(); 1.333 + int32_t i; 1.334 + for (i=0; i<poolSize; i++) { 1.335 + SPUString *s = stringPool->getByIndex(i); 1.336 + int32_t strLen = s->fStr->length(); 1.337 + int32_t strIndex = fStringTable->length(); 1.338 + U_ASSERT(strLen >= previousStringLength); 1.339 + if (strLen == 1) { 1.340 + // strings of length one do not get an entry in the string table. 1.341 + // Keep the single string character itself here, which is the same 1.342 + // convention that is used in the final run-time string table index. 
1.343 + s->fStrTableIndex = s->fStr->charAt(0); 1.344 + } else { 1.345 + if ((strLen > previousStringLength) && (previousStringLength >= 4)) { 1.346 + fStringLengthsTable->addElement(previousStringIndex, status); 1.347 + fStringLengthsTable->addElement(previousStringLength, status); 1.348 + } 1.349 + s->fStrTableIndex = strIndex; 1.350 + fStringTable->append(*(s->fStr)); 1.351 + } 1.352 + previousStringLength = strLen; 1.353 + previousStringIndex = strIndex; 1.354 + } 1.355 + // Make the final entry to the string lengths table. 1.356 + // (it holds an entry for the _last_ string of each length, so adding the 1.357 + // final one doesn't happen in the main loop because no longer string was encountered.) 1.358 + if (previousStringLength >= 4) { 1.359 + fStringLengthsTable->addElement(previousStringIndex, status); 1.360 + fStringLengthsTable->addElement(previousStringLength, status); 1.361 + } 1.362 + 1.363 + // Construct the compile-time Key and Value tables 1.364 + // 1.365 + // For each key code point, check which mapping tables it applies to, 1.366 + // and create the final data for the key & value structures. 1.367 + // 1.368 + // The four logical mapping tables are conflated into one combined table. 1.369 + // If multiple logical tables have the same mapping for some key, they 1.370 + // share a single entry in the combined table. 1.371 + // If more than one mapping exists for the same key code point, multiple 1.372 + // entries will be created in the table 1.373 + 1.374 + for (int32_t range=0; range<fKeySet->getRangeCount(); range++) { 1.375 + // It is an oddity of the UnicodeSet API that simply enumerating the contained 1.376 + // code points requires a nested loop. 
1.377 + for (UChar32 keyChar=fKeySet->getRangeStart(range); 1.378 + keyChar <= fKeySet->getRangeEnd(range); keyChar++) { 1.379 + addKeyEntry(keyChar, fSLTable, USPOOF_SL_TABLE_FLAG, status); 1.380 + addKeyEntry(keyChar, fSATable, USPOOF_SA_TABLE_FLAG, status); 1.381 + addKeyEntry(keyChar, fMLTable, USPOOF_ML_TABLE_FLAG, status); 1.382 + addKeyEntry(keyChar, fMATable, USPOOF_MA_TABLE_FLAG, status); 1.383 + } 1.384 + } 1.385 + 1.386 + // Put the assembled data into the flat runtime array 1.387 + outputData(status); 1.388 + 1.389 + // All of the intermediate allocated data belongs to the ConfusabledataBuilder 1.390 + // object (this), and is deleted in the destructor. 1.391 + return; 1.392 +} 1.393 + 1.394 +// 1.395 +// outputData The confusable data has been compiled and stored in intermediate 1.396 +// collections and strings. Copy it from there to the final flat 1.397 +// binary array. 1.398 +// 1.399 +// Note that as each section is added to the output data, the 1.400 +// expand (reserveSpace() function will likely relocate it in memory. 1.401 +// Be careful with pointers. 1.402 +// 1.403 +void ConfusabledataBuilder::outputData(UErrorCode &status) { 1.404 + 1.405 + U_ASSERT(fSpoofImpl->fSpoofData->fDataOwned == TRUE); 1.406 + 1.407 + // The Key Table 1.408 + // While copying the keys to the runtime array, 1.409 + // also sanity check that they are sorted. 1.410 + 1.411 + int32_t numKeys = fKeyVec->size(); 1.412 + int32_t *keys = 1.413 + static_cast<int32_t *>(fSpoofImpl->fSpoofData->reserveSpace(numKeys*sizeof(int32_t), status)); 1.414 + if (U_FAILURE(status)) { 1.415 + return; 1.416 + } 1.417 + int i; 1.418 + int32_t previousKey = 0; 1.419 + for (i=0; i<numKeys; i++) { 1.420 + int32_t key = fKeyVec->elementAti(i); 1.421 + (void)previousKey; // Suppress unused variable warning on gcc. 
1.422 + U_ASSERT((key & 0x00ffffff) >= (previousKey & 0x00ffffff)); 1.423 + U_ASSERT((key & 0xff000000) != 0); 1.424 + keys[i] = key; 1.425 + previousKey = key; 1.426 + } 1.427 + SpoofDataHeader *rawData = fSpoofImpl->fSpoofData->fRawData; 1.428 + rawData->fCFUKeys = (int32_t)((char *)keys - (char *)rawData); 1.429 + rawData->fCFUKeysSize = numKeys; 1.430 + fSpoofImpl->fSpoofData->fCFUKeys = keys; 1.431 + 1.432 + 1.433 + // The Value Table, parallels the key table 1.434 + int32_t numValues = fValueVec->size(); 1.435 + U_ASSERT(numKeys == numValues); 1.436 + uint16_t *values = 1.437 + static_cast<uint16_t *>(fSpoofImpl->fSpoofData->reserveSpace(numKeys*sizeof(uint16_t), status)); 1.438 + if (U_FAILURE(status)) { 1.439 + return; 1.440 + } 1.441 + for (i=0; i<numValues; i++) { 1.442 + uint32_t value = static_cast<uint32_t>(fValueVec->elementAti(i)); 1.443 + U_ASSERT(value < 0xffff); 1.444 + values[i] = static_cast<uint16_t>(value); 1.445 + } 1.446 + rawData = fSpoofImpl->fSpoofData->fRawData; 1.447 + rawData->fCFUStringIndex = (int32_t)((char *)values - (char *)rawData); 1.448 + rawData->fCFUStringIndexSize = numValues; 1.449 + fSpoofImpl->fSpoofData->fCFUValues = values; 1.450 + 1.451 + // The Strings Table. 1.452 + 1.453 + uint32_t stringsLength = fStringTable->length(); 1.454 + // Reserve an extra space so the string will be nul-terminated. This is 1.455 + // only a convenience, for when debugging; it is not needed otherwise. 
1.456 + UChar *strings = 1.457 + static_cast<UChar *>(fSpoofImpl->fSpoofData->reserveSpace(stringsLength*sizeof(UChar)+2, status)); 1.458 + if (U_FAILURE(status)) { 1.459 + return; 1.460 + } 1.461 + fStringTable->extract(strings, stringsLength+1, status); 1.462 + rawData = fSpoofImpl->fSpoofData->fRawData; 1.463 + U_ASSERT(rawData->fCFUStringTable == 0); 1.464 + rawData->fCFUStringTable = (int32_t)((char *)strings - (char *)rawData); 1.465 + rawData->fCFUStringTableLen = stringsLength; 1.466 + fSpoofImpl->fSpoofData->fCFUStrings = strings; 1.467 + 1.468 + // The String Lengths Table 1.469 + // While copying into the runtime array do some sanity checks on the values 1.470 + // Each complete entry contains two fields, an index and an offset. 1.471 + // Lengths should increase with each entry. 1.472 + // Offsets should be less than the size of the string table. 1.473 + int32_t lengthTableLength = fStringLengthsTable->size(); 1.474 + uint16_t *stringLengths = 1.475 + static_cast<uint16_t *>(fSpoofImpl->fSpoofData->reserveSpace(lengthTableLength*sizeof(uint16_t), status)); 1.476 + if (U_FAILURE(status)) { 1.477 + return; 1.478 + } 1.479 + int32_t destIndex = 0; 1.480 + uint32_t previousLength = 0; 1.481 + for (i=0; i<lengthTableLength; i+=2) { 1.482 + uint32_t offset = static_cast<uint32_t>(fStringLengthsTable->elementAti(i)); 1.483 + uint32_t length = static_cast<uint32_t>(fStringLengthsTable->elementAti(i+1)); 1.484 + U_ASSERT(offset < stringsLength); 1.485 + U_ASSERT(length < 40); 1.486 + (void)previousLength; // Suppress unused variable warning on gcc. 
1.487 + U_ASSERT(length > previousLength); 1.488 + stringLengths[destIndex++] = static_cast<uint16_t>(offset); 1.489 + stringLengths[destIndex++] = static_cast<uint16_t>(length); 1.490 + previousLength = length; 1.491 + } 1.492 + rawData = fSpoofImpl->fSpoofData->fRawData; 1.493 + rawData->fCFUStringLengths = (int32_t)((char *)stringLengths - (char *)rawData); 1.494 + // Note: StringLengthsSize in the raw data is the number of complete entries, 1.495 + // each consisting of a pair of 16 bit values, hence the divide by 2. 1.496 + rawData->fCFUStringLengthsSize = lengthTableLength / 2; 1.497 + fSpoofImpl->fSpoofData->fCFUStringLengths = 1.498 + reinterpret_cast<SpoofStringLengthsElement *>(stringLengths); 1.499 +} 1.500 + 1.501 + 1.502 + 1.503 +// addKeyEntry Construction of the confusable Key and Mapping Values tables. 1.504 +// This is an intermediate point in the building process. 1.505 +// We already have the mappings in the hash tables fSLTable, etc. 1.506 +// This function builds corresponding run-time style table entries into 1.507 +// fKeyVec and fValueVec 1.508 + 1.509 +void ConfusabledataBuilder::addKeyEntry( 1.510 + UChar32 keyChar, // The key character 1.511 + UHashtable *table, // The table, one of SATable, MATable, etc. 1.512 + int32_t tableFlag, // One of USPOOF_SA_TABLE_FLAG, etc. 1.513 + UErrorCode &status) { 1.514 + 1.515 + SPUString *targetMapping = static_cast<SPUString *>(uhash_iget(table, keyChar)); 1.516 + if (targetMapping == NULL) { 1.517 + // No mapping for this key character. 1.518 + // (This function is called for all four tables for each key char that 1.519 + // is seen anywhere, so this no entry cases are very much expected.) 1.520 + return; 1.521 + } 1.522 + 1.523 + // Check whether there is already an entry with the correct mapping. 1.524 + // If so, simply set the flag in the keyTable saying that the existing entry 1.525 + // applies to the table that we're doing now. 
1.526 + 1.527 + UBool keyHasMultipleValues = FALSE; 1.528 + int32_t i; 1.529 + for (i=fKeyVec->size()-1; i>=0 ; i--) { 1.530 + int32_t key = fKeyVec->elementAti(i); 1.531 + if ((key & 0x0ffffff) != keyChar) { 1.532 + // We have now checked all existing key entries for this key char (if any) 1.533 + // without finding one with the same mapping. 1.534 + break; 1.535 + } 1.536 + UnicodeString mapping = getMapping(i); 1.537 + if (mapping == *(targetMapping->fStr)) { 1.538 + // The run time entry we are currently testing has the correct mapping. 1.539 + // Set the flag in it indicating that it applies to the new table also. 1.540 + key |= tableFlag; 1.541 + fKeyVec->setElementAt(key, i); 1.542 + return; 1.543 + } 1.544 + keyHasMultipleValues = TRUE; 1.545 + } 1.546 + 1.547 + // Need to add a new entry to the binary data being built for this mapping. 1.548 + // Includes adding entries to both the key table and the parallel values table. 1.549 + 1.550 + int32_t newKey = keyChar | tableFlag; 1.551 + if (keyHasMultipleValues) { 1.552 + newKey |= USPOOF_KEY_MULTIPLE_VALUES; 1.553 + } 1.554 + int32_t adjustedMappingLength = targetMapping->fStr->length() - 1; 1.555 + if (adjustedMappingLength>3) { 1.556 + adjustedMappingLength = 3; 1.557 + } 1.558 + newKey |= adjustedMappingLength << USPOOF_KEY_LENGTH_SHIFT; 1.559 + 1.560 + int32_t newData = targetMapping->fStrTableIndex; 1.561 + 1.562 + fKeyVec->addElement(newKey, status); 1.563 + fValueVec->addElement(newData, status); 1.564 + 1.565 + // If the preceding key entry is for the same key character (but with a different mapping) 1.566 + // set the multiple-values flag on it. 
1.567 + if (keyHasMultipleValues) { 1.568 + int32_t previousKeyIndex = fKeyVec->size() - 2; 1.569 + int32_t previousKey = fKeyVec->elementAti(previousKeyIndex); 1.570 + previousKey |= USPOOF_KEY_MULTIPLE_VALUES; 1.571 + fKeyVec->setElementAt(previousKey, previousKeyIndex); 1.572 + } 1.573 +} 1.574 + 1.575 + 1.576 + 1.577 +UnicodeString ConfusabledataBuilder::getMapping(int32_t index) { 1.578 + int32_t key = fKeyVec->elementAti(index); 1.579 + int32_t value = fValueVec->elementAti(index); 1.580 + int32_t length = USPOOF_KEY_LENGTH_FIELD(key); 1.581 + int32_t lastIndexWithLen; 1.582 + switch (length) { 1.583 + case 0: 1.584 + return UnicodeString(static_cast<UChar>(value)); 1.585 + case 1: 1.586 + case 2: 1.587 + return UnicodeString(*fStringTable, value, length+1); 1.588 + case 3: 1.589 + length = 0; 1.590 + int32_t i; 1.591 + for (i=0; i<fStringLengthsTable->size(); i+=2) { 1.592 + lastIndexWithLen = fStringLengthsTable->elementAti(i); 1.593 + if (value <= lastIndexWithLen) { 1.594 + length = fStringLengthsTable->elementAti(i+1); 1.595 + break; 1.596 + } 1.597 + } 1.598 + U_ASSERT(length>=3); 1.599 + return UnicodeString(*fStringTable, value, length); 1.600 + default: 1.601 + U_ASSERT(FALSE); 1.602 + } 1.603 + return UnicodeString(); 1.604 +} 1.605 + 1.606 +#endif 1.607 +#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS 1.608 +