Wed, 31 Dec 2014 06:09:35 +0100
Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.
michael@0 | 1 | /* |
michael@0 | 2 | ****************************************************************************** |
michael@0 | 3 | * |
michael@0 | 4 | * Copyright (C) 2008-2013, International Business Machines |
michael@0 | 5 | * Corporation and others. All Rights Reserved. |
michael@0 | 6 | * |
michael@0 | 7 | ****************************************************************************** |
michael@0 | 8 | * file name: uspoof_conf.cpp |
michael@0 | 9 | * encoding: US-ASCII |
michael@0 | 10 | * tab size: 8 (not used) |
michael@0 | 11 | * indentation:4 |
michael@0 | 12 | * |
michael@0 | 13 | * created on: 2009Jan05 (refactoring earlier files) |
michael@0 | 14 | * created by: Andy Heninger |
michael@0 | 15 | * |
michael@0 | 16 | * Internal classes for compiling confusable data into its binary (runtime) form. |
michael@0 | 17 | */ |
michael@0 | 18 | |
michael@0 | 19 | #include "unicode/utypes.h" |
michael@0 | 20 | #include "unicode/uspoof.h" |
michael@0 | 21 | #if !UCONFIG_NO_REGULAR_EXPRESSIONS |
michael@0 | 22 | #if !UCONFIG_NO_NORMALIZATION |
michael@0 | 23 | |
michael@0 | 24 | #include "unicode/unorm.h" |
michael@0 | 25 | #include "unicode/uregex.h" |
michael@0 | 26 | #include "unicode/ustring.h" |
michael@0 | 27 | #include "cmemory.h" |
michael@0 | 28 | #include "uspoof_impl.h" |
michael@0 | 29 | #include "uhash.h" |
michael@0 | 30 | #include "uvector.h" |
michael@0 | 31 | #include "uassert.h" |
michael@0 | 32 | #include "uarrsort.h" |
michael@0 | 33 | #include "uspoof_conf.h" |
michael@0 | 34 | |
michael@0 | 35 | U_NAMESPACE_USE |
michael@0 | 36 | |
michael@0 | 37 | |
michael@0 | 38 | //--------------------------------------------------------------------- |
michael@0 | 39 | // |
michael@0 | 40 | // buildConfusableData Compile the source confusable data, as defined by |
michael@0 | 41 | // the Unicode data file confusables.txt, into the binary |
michael@0 | 42 | // structures used by the confusable detector. |
michael@0 | 43 | // |
michael@0 | 44 | // The binary structures are described in uspoof_impl.h |
michael@0 | 45 | // |
michael@0 | 46 | // 1. parse the data, building 4 hash tables, one each for the SL, SA, ML and MA |
michael@0 | 47 | // tables. Each maps from a UChar32 to a String. |
michael@0 | 48 | // |
michael@0 | 49 | // 2. Sort all of the strings encountered by length, since they will need to |
michael@0 | 50 | // be stored in that order in the final string table. |
michael@0 | 51 | // |
michael@0 | 52 | // 3. Build a list of keys (UChar32s) from the four mapping tables. Sort the |
michael@0 | 53 | // list because that will be the ordering of our runtime table. |
michael@0 | 54 | // |
michael@0 | 55 | // 4. Generate the run time string table. This is generated before the key & value |
michael@0 | 56 | // tables because we need the string indexes when building those tables. |
michael@0 | 57 | // |
michael@0 | 58 | // 5. Build the run-time key and value tables. These are parallel tables, and are built |
michael@0 | 59 | // at the same time |
michael@0 | 60 | // |
michael@0 | 61 | |
michael@0 | 62 | SPUString::SPUString(UnicodeString *s) { |
michael@0 | 63 | fStr = s; |
michael@0 | 64 | fStrTableIndex = 0; |
michael@0 | 65 | } |
michael@0 | 66 | |
michael@0 | 67 | |
michael@0 | 68 | SPUString::~SPUString() { |
michael@0 | 69 | delete fStr; |
michael@0 | 70 | } |
michael@0 | 71 | |
michael@0 | 72 | |
michael@0 | 73 | SPUStringPool::SPUStringPool(UErrorCode &status) : fVec(NULL), fHash(NULL) { |
michael@0 | 74 | fVec = new UVector(status); |
michael@0 | 75 | fHash = uhash_open(uhash_hashUnicodeString, // key hash function |
michael@0 | 76 | uhash_compareUnicodeString, // Key Comparator |
michael@0 | 77 | NULL, // Value Comparator |
michael@0 | 78 | &status); |
michael@0 | 79 | } |
michael@0 | 80 | |
michael@0 | 81 | |
michael@0 | 82 | SPUStringPool::~SPUStringPool() { |
michael@0 | 83 | int i; |
michael@0 | 84 | for (i=fVec->size()-1; i>=0; i--) { |
michael@0 | 85 | SPUString *s = static_cast<SPUString *>(fVec->elementAt(i)); |
michael@0 | 86 | delete s; |
michael@0 | 87 | } |
michael@0 | 88 | delete fVec; |
michael@0 | 89 | uhash_close(fHash); |
michael@0 | 90 | } |
michael@0 | 91 | |
michael@0 | 92 | |
michael@0 | 93 | int32_t SPUStringPool::size() { |
michael@0 | 94 | return fVec->size(); |
michael@0 | 95 | } |
michael@0 | 96 | |
michael@0 | 97 | SPUString *SPUStringPool::getByIndex(int32_t index) { |
michael@0 | 98 | SPUString *retString = (SPUString *)fVec->elementAt(index); |
michael@0 | 99 | return retString; |
michael@0 | 100 | } |
michael@0 | 101 | |
michael@0 | 102 | |
michael@0 | 103 | // Comparison function for ordering strings in the string pool. |
michael@0 | 104 | // Compare by length first, then, within a group of the same length, |
michael@0 | 105 | // by code point order. |
michael@0 | 106 | // Conforms to the type signature for a USortComparator in uvector.h |
michael@0 | 107 | |
michael@0 | 108 | static int8_t U_CALLCONV SPUStringCompare(UHashTok left, UHashTok right) { |
michael@0 | 109 | const SPUString *sL = const_cast<const SPUString *>( |
michael@0 | 110 | static_cast<SPUString *>(left.pointer)); |
michael@0 | 111 | const SPUString *sR = const_cast<const SPUString *>( |
michael@0 | 112 | static_cast<SPUString *>(right.pointer)); |
michael@0 | 113 | int32_t lenL = sL->fStr->length(); |
michael@0 | 114 | int32_t lenR = sR->fStr->length(); |
michael@0 | 115 | if (lenL < lenR) { |
michael@0 | 116 | return -1; |
michael@0 | 117 | } else if (lenL > lenR) { |
michael@0 | 118 | return 1; |
michael@0 | 119 | } else { |
michael@0 | 120 | return sL->fStr->compare(*(sR->fStr)); |
michael@0 | 121 | } |
michael@0 | 122 | } |
michael@0 | 123 | |
michael@0 | 124 | void SPUStringPool::sort(UErrorCode &status) { |
michael@0 | 125 | fVec->sort(SPUStringCompare, status); |
michael@0 | 126 | } |
michael@0 | 127 | |
michael@0 | 128 | |
michael@0 | 129 | SPUString *SPUStringPool::addString(UnicodeString *src, UErrorCode &status) { |
michael@0 | 130 | SPUString *hashedString = static_cast<SPUString *>(uhash_get(fHash, src)); |
michael@0 | 131 | if (hashedString != NULL) { |
michael@0 | 132 | delete src; |
michael@0 | 133 | } else { |
michael@0 | 134 | hashedString = new SPUString(src); |
michael@0 | 135 | uhash_put(fHash, src, hashedString, &status); |
michael@0 | 136 | fVec->addElement(hashedString, status); |
michael@0 | 137 | } |
michael@0 | 138 | return hashedString; |
michael@0 | 139 | } |
michael@0 | 140 | |
michael@0 | 141 | |
michael@0 | 142 | |
michael@0 | 143 | ConfusabledataBuilder::ConfusabledataBuilder(SpoofImpl *spImpl, UErrorCode &status) : |
michael@0 | 144 | fSpoofImpl(spImpl), |
michael@0 | 145 | fInput(NULL), |
michael@0 | 146 | fSLTable(NULL), |
michael@0 | 147 | fSATable(NULL), |
michael@0 | 148 | fMLTable(NULL), |
michael@0 | 149 | fMATable(NULL), |
michael@0 | 150 | fKeySet(NULL), |
michael@0 | 151 | fKeyVec(NULL), |
michael@0 | 152 | fValueVec(NULL), |
michael@0 | 153 | fStringTable(NULL), |
michael@0 | 154 | fStringLengthsTable(NULL), |
michael@0 | 155 | stringPool(NULL), |
michael@0 | 156 | fParseLine(NULL), |
michael@0 | 157 | fParseHexNum(NULL), |
michael@0 | 158 | fLineNum(0) |
michael@0 | 159 | { |
michael@0 | 160 | if (U_FAILURE(status)) { |
michael@0 | 161 | return; |
michael@0 | 162 | } |
michael@0 | 163 | fSLTable = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &status); |
michael@0 | 164 | fSATable = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &status); |
michael@0 | 165 | fMLTable = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &status); |
michael@0 | 166 | fMATable = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &status); |
michael@0 | 167 | fKeySet = new UnicodeSet(); |
michael@0 | 168 | fKeyVec = new UVector(status); |
michael@0 | 169 | fValueVec = new UVector(status); |
michael@0 | 170 | stringPool = new SPUStringPool(status); |
michael@0 | 171 | } |
michael@0 | 172 | |
michael@0 | 173 | |
michael@0 | 174 | ConfusabledataBuilder::~ConfusabledataBuilder() { |
michael@0 | 175 | uprv_free(fInput); |
michael@0 | 176 | uregex_close(fParseLine); |
michael@0 | 177 | uregex_close(fParseHexNum); |
michael@0 | 178 | uhash_close(fSLTable); |
michael@0 | 179 | uhash_close(fSATable); |
michael@0 | 180 | uhash_close(fMLTable); |
michael@0 | 181 | uhash_close(fMATable); |
michael@0 | 182 | delete fKeySet; |
michael@0 | 183 | delete fKeyVec; |
michael@0 | 184 | delete fStringTable; |
michael@0 | 185 | delete fStringLengthsTable; |
michael@0 | 186 | delete fValueVec; |
michael@0 | 187 | delete stringPool; |
michael@0 | 188 | } |
michael@0 | 189 | |
michael@0 | 190 | |
michael@0 | 191 | void ConfusabledataBuilder::buildConfusableData(SpoofImpl * spImpl, const char * confusables, |
michael@0 | 192 | int32_t confusablesLen, int32_t *errorType, UParseError *pe, UErrorCode &status) { |
michael@0 | 193 | |
michael@0 | 194 | if (U_FAILURE(status)) { |
michael@0 | 195 | return; |
michael@0 | 196 | } |
michael@0 | 197 | ConfusabledataBuilder builder(spImpl, status); |
michael@0 | 198 | builder.build(confusables, confusablesLen, status); |
michael@0 | 199 | if (U_FAILURE(status) && errorType != NULL) { |
michael@0 | 200 | *errorType = USPOOF_SINGLE_SCRIPT_CONFUSABLE; |
michael@0 | 201 | pe->line = builder.fLineNum; |
michael@0 | 202 | } |
michael@0 | 203 | } |
michael@0 | 204 | |
michael@0 | 205 | |
michael@0 | 206 | void ConfusabledataBuilder::build(const char * confusables, int32_t confusablesLen, |
michael@0 | 207 | UErrorCode &status) { |
michael@0 | 208 | |
michael@0 | 209 | // Convert the user input data from UTF-8 to UChar (UTF-16) |
michael@0 | 210 | int32_t inputLen = 0; |
michael@0 | 211 | if (U_FAILURE(status)) { |
michael@0 | 212 | return; |
michael@0 | 213 | } |
michael@0 | 214 | u_strFromUTF8(NULL, 0, &inputLen, confusables, confusablesLen, &status); |
michael@0 | 215 | if (status != U_BUFFER_OVERFLOW_ERROR) { |
michael@0 | 216 | return; |
michael@0 | 217 | } |
michael@0 | 218 | status = U_ZERO_ERROR; |
michael@0 | 219 | fInput = static_cast<UChar *>(uprv_malloc((inputLen+1) * sizeof(UChar))); |
michael@0 | 220 | if (fInput == NULL) { |
michael@0 | 221 | status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 222 | return; |
michael@0 | 223 | } |
michael@0 | 224 | u_strFromUTF8(fInput, inputLen+1, NULL, confusables, confusablesLen, &status); |
michael@0 | 225 | |
michael@0 | 226 | |
michael@0 | 227 | // Regular Expression to parse a line from Confusables.txt. The expression will match |
michael@0 | 228 | // any line. What was matched is determined by examining which capture groups have a match. |
michael@0 | 229 | // Capture Group 1: the source char |
michael@0 | 230 | // Capture Group 2: the replacement chars |
michael@0 | 231 | // Capture Group 3-6 the table type, SL, SA, ML, or MA |
michael@0 | 232 | // Capture Group 7: A blank or comment only line. |
michael@0 | 233 | // Capture Group 8: A syntactically invalid line. Anything that didn't match before. |
michael@0 | 234 | // Example Line from the confusables.txt source file: |
michael@0 | 235 | // "1D702 ; 006E 0329 ; SL # MATHEMATICAL ITALIC SMALL ETA ... " |
michael@0 | 236 | UnicodeString pattern( |
michael@0 | 237 | "(?m)^[ \\t]*([0-9A-Fa-f]+)[ \\t]+;" // Match the source char |
michael@0 | 238 | "[ \\t]*([0-9A-Fa-f]+" // Match the replacement char(s) |
michael@0 | 239 | "(?:[ \\t]+[0-9A-Fa-f]+)*)[ \\t]*;" // (continued) |
michael@0 | 240 | "\\s*(?:(SL)|(SA)|(ML)|(MA))" // Match the table type |
michael@0 | 241 | "[ \\t]*(?:#.*?)?$" // Match any trailing #comment |
michael@0 | 242 | "|^([ \\t]*(?:#.*?)?)$" // OR match empty lines or lines with only a #comment |
michael@0 | 243 | "|^(.*?)$", -1, US_INV); // OR match any line, which catches illegal lines. |
michael@0 | 244 | // TODO: Why are we using the regex C API here? C++ would just take UnicodeString... |
michael@0 | 245 | fParseLine = uregex_open(pattern.getBuffer(), pattern.length(), 0, NULL, &status); |
michael@0 | 246 | |
michael@0 | 247 | // Regular expression for parsing a hex number out of a space-separated list of them. |
michael@0 | 248 | // Capture group 1 gets the number, with spaces removed. |
michael@0 | 249 | pattern = UNICODE_STRING_SIMPLE("\\s*([0-9A-F]+)"); |
michael@0 | 250 | fParseHexNum = uregex_open(pattern.getBuffer(), pattern.length(), 0, NULL, &status); |
michael@0 | 251 | |
michael@0 | 252 | // Zap any Byte Order Mark at the start of input. Changing it to a space is benign |
michael@0 | 253 | // given the syntax of the input. |
michael@0 | 254 | if (*fInput == 0xfeff) { |
michael@0 | 255 | *fInput = 0x20; |
michael@0 | 256 | } |
michael@0 | 257 | |
michael@0 | 258 | // Parse the input, one line per iteration of this loop. |
michael@0 | 259 | uregex_setText(fParseLine, fInput, inputLen, &status); |
michael@0 | 260 | while (uregex_findNext(fParseLine, &status)) { |
michael@0 | 261 | fLineNum++; |
michael@0 | 262 | if (uregex_start(fParseLine, 7, &status) >= 0) { |
michael@0 | 263 | // this was a blank or comment line. |
michael@0 | 264 | continue; |
michael@0 | 265 | } |
michael@0 | 266 | if (uregex_start(fParseLine, 8, &status) >= 0) { |
michael@0 | 267 | // input file syntax error. |
michael@0 | 268 | status = U_PARSE_ERROR; |
michael@0 | 269 | return; |
michael@0 | 270 | } |
michael@0 | 271 | |
michael@0 | 272 | // We have a good input line. Extract the key character and mapping string, and |
michael@0 | 273 | // put them into the appropriate mapping table. |
michael@0 | 274 | UChar32 keyChar = SpoofImpl::ScanHex(fInput, uregex_start(fParseLine, 1, &status), |
michael@0 | 275 | uregex_end(fParseLine, 1, &status), status); |
michael@0 | 276 | |
michael@0 | 277 | int32_t mapStringStart = uregex_start(fParseLine, 2, &status); |
michael@0 | 278 | int32_t mapStringLength = uregex_end(fParseLine, 2, &status) - mapStringStart; |
michael@0 | 279 | uregex_setText(fParseHexNum, &fInput[mapStringStart], mapStringLength, &status); |
michael@0 | 280 | |
michael@0 | 281 | UnicodeString *mapString = new UnicodeString(); |
michael@0 | 282 | if (mapString == NULL) { |
michael@0 | 283 | status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 284 | return; |
michael@0 | 285 | } |
michael@0 | 286 | while (uregex_findNext(fParseHexNum, &status)) { |
michael@0 | 287 | UChar32 c = SpoofImpl::ScanHex(&fInput[mapStringStart], uregex_start(fParseHexNum, 1, &status), |
michael@0 | 288 | uregex_end(fParseHexNum, 1, &status), status); |
michael@0 | 289 | mapString->append(c); |
michael@0 | 290 | } |
michael@0 | 291 | U_ASSERT(mapString->length() >= 1); |
michael@0 | 292 | |
michael@0 | 293 | // Put the map (value) string into the string pool |
michael@0 | 294 | // This a little like a Java intern() - any duplicates will be eliminated. |
michael@0 | 295 | SPUString *smapString = stringPool->addString(mapString, status); |
michael@0 | 296 | |
michael@0 | 297 | // Add the UChar32 -> string mapping to the appropriate table. |
michael@0 | 298 | UHashtable *table = uregex_start(fParseLine, 3, &status) >= 0 ? fSLTable : |
michael@0 | 299 | uregex_start(fParseLine, 4, &status) >= 0 ? fSATable : |
michael@0 | 300 | uregex_start(fParseLine, 5, &status) >= 0 ? fMLTable : |
michael@0 | 301 | uregex_start(fParseLine, 6, &status) >= 0 ? fMATable : |
michael@0 | 302 | NULL; |
michael@0 | 303 | U_ASSERT(table != NULL); |
michael@0 | 304 | uhash_iput(table, keyChar, smapString, &status); |
michael@0 | 305 | fKeySet->add(keyChar); |
michael@0 | 306 | if (U_FAILURE(status)) { |
michael@0 | 307 | return; |
michael@0 | 308 | } |
michael@0 | 309 | } |
michael@0 | 310 | |
michael@0 | 311 | // Input data is now all parsed and collected. |
michael@0 | 312 | // Now create the run-time binary form of the data. |
michael@0 | 313 | // |
michael@0 | 314 | // This is done in two steps. First the data is assembled into vectors and strings, |
michael@0 | 315 | // for ease of construction, then the contents of these collections are dumped |
michael@0 | 316 | // into the actual raw-bytes data storage. |
michael@0 | 317 | |
michael@0 | 318 | // Build up the string array, and record the index of each string therein |
michael@0 | 319 | // in the (build time only) string pool. |
michael@0 | 320 | // Strings of length one are not entered into the strings array. |
michael@0 | 321 | // At the same time, build up the string lengths table, which records the |
michael@0 | 322 | // position in the string table of the first string of each length >= 4. |
michael@0 | 323 | // (Strings in the table are sorted by length) |
michael@0 | 324 | stringPool->sort(status); |
michael@0 | 325 | fStringTable = new UnicodeString(); |
michael@0 | 326 | fStringLengthsTable = new UVector(status); |
michael@0 | 327 | int32_t previousStringLength = 0; |
michael@0 | 328 | int32_t previousStringIndex = 0; |
michael@0 | 329 | int32_t poolSize = stringPool->size(); |
michael@0 | 330 | int32_t i; |
michael@0 | 331 | for (i=0; i<poolSize; i++) { |
michael@0 | 332 | SPUString *s = stringPool->getByIndex(i); |
michael@0 | 333 | int32_t strLen = s->fStr->length(); |
michael@0 | 334 | int32_t strIndex = fStringTable->length(); |
michael@0 | 335 | U_ASSERT(strLen >= previousStringLength); |
michael@0 | 336 | if (strLen == 1) { |
michael@0 | 337 | // strings of length one do not get an entry in the string table. |
michael@0 | 338 | // Keep the single string character itself here, which is the same |
michael@0 | 339 | // convention that is used in the final run-time string table index. |
michael@0 | 340 | s->fStrTableIndex = s->fStr->charAt(0); |
michael@0 | 341 | } else { |
michael@0 | 342 | if ((strLen > previousStringLength) && (previousStringLength >= 4)) { |
michael@0 | 343 | fStringLengthsTable->addElement(previousStringIndex, status); |
michael@0 | 344 | fStringLengthsTable->addElement(previousStringLength, status); |
michael@0 | 345 | } |
michael@0 | 346 | s->fStrTableIndex = strIndex; |
michael@0 | 347 | fStringTable->append(*(s->fStr)); |
michael@0 | 348 | } |
michael@0 | 349 | previousStringLength = strLen; |
michael@0 | 350 | previousStringIndex = strIndex; |
michael@0 | 351 | } |
michael@0 | 352 | // Make the final entry to the string lengths table. |
michael@0 | 353 | // (it holds an entry for the _last_ string of each length, so adding the |
michael@0 | 354 | // final one doesn't happen in the main loop because no longer string was encountered.) |
michael@0 | 355 | if (previousStringLength >= 4) { |
michael@0 | 356 | fStringLengthsTable->addElement(previousStringIndex, status); |
michael@0 | 357 | fStringLengthsTable->addElement(previousStringLength, status); |
michael@0 | 358 | } |
michael@0 | 359 | |
michael@0 | 360 | // Construct the compile-time Key and Value tables |
michael@0 | 361 | // |
michael@0 | 362 | // For each key code point, check which mapping tables it applies to, |
michael@0 | 363 | // and create the final data for the key & value structures. |
michael@0 | 364 | // |
michael@0 | 365 | // The four logical mapping tables are conflated into one combined table. |
michael@0 | 366 | // If multiple logical tables have the same mapping for some key, they |
michael@0 | 367 | // share a single entry in the combined table. |
michael@0 | 368 | // If more than one mapping exists for the same key code point, multiple |
michael@0 | 369 | // entries will be created in the table |
michael@0 | 370 | |
michael@0 | 371 | for (int32_t range=0; range<fKeySet->getRangeCount(); range++) { |
michael@0 | 372 | // It is an oddity of the UnicodeSet API that simply enumerating the contained |
michael@0 | 373 | // code points requires a nested loop. |
michael@0 | 374 | for (UChar32 keyChar=fKeySet->getRangeStart(range); |
michael@0 | 375 | keyChar <= fKeySet->getRangeEnd(range); keyChar++) { |
michael@0 | 376 | addKeyEntry(keyChar, fSLTable, USPOOF_SL_TABLE_FLAG, status); |
michael@0 | 377 | addKeyEntry(keyChar, fSATable, USPOOF_SA_TABLE_FLAG, status); |
michael@0 | 378 | addKeyEntry(keyChar, fMLTable, USPOOF_ML_TABLE_FLAG, status); |
michael@0 | 379 | addKeyEntry(keyChar, fMATable, USPOOF_MA_TABLE_FLAG, status); |
michael@0 | 380 | } |
michael@0 | 381 | } |
michael@0 | 382 | |
michael@0 | 383 | // Put the assembled data into the flat runtime array |
michael@0 | 384 | outputData(status); |
michael@0 | 385 | |
michael@0 | 386 | // All of the intermediate allocated data belongs to the ConfusabledataBuilder |
michael@0 | 387 | // object (this), and is deleted in the destructor. |
michael@0 | 388 | return; |
michael@0 | 389 | } |
michael@0 | 390 | |
michael@0 | 391 | // |
michael@0 | 392 | // outputData The confusable data has been compiled and stored in intermediate |
michael@0 | 393 | // collections and strings. Copy it from there to the final flat |
michael@0 | 394 | // binary array. |
michael@0 | 395 | // |
michael@0 | 396 | // Note that as each section is added to the output data, the |
michael@0 | 397 | // expand (reserveSpace() function will likely relocate it in memory. |
michael@0 | 398 | // Be careful with pointers. |
michael@0 | 399 | // |
michael@0 | 400 | void ConfusabledataBuilder::outputData(UErrorCode &status) { |
michael@0 | 401 | |
michael@0 | 402 | U_ASSERT(fSpoofImpl->fSpoofData->fDataOwned == TRUE); |
michael@0 | 403 | |
michael@0 | 404 | // The Key Table |
michael@0 | 405 | // While copying the keys to the runtime array, |
michael@0 | 406 | // also sanity check that they are sorted. |
michael@0 | 407 | |
michael@0 | 408 | int32_t numKeys = fKeyVec->size(); |
michael@0 | 409 | int32_t *keys = |
michael@0 | 410 | static_cast<int32_t *>(fSpoofImpl->fSpoofData->reserveSpace(numKeys*sizeof(int32_t), status)); |
michael@0 | 411 | if (U_FAILURE(status)) { |
michael@0 | 412 | return; |
michael@0 | 413 | } |
michael@0 | 414 | int i; |
michael@0 | 415 | int32_t previousKey = 0; |
michael@0 | 416 | for (i=0; i<numKeys; i++) { |
michael@0 | 417 | int32_t key = fKeyVec->elementAti(i); |
michael@0 | 418 | (void)previousKey; // Suppress unused variable warning on gcc. |
michael@0 | 419 | U_ASSERT((key & 0x00ffffff) >= (previousKey & 0x00ffffff)); |
michael@0 | 420 | U_ASSERT((key & 0xff000000) != 0); |
michael@0 | 421 | keys[i] = key; |
michael@0 | 422 | previousKey = key; |
michael@0 | 423 | } |
michael@0 | 424 | SpoofDataHeader *rawData = fSpoofImpl->fSpoofData->fRawData; |
michael@0 | 425 | rawData->fCFUKeys = (int32_t)((char *)keys - (char *)rawData); |
michael@0 | 426 | rawData->fCFUKeysSize = numKeys; |
michael@0 | 427 | fSpoofImpl->fSpoofData->fCFUKeys = keys; |
michael@0 | 428 | |
michael@0 | 429 | |
michael@0 | 430 | // The Value Table, parallels the key table |
michael@0 | 431 | int32_t numValues = fValueVec->size(); |
michael@0 | 432 | U_ASSERT(numKeys == numValues); |
michael@0 | 433 | uint16_t *values = |
michael@0 | 434 | static_cast<uint16_t *>(fSpoofImpl->fSpoofData->reserveSpace(numKeys*sizeof(uint16_t), status)); |
michael@0 | 435 | if (U_FAILURE(status)) { |
michael@0 | 436 | return; |
michael@0 | 437 | } |
michael@0 | 438 | for (i=0; i<numValues; i++) { |
michael@0 | 439 | uint32_t value = static_cast<uint32_t>(fValueVec->elementAti(i)); |
michael@0 | 440 | U_ASSERT(value < 0xffff); |
michael@0 | 441 | values[i] = static_cast<uint16_t>(value); |
michael@0 | 442 | } |
michael@0 | 443 | rawData = fSpoofImpl->fSpoofData->fRawData; |
michael@0 | 444 | rawData->fCFUStringIndex = (int32_t)((char *)values - (char *)rawData); |
michael@0 | 445 | rawData->fCFUStringIndexSize = numValues; |
michael@0 | 446 | fSpoofImpl->fSpoofData->fCFUValues = values; |
michael@0 | 447 | |
michael@0 | 448 | // The Strings Table. |
michael@0 | 449 | |
michael@0 | 450 | uint32_t stringsLength = fStringTable->length(); |
michael@0 | 451 | // Reserve an extra space so the string will be nul-terminated. This is |
michael@0 | 452 | // only a convenience, for when debugging; it is not needed otherwise. |
michael@0 | 453 | UChar *strings = |
michael@0 | 454 | static_cast<UChar *>(fSpoofImpl->fSpoofData->reserveSpace(stringsLength*sizeof(UChar)+2, status)); |
michael@0 | 455 | if (U_FAILURE(status)) { |
michael@0 | 456 | return; |
michael@0 | 457 | } |
michael@0 | 458 | fStringTable->extract(strings, stringsLength+1, status); |
michael@0 | 459 | rawData = fSpoofImpl->fSpoofData->fRawData; |
michael@0 | 460 | U_ASSERT(rawData->fCFUStringTable == 0); |
michael@0 | 461 | rawData->fCFUStringTable = (int32_t)((char *)strings - (char *)rawData); |
michael@0 | 462 | rawData->fCFUStringTableLen = stringsLength; |
michael@0 | 463 | fSpoofImpl->fSpoofData->fCFUStrings = strings; |
michael@0 | 464 | |
michael@0 | 465 | // The String Lengths Table |
michael@0 | 466 | // While copying into the runtime array do some sanity checks on the values |
michael@0 | 467 | // Each complete entry contains two fields, an index and an offset. |
michael@0 | 468 | // Lengths should increase with each entry. |
michael@0 | 469 | // Offsets should be less than the size of the string table. |
michael@0 | 470 | int32_t lengthTableLength = fStringLengthsTable->size(); |
michael@0 | 471 | uint16_t *stringLengths = |
michael@0 | 472 | static_cast<uint16_t *>(fSpoofImpl->fSpoofData->reserveSpace(lengthTableLength*sizeof(uint16_t), status)); |
michael@0 | 473 | if (U_FAILURE(status)) { |
michael@0 | 474 | return; |
michael@0 | 475 | } |
michael@0 | 476 | int32_t destIndex = 0; |
michael@0 | 477 | uint32_t previousLength = 0; |
michael@0 | 478 | for (i=0; i<lengthTableLength; i+=2) { |
michael@0 | 479 | uint32_t offset = static_cast<uint32_t>(fStringLengthsTable->elementAti(i)); |
michael@0 | 480 | uint32_t length = static_cast<uint32_t>(fStringLengthsTable->elementAti(i+1)); |
michael@0 | 481 | U_ASSERT(offset < stringsLength); |
michael@0 | 482 | U_ASSERT(length < 40); |
michael@0 | 483 | (void)previousLength; // Suppress unused variable warning on gcc. |
michael@0 | 484 | U_ASSERT(length > previousLength); |
michael@0 | 485 | stringLengths[destIndex++] = static_cast<uint16_t>(offset); |
michael@0 | 486 | stringLengths[destIndex++] = static_cast<uint16_t>(length); |
michael@0 | 487 | previousLength = length; |
michael@0 | 488 | } |
michael@0 | 489 | rawData = fSpoofImpl->fSpoofData->fRawData; |
michael@0 | 490 | rawData->fCFUStringLengths = (int32_t)((char *)stringLengths - (char *)rawData); |
michael@0 | 491 | // Note: StringLengthsSize in the raw data is the number of complete entries, |
michael@0 | 492 | // each consisting of a pair of 16 bit values, hence the divide by 2. |
michael@0 | 493 | rawData->fCFUStringLengthsSize = lengthTableLength / 2; |
michael@0 | 494 | fSpoofImpl->fSpoofData->fCFUStringLengths = |
michael@0 | 495 | reinterpret_cast<SpoofStringLengthsElement *>(stringLengths); |
michael@0 | 496 | } |
michael@0 | 497 | |
michael@0 | 498 | |
michael@0 | 499 | |
michael@0 | 500 | // addKeyEntry Construction of the confusable Key and Mapping Values tables. |
michael@0 | 501 | // This is an intermediate point in the building process. |
michael@0 | 502 | // We already have the mappings in the hash tables fSLTable, etc. |
michael@0 | 503 | // This function builds corresponding run-time style table entries into |
michael@0 | 504 | // fKeyVec and fValueVec |
michael@0 | 505 | |
michael@0 | 506 | void ConfusabledataBuilder::addKeyEntry( |
michael@0 | 507 | UChar32 keyChar, // The key character |
michael@0 | 508 | UHashtable *table, // The table, one of SATable, MATable, etc. |
michael@0 | 509 | int32_t tableFlag, // One of USPOOF_SA_TABLE_FLAG, etc. |
michael@0 | 510 | UErrorCode &status) { |
michael@0 | 511 | |
michael@0 | 512 | SPUString *targetMapping = static_cast<SPUString *>(uhash_iget(table, keyChar)); |
michael@0 | 513 | if (targetMapping == NULL) { |
michael@0 | 514 | // No mapping for this key character. |
michael@0 | 515 | // (This function is called for all four tables for each key char that |
michael@0 | 516 | // is seen anywhere, so this no entry cases are very much expected.) |
michael@0 | 517 | return; |
michael@0 | 518 | } |
michael@0 | 519 | |
michael@0 | 520 | // Check whether there is already an entry with the correct mapping. |
michael@0 | 521 | // If so, simply set the flag in the keyTable saying that the existing entry |
michael@0 | 522 | // applies to the table that we're doing now. |
michael@0 | 523 | |
michael@0 | 524 | UBool keyHasMultipleValues = FALSE; |
michael@0 | 525 | int32_t i; |
michael@0 | 526 | for (i=fKeyVec->size()-1; i>=0 ; i--) { |
michael@0 | 527 | int32_t key = fKeyVec->elementAti(i); |
michael@0 | 528 | if ((key & 0x0ffffff) != keyChar) { |
michael@0 | 529 | // We have now checked all existing key entries for this key char (if any) |
michael@0 | 530 | // without finding one with the same mapping. |
michael@0 | 531 | break; |
michael@0 | 532 | } |
michael@0 | 533 | UnicodeString mapping = getMapping(i); |
michael@0 | 534 | if (mapping == *(targetMapping->fStr)) { |
michael@0 | 535 | // The run time entry we are currently testing has the correct mapping. |
michael@0 | 536 | // Set the flag in it indicating that it applies to the new table also. |
michael@0 | 537 | key |= tableFlag; |
michael@0 | 538 | fKeyVec->setElementAt(key, i); |
michael@0 | 539 | return; |
michael@0 | 540 | } |
michael@0 | 541 | keyHasMultipleValues = TRUE; |
michael@0 | 542 | } |
michael@0 | 543 | |
michael@0 | 544 | // Need to add a new entry to the binary data being built for this mapping. |
michael@0 | 545 | // Includes adding entries to both the key table and the parallel values table. |
michael@0 | 546 | |
michael@0 | 547 | int32_t newKey = keyChar | tableFlag; |
michael@0 | 548 | if (keyHasMultipleValues) { |
michael@0 | 549 | newKey |= USPOOF_KEY_MULTIPLE_VALUES; |
michael@0 | 550 | } |
michael@0 | 551 | int32_t adjustedMappingLength = targetMapping->fStr->length() - 1; |
michael@0 | 552 | if (adjustedMappingLength>3) { |
michael@0 | 553 | adjustedMappingLength = 3; |
michael@0 | 554 | } |
michael@0 | 555 | newKey |= adjustedMappingLength << USPOOF_KEY_LENGTH_SHIFT; |
michael@0 | 556 | |
michael@0 | 557 | int32_t newData = targetMapping->fStrTableIndex; |
michael@0 | 558 | |
michael@0 | 559 | fKeyVec->addElement(newKey, status); |
michael@0 | 560 | fValueVec->addElement(newData, status); |
michael@0 | 561 | |
michael@0 | 562 | // If the preceding key entry is for the same key character (but with a different mapping) |
michael@0 | 563 | // set the multiple-values flag on it. |
michael@0 | 564 | if (keyHasMultipleValues) { |
michael@0 | 565 | int32_t previousKeyIndex = fKeyVec->size() - 2; |
michael@0 | 566 | int32_t previousKey = fKeyVec->elementAti(previousKeyIndex); |
michael@0 | 567 | previousKey |= USPOOF_KEY_MULTIPLE_VALUES; |
michael@0 | 568 | fKeyVec->setElementAt(previousKey, previousKeyIndex); |
michael@0 | 569 | } |
michael@0 | 570 | } |
michael@0 | 571 | |
michael@0 | 572 | |
michael@0 | 573 | |
michael@0 | 574 | UnicodeString ConfusabledataBuilder::getMapping(int32_t index) { |
michael@0 | 575 | int32_t key = fKeyVec->elementAti(index); |
michael@0 | 576 | int32_t value = fValueVec->elementAti(index); |
michael@0 | 577 | int32_t length = USPOOF_KEY_LENGTH_FIELD(key); |
michael@0 | 578 | int32_t lastIndexWithLen; |
michael@0 | 579 | switch (length) { |
michael@0 | 580 | case 0: |
michael@0 | 581 | return UnicodeString(static_cast<UChar>(value)); |
michael@0 | 582 | case 1: |
michael@0 | 583 | case 2: |
michael@0 | 584 | return UnicodeString(*fStringTable, value, length+1); |
michael@0 | 585 | case 3: |
michael@0 | 586 | length = 0; |
michael@0 | 587 | int32_t i; |
michael@0 | 588 | for (i=0; i<fStringLengthsTable->size(); i+=2) { |
michael@0 | 589 | lastIndexWithLen = fStringLengthsTable->elementAti(i); |
michael@0 | 590 | if (value <= lastIndexWithLen) { |
michael@0 | 591 | length = fStringLengthsTable->elementAti(i+1); |
michael@0 | 592 | break; |
michael@0 | 593 | } |
michael@0 | 594 | } |
michael@0 | 595 | U_ASSERT(length>=3); |
michael@0 | 596 | return UnicodeString(*fStringTable, value, length); |
michael@0 | 597 | default: |
michael@0 | 598 | U_ASSERT(FALSE); |
michael@0 | 599 | } |
michael@0 | 600 | return UnicodeString(); |
michael@0 | 601 | } |
michael@0 | 602 | |
michael@0 | 603 | #endif |
michael@0 | 604 | #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS |
michael@0 | 605 |