intl/icu/source/i18n/uspoof_wsconf.cpp

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

michael@0 1 /*
michael@0 2 ******************************************************************************
michael@0 3 *
michael@0 4 * Copyright (C) 2008-2013, International Business Machines
michael@0 5 * Corporation and others. All Rights Reserved.
michael@0 6 *
michael@0 7 ******************************************************************************
michael@0 8 * file name: uspoof_wsconf.cpp
michael@0 9 * encoding: US-ASCII
michael@0 10 * tab size: 8 (not used)
michael@0 11 * indentation:4
michael@0 12 *
michael@0 13 * created on: 2009Jan05 (refactoring earlier files)
michael@0 14 * created by: Andy Heninger
michael@0 15 *
michael@0 16 * Internal functions for compiling Whole Script confusable source data
michael@0 17 * into its binary (runtime) form. The binary data format is described
michael@0 18 * in uspoof_impl.h
michael@0 19 */
michael@0 20
michael@0 21 #include "unicode/utypes.h"
michael@0 22 #include "unicode/uspoof.h"
michael@0 23
michael@0 24 #if !UCONFIG_NO_NORMALIZATION
michael@0 25
michael@0 26 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
michael@0 27
michael@0 28 #include "unicode/unorm.h"
michael@0 29 #include "unicode/uregex.h"
michael@0 30 #include "unicode/ustring.h"
michael@0 31 #include "cmemory.h"
michael@0 32 #include "scriptset.h"
michael@0 33 #include "uspoof_impl.h"
michael@0 34 #include "uhash.h"
michael@0 35 #include "uvector.h"
michael@0 36 #include "uassert.h"
michael@0 37 #include "uspoof_wsconf.h"
michael@0 38
michael@0 39 U_NAMESPACE_USE
michael@0 40
michael@0 41
// Regular expression for parsing a line from the Unicode file confusablesWholeScript.txt
// Example Lines:
//   006F          ; Latn; Deva; A # (o) LATIN SMALL LETTER O
//   0048..0049    ; Latn; Grek; A #  [2] (H..I) LATIN CAPITAL LETTER H..LATIN CAPITAL LETTER I
//    |               |     |    |
//    |               |     |    |---- Which table, Any Case or Lower Case (A or L)
//    |               |     |---------- Target script.  We need this.
//    |               |---------------- Src script.  Should match the script of the source
//    |                                 code points.  Beyond checking that, we don't keep it.
//    |-------------------------------- Source code points or range.
//
// The expression will match _all_ lines, including erroneous lines.
// The result of the parse is returned via the contents of the (match) groups:
//   Group 1      - a blank or comment-only line.
//   Groups 2, 3  - first and (optional) last code point of the range.
//   Groups 4, 5  - source and target script names.
//   Group 6 or 7 - table selector, A or L.
//   Group 8      - an erroneous line.
//
// The range separator is matched as two literal dots.  (Left unescaped, ".." would
// accept any two characters between the code points.)
static const char *parseExp =
        "(?m)"                                         // Multi-line mode
        "^([ \\t]*(?:#.*?)?)$"                         // A blank or comment line.  Matches Group 1.
        "|^(?:"                                        //   OR
        "\\s*([0-9A-F]{4,})(?:\\.\\.([0-9A-F]{4,}))?\\s*;"  // Code point range.  Groups 2 and 3.
        "\\s*([A-Za-z]+)\\s*;"                         // The source script.  Group 4.
        "\\s*([A-Za-z]+)\\s*;"                         // The target script.  Group 5.
        "\\s*(?:(A)|(L))"                              // The table A or L.   Group 6 or 7
        "[ \\t]*(?:#.*?)?"                             // Trailing comment
        ")$|"                                          //   OR
        "^(.*?)$";                                     // An error line.      Group 8.
                                                       //  Any line not matching the preceding
                                                       //  parts of the expression will match
                                                       //  this, and thus be flagged as an error
michael@0 69
michael@0 70
michael@0 71 // Extract a regular expression match group into a char * string.
michael@0 72 // The group must contain only invariant characters.
michael@0 73 // Used for script names
michael@0 74 //
michael@0 75 static void extractGroup(
michael@0 76 URegularExpression *e, int32_t group, char *destBuf, int32_t destCapacity, UErrorCode &status) {
michael@0 77
michael@0 78 UChar ubuf[50];
michael@0 79 ubuf[0] = 0;
michael@0 80 destBuf[0] = 0;
michael@0 81 int32_t len = uregex_group(e, group, ubuf, 50, &status);
michael@0 82 if (U_FAILURE(status) || len == -1 || len >= destCapacity) {
michael@0 83 return;
michael@0 84 }
michael@0 85 UnicodeString s(FALSE, ubuf, len); // Aliasing constructor
michael@0 86 s.extract(0, len, destBuf, destCapacity, US_INV);
michael@0 87 }
michael@0 88
michael@0 89
michael@0 90
michael@0 91 U_NAMESPACE_BEGIN
michael@0 92
michael@0 93 // Build the Whole Script Confusable data
michael@0 94 //
michael@0 95 // TODO: Reorganize. Either get rid of the WSConfusableDataBuilder class,
michael@0 96 // because everything is local to this one build function anyhow,
michael@0 97 // OR
michael@0 98 // break this function into more reasonably sized pieces, with
michael@0 99 // state in WSConfusableDataBuilder.
michael@0 100 //
michael@0 101 void buildWSConfusableData(SpoofImpl *spImpl, const char * confusablesWS,
michael@0 102 int32_t confusablesWSLen, UParseError *pe, UErrorCode &status)
michael@0 103 {
michael@0 104 if (U_FAILURE(status)) {
michael@0 105 return;
michael@0 106 }
michael@0 107 URegularExpression *parseRegexp = NULL;
michael@0 108 int32_t inputLen = 0;
michael@0 109 UChar *input = NULL;
michael@0 110 int32_t lineNum = 0;
michael@0 111
michael@0 112 UVector *scriptSets = NULL;
michael@0 113 uint32_t rtScriptSetsCount = 2;
michael@0 114
michael@0 115 UTrie2 *anyCaseTrie = NULL;
michael@0 116 UTrie2 *lowerCaseTrie = NULL;
michael@0 117
michael@0 118 anyCaseTrie = utrie2_open(0, 0, &status);
michael@0 119 lowerCaseTrie = utrie2_open(0, 0, &status);
michael@0 120
michael@0 121 UnicodeString pattern(parseExp, -1, US_INV);
michael@0 122
michael@0 123 // The scriptSets vector provides a mapping from TRIE values to the set of scripts.
michael@0 124 //
michael@0 125 // Reserved TRIE values:
michael@0 126 // 0: Code point has no whole script confusables.
michael@0 127 // 1: Code point is of script Common or Inherited.
michael@0 128 // These code points do not participate in whole script confusable detection.
michael@0 129 // (This is logically equivalent to saying that they contain confusables in
michael@0 130 // all scripts)
michael@0 131 //
michael@0 132 // Because Trie values are indexes into the ScriptSets vector, pre-fill
michael@0 133 // vector positions 0 and 1 to avoid conflicts with the reserved values.
michael@0 134
michael@0 135 scriptSets = new UVector(status);
michael@0 136 if (scriptSets == NULL) {
michael@0 137 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 138 goto cleanup;
michael@0 139 }
michael@0 140 scriptSets->addElement((void *)NULL, status);
michael@0 141 scriptSets->addElement((void *)NULL, status);
michael@0 142
michael@0 143 // Convert the user input data from UTF-8 to UChar (UTF-16)
michael@0 144 u_strFromUTF8(NULL, 0, &inputLen, confusablesWS, confusablesWSLen, &status);
michael@0 145 if (status != U_BUFFER_OVERFLOW_ERROR) {
michael@0 146 goto cleanup;
michael@0 147 }
michael@0 148 status = U_ZERO_ERROR;
michael@0 149 input = static_cast<UChar *>(uprv_malloc((inputLen+1) * sizeof(UChar)));
michael@0 150 if (input == NULL) {
michael@0 151 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 152 goto cleanup;
michael@0 153 }
michael@0 154 u_strFromUTF8(input, inputLen+1, NULL, confusablesWS, confusablesWSLen, &status);
michael@0 155
michael@0 156 parseRegexp = uregex_open(pattern.getBuffer(), pattern.length(), 0, NULL, &status);
michael@0 157
michael@0 158 // Zap any Byte Order Mark at the start of input. Changing it to a space is benign
michael@0 159 // given the syntax of the input.
michael@0 160 if (*input == 0xfeff) {
michael@0 161 *input = 0x20;
michael@0 162 }
michael@0 163
michael@0 164 // Parse the input, one line per iteration of this loop.
michael@0 165 uregex_setText(parseRegexp, input, inputLen, &status);
michael@0 166 while (uregex_findNext(parseRegexp, &status)) {
michael@0 167 lineNum++;
michael@0 168 if (uregex_start(parseRegexp, 1, &status) >= 0) {
michael@0 169 // this was a blank or comment line.
michael@0 170 continue;
michael@0 171 }
michael@0 172 if (uregex_start(parseRegexp, 8, &status) >= 0) {
michael@0 173 // input file syntax error.
michael@0 174 status = U_PARSE_ERROR;
michael@0 175 goto cleanup;
michael@0 176 }
michael@0 177 if (U_FAILURE(status)) {
michael@0 178 goto cleanup;
michael@0 179 }
michael@0 180
michael@0 181 // Pick up the start and optional range end code points from the parsed line.
michael@0 182 UChar32 startCodePoint = SpoofImpl::ScanHex(
michael@0 183 input, uregex_start(parseRegexp, 2, &status), uregex_end(parseRegexp, 2, &status), status);
michael@0 184 UChar32 endCodePoint = startCodePoint;
michael@0 185 if (uregex_start(parseRegexp, 3, &status) >=0) {
michael@0 186 endCodePoint = SpoofImpl::ScanHex(
michael@0 187 input, uregex_start(parseRegexp, 3, &status), uregex_end(parseRegexp, 3, &status), status);
michael@0 188 }
michael@0 189
michael@0 190 // Extract the two script names from the source line. We need these in an 8 bit
michael@0 191 // default encoding (will be EBCDIC on IBM mainframes) in order to pass them on
michael@0 192 // to the ICU u_getPropertyValueEnum() function. Ugh.
michael@0 193 char srcScriptName[20];
michael@0 194 char targScriptName[20];
michael@0 195 extractGroup(parseRegexp, 4, srcScriptName, sizeof(srcScriptName), status);
michael@0 196 extractGroup(parseRegexp, 5, targScriptName, sizeof(targScriptName), status);
michael@0 197 UScriptCode srcScript =
michael@0 198 static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, srcScriptName));
michael@0 199 UScriptCode targScript =
michael@0 200 static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, targScriptName));
michael@0 201 if (U_FAILURE(status)) {
michael@0 202 goto cleanup;
michael@0 203 }
michael@0 204 if (srcScript == USCRIPT_INVALID_CODE || targScript == USCRIPT_INVALID_CODE) {
michael@0 205 status = U_INVALID_FORMAT_ERROR;
michael@0 206 goto cleanup;
michael@0 207 }
michael@0 208
michael@0 209 // select the table - (A) any case or (L) lower case only
michael@0 210 UTrie2 *table = anyCaseTrie;
michael@0 211 if (uregex_start(parseRegexp, 7, &status) >= 0) {
michael@0 212 table = lowerCaseTrie;
michael@0 213 }
michael@0 214
michael@0 215 // Build the set of scripts containing confusable characters for
michael@0 216 // the code point(s) specified in this input line.
michael@0 217 // Sanity check that the script of the source code point is the same
michael@0 218 // as the source script indicated in the input file. Failure of this check is
michael@0 219 // an error in the input file.
michael@0 220 // Include the source script in the set (needed for Mixed Script Confusable detection).
michael@0 221 //
michael@0 222 UChar32 cp;
michael@0 223 for (cp=startCodePoint; cp<=endCodePoint; cp++) {
michael@0 224 int32_t setIndex = utrie2_get32(table, cp);
michael@0 225 BuilderScriptSet *bsset = NULL;
michael@0 226 if (setIndex > 0) {
michael@0 227 U_ASSERT(setIndex < scriptSets->size());
michael@0 228 bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(setIndex));
michael@0 229 } else {
michael@0 230 bsset = new BuilderScriptSet();
michael@0 231 if (bsset == NULL) {
michael@0 232 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 233 goto cleanup;
michael@0 234 }
michael@0 235 bsset->codePoint = cp;
michael@0 236 bsset->trie = table;
michael@0 237 bsset->sset = new ScriptSet();
michael@0 238 setIndex = scriptSets->size();
michael@0 239 bsset->index = setIndex;
michael@0 240 bsset->rindex = 0;
michael@0 241 if (bsset->sset == NULL) {
michael@0 242 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 243 goto cleanup;
michael@0 244 }
michael@0 245 scriptSets->addElement(bsset, status);
michael@0 246 utrie2_set32(table, cp, setIndex, &status);
michael@0 247 }
michael@0 248 bsset->sset->set(targScript, status);
michael@0 249 bsset->sset->set(srcScript, status);
michael@0 250
michael@0 251 if (U_FAILURE(status)) {
michael@0 252 goto cleanup;
michael@0 253 }
michael@0 254 UScriptCode cpScript = uscript_getScript(cp, &status);
michael@0 255 if (cpScript != srcScript) {
michael@0 256 status = U_INVALID_FORMAT_ERROR;
michael@0 257 goto cleanup;
michael@0 258 }
michael@0 259 }
michael@0 260 }
michael@0 261
michael@0 262 // Eliminate duplicate script sets. At this point we have a separate
michael@0 263 // script set for every code point that had data in the input file.
michael@0 264 //
michael@0 265 // We eliminate underlying ScriptSet objects, not the BuildScriptSets that wrap them
michael@0 266 //
michael@0 267 // printf("Number of scriptSets: %d\n", scriptSets->size());
michael@0 268 {
michael@0 269 int32_t duplicateCount = 0;
michael@0 270 rtScriptSetsCount = 2;
michael@0 271 for (int32_t outeri=2; outeri<scriptSets->size(); outeri++) {
michael@0 272 BuilderScriptSet *outerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(outeri));
michael@0 273 if (outerSet->index != static_cast<uint32_t>(outeri)) {
michael@0 274 // This set was already identified as a duplicate.
michael@0 275 // It will not be allocated a position in the runtime array of ScriptSets.
michael@0 276 continue;
michael@0 277 }
michael@0 278 outerSet->rindex = rtScriptSetsCount++;
michael@0 279 for (int32_t inneri=outeri+1; inneri<scriptSets->size(); inneri++) {
michael@0 280 BuilderScriptSet *innerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(inneri));
michael@0 281 if (*(outerSet->sset) == *(innerSet->sset) && outerSet->sset != innerSet->sset) {
michael@0 282 delete innerSet->sset;
michael@0 283 innerSet->scriptSetOwned = FALSE;
michael@0 284 innerSet->sset = outerSet->sset;
michael@0 285 innerSet->index = outeri;
michael@0 286 innerSet->rindex = outerSet->rindex;
michael@0 287 duplicateCount++;
michael@0 288 }
michael@0 289 // But this doesn't get all. We need to fix the TRIE.
michael@0 290 }
michael@0 291 }
michael@0 292 // printf("Number of distinct script sets: %d\n", rtScriptSetsCount);
michael@0 293 }
michael@0 294
michael@0 295
michael@0 296
michael@0 297 // Update the Trie values to be reflect the run time script indexes (after duplicate merging).
michael@0 298 // (Trie Values 0 and 1 are reserved, and the corresponding slots in scriptSets
michael@0 299 // are unused, which is why the loop index starts at 2.)
michael@0 300 {
michael@0 301 for (int32_t i=2; i<scriptSets->size(); i++) {
michael@0 302 BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
michael@0 303 if (bSet->rindex != (uint32_t)i) {
michael@0 304 utrie2_set32(bSet->trie, bSet->codePoint, bSet->rindex, &status);
michael@0 305 }
michael@0 306 }
michael@0 307 }
michael@0 308
michael@0 309 // For code points with script==Common or script==Inherited,
michael@0 310 // Set the reserved value of 1 into both Tries. These characters do not participate
michael@0 311 // in Whole Script Confusable detection; this reserved value is the means
michael@0 312 // by which they are detected.
michael@0 313 {
michael@0 314 UnicodeSet ignoreSet;
michael@0 315 ignoreSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status);
michael@0 316 UnicodeSet inheritedSet;
michael@0 317 inheritedSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status);
michael@0 318 ignoreSet.addAll(inheritedSet);
michael@0 319 for (int32_t rn=0; rn<ignoreSet.getRangeCount(); rn++) {
michael@0 320 UChar32 rangeStart = ignoreSet.getRangeStart(rn);
michael@0 321 UChar32 rangeEnd = ignoreSet.getRangeEnd(rn);
michael@0 322 utrie2_setRange32(anyCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status);
michael@0 323 utrie2_setRange32(lowerCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status);
michael@0 324 }
michael@0 325 }
michael@0 326
michael@0 327 // Serialize the data to the Spoof Detector
michael@0 328 {
michael@0 329 utrie2_freeze(anyCaseTrie, UTRIE2_16_VALUE_BITS, &status);
michael@0 330 int32_t size = utrie2_serialize(anyCaseTrie, NULL, 0, &status);
michael@0 331 // printf("Any case Trie size: %d\n", size);
michael@0 332 if (status != U_BUFFER_OVERFLOW_ERROR) {
michael@0 333 goto cleanup;
michael@0 334 }
michael@0 335 status = U_ZERO_ERROR;
michael@0 336 spImpl->fSpoofData->fRawData->fAnyCaseTrie = spImpl->fSpoofData->fMemLimit;
michael@0 337 spImpl->fSpoofData->fRawData->fAnyCaseTrieLength = size;
michael@0 338 spImpl->fSpoofData->fAnyCaseTrie = anyCaseTrie;
michael@0 339 void *where = spImpl->fSpoofData->reserveSpace(size, status);
michael@0 340 utrie2_serialize(anyCaseTrie, where, size, &status);
michael@0 341
michael@0 342 utrie2_freeze(lowerCaseTrie, UTRIE2_16_VALUE_BITS, &status);
michael@0 343 size = utrie2_serialize(lowerCaseTrie, NULL, 0, &status);
michael@0 344 // printf("Lower case Trie size: %d\n", size);
michael@0 345 if (status != U_BUFFER_OVERFLOW_ERROR) {
michael@0 346 goto cleanup;
michael@0 347 }
michael@0 348 status = U_ZERO_ERROR;
michael@0 349 spImpl->fSpoofData->fRawData->fLowerCaseTrie = spImpl->fSpoofData->fMemLimit;
michael@0 350 spImpl->fSpoofData->fRawData->fLowerCaseTrieLength = size;
michael@0 351 spImpl->fSpoofData->fLowerCaseTrie = lowerCaseTrie;
michael@0 352 where = spImpl->fSpoofData->reserveSpace(size, status);
michael@0 353 utrie2_serialize(lowerCaseTrie, where, size, &status);
michael@0 354
michael@0 355 spImpl->fSpoofData->fRawData->fScriptSets = spImpl->fSpoofData->fMemLimit;
michael@0 356 spImpl->fSpoofData->fRawData->fScriptSetsLength = rtScriptSetsCount;
michael@0 357 ScriptSet *rtScriptSets = static_cast<ScriptSet *>
michael@0 358 (spImpl->fSpoofData->reserveSpace(rtScriptSetsCount * sizeof(ScriptSet), status));
michael@0 359 uint32_t rindex = 2;
michael@0 360 for (int32_t i=2; i<scriptSets->size(); i++) {
michael@0 361 BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
michael@0 362 if (bSet->rindex < rindex) {
michael@0 363 // We have already copied this script set to the serialized data.
michael@0 364 continue;
michael@0 365 }
michael@0 366 U_ASSERT(rindex == bSet->rindex);
michael@0 367 rtScriptSets[rindex] = *bSet->sset; // Assignment of a ScriptSet just copies the bits.
michael@0 368 rindex++;
michael@0 369 }
michael@0 370 }
michael@0 371
michael@0 372 // Open new utrie2s from the serialized data. We don't want to keep the ones
michael@0 373 // we just built because we would then have two copies of the data, one internal to
michael@0 374 // the utries that we have already constructed, and one in the serialized data area.
michael@0 375 // An alternative would be to not pre-serialize the Trie data, but that makes the
michael@0 376 // spoof detector data different, depending on how the detector was constructed.
michael@0 377 // It's simpler to keep the data always the same.
michael@0 378
michael@0 379 spImpl->fSpoofData->fAnyCaseTrie = utrie2_openFromSerialized(
michael@0 380 UTRIE2_16_VALUE_BITS,
michael@0 381 (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fAnyCaseTrie,
michael@0 382 spImpl->fSpoofData->fRawData->fAnyCaseTrieLength,
michael@0 383 NULL,
michael@0 384 &status);
michael@0 385
michael@0 386 spImpl->fSpoofData->fLowerCaseTrie = utrie2_openFromSerialized(
michael@0 387 UTRIE2_16_VALUE_BITS,
michael@0 388 (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fLowerCaseTrie,
michael@0 389 spImpl->fSpoofData->fRawData->fAnyCaseTrieLength,
michael@0 390 NULL,
michael@0 391 &status);
michael@0 392
michael@0 393
michael@0 394
michael@0 395 cleanup:
michael@0 396 if (U_FAILURE(status)) {
michael@0 397 pe->line = lineNum;
michael@0 398 }
michael@0 399 uregex_close(parseRegexp);
michael@0 400 uprv_free(input);
michael@0 401
michael@0 402 int32_t i;
michael@0 403 if (scriptSets != NULL) {
michael@0 404 for (i=0; i<scriptSets->size(); i++) {
michael@0 405 BuilderScriptSet *bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
michael@0 406 delete bsset;
michael@0 407 }
michael@0 408 delete scriptSets;
michael@0 409 }
michael@0 410 utrie2_close(anyCaseTrie);
michael@0 411 utrie2_close(lowerCaseTrie);
michael@0 412 return;
michael@0 413 }
michael@0 414
michael@0 415 U_NAMESPACE_END
michael@0 416
michael@0 417
michael@0 418
michael@0 419 BuilderScriptSet::BuilderScriptSet() {
michael@0 420 codePoint = -1;
michael@0 421 trie = NULL;
michael@0 422 sset = NULL;
michael@0 423 index = 0;
michael@0 424 rindex = 0;
michael@0 425 scriptSetOwned = TRUE;
michael@0 426 }
michael@0 427
michael@0 428 BuilderScriptSet::~BuilderScriptSet() {
michael@0 429 if (scriptSetOwned) {
michael@0 430 delete sset;
michael@0 431 }
michael@0 432 }
michael@0 433
michael@0 434 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
michael@0 435 #endif // !UCONFIG_NO_NORMALIZATION
michael@0 436

mercurial