1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/i18n/uspoof_wsconf.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,436 @@ 1.4 +/* 1.5 +****************************************************************************** 1.6 +* 1.7 +* Copyright (C) 2008-2013, International Business Machines 1.8 +* Corporation and others. All Rights Reserved. 1.9 +* 1.10 +****************************************************************************** 1.11 +* file name: uspoof_wsconf.cpp 1.12 +* encoding: US-ASCII 1.13 +* tab size: 8 (not used) 1.14 +* indentation:4 1.15 +* 1.16 +* created on: 2009Jan05 (refactoring earlier files) 1.17 +* created by: Andy Heninger 1.18 +* 1.19 +* Internal functions for compiling Whole Script confusable source data 1.20 +* into its binary (runtime) form. The binary data format is described 1.21 +* in uspoof_impl.h 1.22 +*/ 1.23 + 1.24 +#include "unicode/utypes.h" 1.25 +#include "unicode/uspoof.h" 1.26 + 1.27 +#if !UCONFIG_NO_NORMALIZATION 1.28 + 1.29 +#if !UCONFIG_NO_REGULAR_EXPRESSIONS 1.30 + 1.31 +#include "unicode/unorm.h" 1.32 +#include "unicode/uregex.h" 1.33 +#include "unicode/ustring.h" 1.34 +#include "cmemory.h" 1.35 +#include "scriptset.h" 1.36 +#include "uspoof_impl.h" 1.37 +#include "uhash.h" 1.38 +#include "uvector.h" 1.39 +#include "uassert.h" 1.40 +#include "uspoof_wsconf.h" 1.41 + 1.42 +U_NAMESPACE_USE 1.43 + 1.44 + 1.45 +// Regular expression for parsing a line from the Unicode file confusablesWholeScript.txt 1.46 +// Example Lines: 1.47 +// 006F ; Latn; Deva; A # (o) LATIN SMALL LETTER O 1.48 +// 0048..0049 ; Latn; Grek; A # [2] (H..I) LATIN CAPITAL LETTER H..LATIN CAPITAL LETTER I 1.49 +// | | | | 1.50 +// | | | |---- Which table, Any Case or Lower Case (A or L) 1.51 +// | | |----------Target script. We need this. 1.52 +// | |----------------Src script. Should match the script of the source 1.53 +// | code points. Beyond checking that, we don't keep it. 
1.54 +// |--------------------------------Source code points or range. 1.55 +// 1.56 +// The expression will match _all_ lines, including erroneous lines. 1.57 +// The result of the parse is returned via the contents of the (match) groups. 1.58 +static const char *parseExp = 1.59 + "(?m)" // Multi-line mode 1.60 + "^([ \\t]*(?:#.*?)?)$" // A blank or comment line. Matches Group 1. 1.61 + "|^(?:" // OR 1.62 + "\\s*([0-9A-F]{4,})(?:..([0-9A-F]{4,}))?\\s*;" // Code point range. Groups 2 and 3. 1.63 + "\\s*([A-Za-z]+)\\s*;" // The source script. Group 4. 1.64 + "\\s*([A-Za-z]+)\\s*;" // The target script. Group 5. 1.65 + "\\s*(?:(A)|(L))" // The table A or L. Group 6 or 7 1.66 + "[ \\t]*(?:#.*?)?" // Trailing commment 1.67 + ")$|" // OR 1.68 + "^(.*?)$"; // An error line. Group 8. 1.69 + // Any line not matching the preceding 1.70 + // parts of the expression.will match 1.71 + // this, and thus be flagged as an error 1.72 + 1.73 + 1.74 +// Extract a regular expression match group into a char * string. 1.75 +// The group must contain only invariant characters. 1.76 +// Used for script names 1.77 +// 1.78 +static void extractGroup( 1.79 + URegularExpression *e, int32_t group, char *destBuf, int32_t destCapacity, UErrorCode &status) { 1.80 + 1.81 + UChar ubuf[50]; 1.82 + ubuf[0] = 0; 1.83 + destBuf[0] = 0; 1.84 + int32_t len = uregex_group(e, group, ubuf, 50, &status); 1.85 + if (U_FAILURE(status) || len == -1 || len >= destCapacity) { 1.86 + return; 1.87 + } 1.88 + UnicodeString s(FALSE, ubuf, len); // Aliasing constructor 1.89 + s.extract(0, len, destBuf, destCapacity, US_INV); 1.90 +} 1.91 + 1.92 + 1.93 + 1.94 +U_NAMESPACE_BEGIN 1.95 + 1.96 +// Build the Whole Script Confusable data 1.97 +// 1.98 +// TODO: Reorganize. Either get rid of the WSConfusableDataBuilder class, 1.99 +// because everything is local to this one build function anyhow, 1.100 +// OR 1.101 +// break this function into more reasonably sized pieces, with 1.102 +// state in WSConfusableDataBuilder. 
1.103 +// 1.104 +void buildWSConfusableData(SpoofImpl *spImpl, const char * confusablesWS, 1.105 + int32_t confusablesWSLen, UParseError *pe, UErrorCode &status) 1.106 +{ 1.107 + if (U_FAILURE(status)) { 1.108 + return; 1.109 + } 1.110 + URegularExpression *parseRegexp = NULL; 1.111 + int32_t inputLen = 0; 1.112 + UChar *input = NULL; 1.113 + int32_t lineNum = 0; 1.114 + 1.115 + UVector *scriptSets = NULL; 1.116 + uint32_t rtScriptSetsCount = 2; 1.117 + 1.118 + UTrie2 *anyCaseTrie = NULL; 1.119 + UTrie2 *lowerCaseTrie = NULL; 1.120 + 1.121 + anyCaseTrie = utrie2_open(0, 0, &status); 1.122 + lowerCaseTrie = utrie2_open(0, 0, &status); 1.123 + 1.124 + UnicodeString pattern(parseExp, -1, US_INV); 1.125 + 1.126 + // The scriptSets vector provides a mapping from TRIE values to the set of scripts. 1.127 + // 1.128 + // Reserved TRIE values: 1.129 + // 0: Code point has no whole script confusables. 1.130 + // 1: Code point is of script Common or Inherited. 1.131 + // These code points do not participate in whole script confusable detection. 1.132 + // (This is logically equivalent to saying that they contain confusables in 1.133 + // all scripts) 1.134 + // 1.135 + // Because Trie values are indexes into the ScriptSets vector, pre-fill 1.136 + // vector positions 0 and 1 to avoid conflicts with the reserved values. 
1.137 + 1.138 + scriptSets = new UVector(status); 1.139 + if (scriptSets == NULL) { 1.140 + status = U_MEMORY_ALLOCATION_ERROR; 1.141 + goto cleanup; 1.142 + } 1.143 + scriptSets->addElement((void *)NULL, status); 1.144 + scriptSets->addElement((void *)NULL, status); 1.145 + 1.146 + // Convert the user input data from UTF-8 to UChar (UTF-16) 1.147 + u_strFromUTF8(NULL, 0, &inputLen, confusablesWS, confusablesWSLen, &status); 1.148 + if (status != U_BUFFER_OVERFLOW_ERROR) { 1.149 + goto cleanup; 1.150 + } 1.151 + status = U_ZERO_ERROR; 1.152 + input = static_cast<UChar *>(uprv_malloc((inputLen+1) * sizeof(UChar))); 1.153 + if (input == NULL) { 1.154 + status = U_MEMORY_ALLOCATION_ERROR; 1.155 + goto cleanup; 1.156 + } 1.157 + u_strFromUTF8(input, inputLen+1, NULL, confusablesWS, confusablesWSLen, &status); 1.158 + 1.159 + parseRegexp = uregex_open(pattern.getBuffer(), pattern.length(), 0, NULL, &status); 1.160 + 1.161 + // Zap any Byte Order Mark at the start of input. Changing it to a space is benign 1.162 + // given the syntax of the input. 1.163 + if (*input == 0xfeff) { 1.164 + *input = 0x20; 1.165 + } 1.166 + 1.167 + // Parse the input, one line per iteration of this loop. 1.168 + uregex_setText(parseRegexp, input, inputLen, &status); 1.169 + while (uregex_findNext(parseRegexp, &status)) { 1.170 + lineNum++; 1.171 + if (uregex_start(parseRegexp, 1, &status) >= 0) { 1.172 + // this was a blank or comment line. 1.173 + continue; 1.174 + } 1.175 + if (uregex_start(parseRegexp, 8, &status) >= 0) { 1.176 + // input file syntax error. 1.177 + status = U_PARSE_ERROR; 1.178 + goto cleanup; 1.179 + } 1.180 + if (U_FAILURE(status)) { 1.181 + goto cleanup; 1.182 + } 1.183 + 1.184 + // Pick up the start and optional range end code points from the parsed line. 
1.185 + UChar32 startCodePoint = SpoofImpl::ScanHex( 1.186 + input, uregex_start(parseRegexp, 2, &status), uregex_end(parseRegexp, 2, &status), status); 1.187 + UChar32 endCodePoint = startCodePoint; 1.188 + if (uregex_start(parseRegexp, 3, &status) >=0) { 1.189 + endCodePoint = SpoofImpl::ScanHex( 1.190 + input, uregex_start(parseRegexp, 3, &status), uregex_end(parseRegexp, 3, &status), status); 1.191 + } 1.192 + 1.193 + // Extract the two script names from the source line. We need these in an 8 bit 1.194 + // default encoding (will be EBCDIC on IBM mainframes) in order to pass them on 1.195 + // to the ICU u_getPropertyValueEnum() function. Ugh. 1.196 + char srcScriptName[20]; 1.197 + char targScriptName[20]; 1.198 + extractGroup(parseRegexp, 4, srcScriptName, sizeof(srcScriptName), status); 1.199 + extractGroup(parseRegexp, 5, targScriptName, sizeof(targScriptName), status); 1.200 + UScriptCode srcScript = 1.201 + static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, srcScriptName)); 1.202 + UScriptCode targScript = 1.203 + static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, targScriptName)); 1.204 + if (U_FAILURE(status)) { 1.205 + goto cleanup; 1.206 + } 1.207 + if (srcScript == USCRIPT_INVALID_CODE || targScript == USCRIPT_INVALID_CODE) { 1.208 + status = U_INVALID_FORMAT_ERROR; 1.209 + goto cleanup; 1.210 + } 1.211 + 1.212 + // select the table - (A) any case or (L) lower case only 1.213 + UTrie2 *table = anyCaseTrie; 1.214 + if (uregex_start(parseRegexp, 7, &status) >= 0) { 1.215 + table = lowerCaseTrie; 1.216 + } 1.217 + 1.218 + // Build the set of scripts containing confusable characters for 1.219 + // the code point(s) specified in this input line. 1.220 + // Sanity check that the script of the source code point is the same 1.221 + // as the source script indicated in the input file. Failure of this check is 1.222 + // an error in the input file. 
1.223 + // Include the source script in the set (needed for Mixed Script Confusable detection). 1.224 + // 1.225 + UChar32 cp; 1.226 + for (cp=startCodePoint; cp<=endCodePoint; cp++) { 1.227 + int32_t setIndex = utrie2_get32(table, cp); 1.228 + BuilderScriptSet *bsset = NULL; 1.229 + if (setIndex > 0) { 1.230 + U_ASSERT(setIndex < scriptSets->size()); 1.231 + bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(setIndex)); 1.232 + } else { 1.233 + bsset = new BuilderScriptSet(); 1.234 + if (bsset == NULL) { 1.235 + status = U_MEMORY_ALLOCATION_ERROR; 1.236 + goto cleanup; 1.237 + } 1.238 + bsset->codePoint = cp; 1.239 + bsset->trie = table; 1.240 + bsset->sset = new ScriptSet(); 1.241 + setIndex = scriptSets->size(); 1.242 + bsset->index = setIndex; 1.243 + bsset->rindex = 0; 1.244 + if (bsset->sset == NULL) { 1.245 + status = U_MEMORY_ALLOCATION_ERROR; 1.246 + goto cleanup; 1.247 + } 1.248 + scriptSets->addElement(bsset, status); 1.249 + utrie2_set32(table, cp, setIndex, &status); 1.250 + } 1.251 + bsset->sset->set(targScript, status); 1.252 + bsset->sset->set(srcScript, status); 1.253 + 1.254 + if (U_FAILURE(status)) { 1.255 + goto cleanup; 1.256 + } 1.257 + UScriptCode cpScript = uscript_getScript(cp, &status); 1.258 + if (cpScript != srcScript) { 1.259 + status = U_INVALID_FORMAT_ERROR; 1.260 + goto cleanup; 1.261 + } 1.262 + } 1.263 + } 1.264 + 1.265 + // Eliminate duplicate script sets. At this point we have a separate 1.266 + // script set for every code point that had data in the input file. 
1.267 + // 1.268 + // We eliminate underlying ScriptSet objects, not the BuildScriptSets that wrap them 1.269 + // 1.270 + // printf("Number of scriptSets: %d\n", scriptSets->size()); 1.271 + { 1.272 + int32_t duplicateCount = 0; 1.273 + rtScriptSetsCount = 2; 1.274 + for (int32_t outeri=2; outeri<scriptSets->size(); outeri++) { 1.275 + BuilderScriptSet *outerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(outeri)); 1.276 + if (outerSet->index != static_cast<uint32_t>(outeri)) { 1.277 + // This set was already identified as a duplicate. 1.278 + // It will not be allocated a position in the runtime array of ScriptSets. 1.279 + continue; 1.280 + } 1.281 + outerSet->rindex = rtScriptSetsCount++; 1.282 + for (int32_t inneri=outeri+1; inneri<scriptSets->size(); inneri++) { 1.283 + BuilderScriptSet *innerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(inneri)); 1.284 + if (*(outerSet->sset) == *(innerSet->sset) && outerSet->sset != innerSet->sset) { 1.285 + delete innerSet->sset; 1.286 + innerSet->scriptSetOwned = FALSE; 1.287 + innerSet->sset = outerSet->sset; 1.288 + innerSet->index = outeri; 1.289 + innerSet->rindex = outerSet->rindex; 1.290 + duplicateCount++; 1.291 + } 1.292 + // But this doesn't get all. We need to fix the TRIE. 1.293 + } 1.294 + } 1.295 + // printf("Number of distinct script sets: %d\n", rtScriptSetsCount); 1.296 + } 1.297 + 1.298 + 1.299 + 1.300 + // Update the Trie values to be reflect the run time script indexes (after duplicate merging). 1.301 + // (Trie Values 0 and 1 are reserved, and the corresponding slots in scriptSets 1.302 + // are unused, which is why the loop index starts at 2.) 
1.303 + { 1.304 + for (int32_t i=2; i<scriptSets->size(); i++) { 1.305 + BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i)); 1.306 + if (bSet->rindex != (uint32_t)i) { 1.307 + utrie2_set32(bSet->trie, bSet->codePoint, bSet->rindex, &status); 1.308 + } 1.309 + } 1.310 + } 1.311 + 1.312 + // For code points with script==Common or script==Inherited, 1.313 + // Set the reserved value of 1 into both Tries. These characters do not participate 1.314 + // in Whole Script Confusable detection; this reserved value is the means 1.315 + // by which they are detected. 1.316 + { 1.317 + UnicodeSet ignoreSet; 1.318 + ignoreSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status); 1.319 + UnicodeSet inheritedSet; 1.320 + inheritedSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status); 1.321 + ignoreSet.addAll(inheritedSet); 1.322 + for (int32_t rn=0; rn<ignoreSet.getRangeCount(); rn++) { 1.323 + UChar32 rangeStart = ignoreSet.getRangeStart(rn); 1.324 + UChar32 rangeEnd = ignoreSet.getRangeEnd(rn); 1.325 + utrie2_setRange32(anyCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status); 1.326 + utrie2_setRange32(lowerCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status); 1.327 + } 1.328 + } 1.329 + 1.330 + // Serialize the data to the Spoof Detector 1.331 + { 1.332 + utrie2_freeze(anyCaseTrie, UTRIE2_16_VALUE_BITS, &status); 1.333 + int32_t size = utrie2_serialize(anyCaseTrie, NULL, 0, &status); 1.334 + // printf("Any case Trie size: %d\n", size); 1.335 + if (status != U_BUFFER_OVERFLOW_ERROR) { 1.336 + goto cleanup; 1.337 + } 1.338 + status = U_ZERO_ERROR; 1.339 + spImpl->fSpoofData->fRawData->fAnyCaseTrie = spImpl->fSpoofData->fMemLimit; 1.340 + spImpl->fSpoofData->fRawData->fAnyCaseTrieLength = size; 1.341 + spImpl->fSpoofData->fAnyCaseTrie = anyCaseTrie; 1.342 + void *where = spImpl->fSpoofData->reserveSpace(size, status); 1.343 + utrie2_serialize(anyCaseTrie, where, size, &status); 1.344 + 1.345 + utrie2_freeze(lowerCaseTrie, 
UTRIE2_16_VALUE_BITS, &status); 1.346 + size = utrie2_serialize(lowerCaseTrie, NULL, 0, &status); 1.347 + // printf("Lower case Trie size: %d\n", size); 1.348 + if (status != U_BUFFER_OVERFLOW_ERROR) { 1.349 + goto cleanup; 1.350 + } 1.351 + status = U_ZERO_ERROR; 1.352 + spImpl->fSpoofData->fRawData->fLowerCaseTrie = spImpl->fSpoofData->fMemLimit; 1.353 + spImpl->fSpoofData->fRawData->fLowerCaseTrieLength = size; 1.354 + spImpl->fSpoofData->fLowerCaseTrie = lowerCaseTrie; 1.355 + where = spImpl->fSpoofData->reserveSpace(size, status); 1.356 + utrie2_serialize(lowerCaseTrie, where, size, &status); 1.357 + 1.358 + spImpl->fSpoofData->fRawData->fScriptSets = spImpl->fSpoofData->fMemLimit; 1.359 + spImpl->fSpoofData->fRawData->fScriptSetsLength = rtScriptSetsCount; 1.360 + ScriptSet *rtScriptSets = static_cast<ScriptSet *> 1.361 + (spImpl->fSpoofData->reserveSpace(rtScriptSetsCount * sizeof(ScriptSet), status)); 1.362 + uint32_t rindex = 2; 1.363 + for (int32_t i=2; i<scriptSets->size(); i++) { 1.364 + BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i)); 1.365 + if (bSet->rindex < rindex) { 1.366 + // We have already copied this script set to the serialized data. 1.367 + continue; 1.368 + } 1.369 + U_ASSERT(rindex == bSet->rindex); 1.370 + rtScriptSets[rindex] = *bSet->sset; // Assignment of a ScriptSet just copies the bits. 1.371 + rindex++; 1.372 + } 1.373 + } 1.374 + 1.375 + // Open new utrie2s from the serialized data. We don't want to keep the ones 1.376 + // we just built because we would then have two copies of the data, one internal to 1.377 + // the utries that we have already constructed, and one in the serialized data area. 1.378 + // An alternative would be to not pre-serialize the Trie data, but that makes the 1.379 + // spoof detector data different, depending on how the detector was constructed. 1.380 + // It's simpler to keep the data always the same. 
1.381 + 1.382 + spImpl->fSpoofData->fAnyCaseTrie = utrie2_openFromSerialized( 1.383 + UTRIE2_16_VALUE_BITS, 1.384 + (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fAnyCaseTrie, 1.385 + spImpl->fSpoofData->fRawData->fAnyCaseTrieLength, 1.386 + NULL, 1.387 + &status); 1.388 + 1.389 + spImpl->fSpoofData->fLowerCaseTrie = utrie2_openFromSerialized( 1.390 + UTRIE2_16_VALUE_BITS, 1.391 + (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fLowerCaseTrie, 1.392 + spImpl->fSpoofData->fRawData->fAnyCaseTrieLength, 1.393 + NULL, 1.394 + &status); 1.395 + 1.396 + 1.397 + 1.398 +cleanup: 1.399 + if (U_FAILURE(status)) { 1.400 + pe->line = lineNum; 1.401 + } 1.402 + uregex_close(parseRegexp); 1.403 + uprv_free(input); 1.404 + 1.405 + int32_t i; 1.406 + if (scriptSets != NULL) { 1.407 + for (i=0; i<scriptSets->size(); i++) { 1.408 + BuilderScriptSet *bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i)); 1.409 + delete bsset; 1.410 + } 1.411 + delete scriptSets; 1.412 + } 1.413 + utrie2_close(anyCaseTrie); 1.414 + utrie2_close(lowerCaseTrie); 1.415 + return; 1.416 +} 1.417 + 1.418 +U_NAMESPACE_END 1.419 + 1.420 + 1.421 + 1.422 +BuilderScriptSet::BuilderScriptSet() { 1.423 + codePoint = -1; 1.424 + trie = NULL; 1.425 + sset = NULL; 1.426 + index = 0; 1.427 + rindex = 0; 1.428 + scriptSetOwned = TRUE; 1.429 +} 1.430 + 1.431 +BuilderScriptSet::~BuilderScriptSet() { 1.432 + if (scriptSetOwned) { 1.433 + delete sset; 1.434 + } 1.435 +} 1.436 + 1.437 +#endif 1.438 +#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS 1.439 +