michael@0: /* michael@0: ****************************************************************************** michael@0: * michael@0: * Copyright (C) 2008-2013, International Business Machines michael@0: * Corporation and others. All Rights Reserved. michael@0: * michael@0: ****************************************************************************** michael@0: * file name: uspoof_wsconf.cpp michael@0: * encoding: US-ASCII michael@0: * tab size: 8 (not used) michael@0: * indentation:4 michael@0: * michael@0: * created on: 2009Jan05 (refactoring earlier files) michael@0: * created by: Andy Heninger michael@0: * michael@0: * Internal functions for compililing Whole Script confusable source data michael@0: * into its binary (runtime) form. The binary data format is described michael@0: * in uspoof_impl.h michael@0: */ michael@0: michael@0: #include "unicode/utypes.h" michael@0: #include "unicode/uspoof.h" michael@0: michael@0: #if !UCONFIG_NO_NORMALIZATION michael@0: michael@0: #if !UCONFIG_NO_REGULAR_EXPRESSIONS michael@0: michael@0: #include "unicode/unorm.h" michael@0: #include "unicode/uregex.h" michael@0: #include "unicode/ustring.h" michael@0: #include "cmemory.h" michael@0: #include "scriptset.h" michael@0: #include "uspoof_impl.h" michael@0: #include "uhash.h" michael@0: #include "uvector.h" michael@0: #include "uassert.h" michael@0: #include "uspoof_wsconf.h" michael@0: michael@0: U_NAMESPACE_USE michael@0: michael@0: michael@0: // Regular expression for parsing a line from the Unicode file confusablesWholeScript.txt michael@0: // Example Lines: michael@0: // 006F ; Latn; Deva; A # (o) LATIN SMALL LETTER O michael@0: // 0048..0049 ; Latn; Grek; A # [2] (H..I) LATIN CAPITAL LETTER H..LATIN CAPITAL LETTER I michael@0: // | | | | michael@0: // | | | |---- Which table, Any Case or Lower Case (A or L) michael@0: // | | |----------Target script. We need this. michael@0: // | |----------------Src script. Should match the script of the source michael@0: // | code points. Beyond checking that, we don't keep it. michael@0: // |--------------------------------Source code points or range. michael@0: // michael@0: // The expression will match _all_ lines, including erroneous lines. michael@0: // The result of the parse is returned via the contents of the (match) groups. michael@0: static const char *parseExp = michael@0: "(?m)" // Multi-line mode michael@0: "^([ \\t]*(?:#.*?)?)$" // A blank or comment line. Matches Group 1. michael@0: "|^(?:" // OR michael@0: "\\s*([0-9A-F]{4,})(?:..([0-9A-F]{4,}))?\\s*;" // Code point range. Groups 2 and 3. michael@0: "\\s*([A-Za-z]+)\\s*;" // The source script. Group 4. michael@0: "\\s*([A-Za-z]+)\\s*;" // The target script. Group 5. michael@0: "\\s*(?:(A)|(L))" // The table A or L. Group 6 or 7 michael@0: "[ \\t]*(?:#.*?)?" // Trailing commment michael@0: ")$|" // OR michael@0: "^(.*?)$"; // An error line. Group 8. michael@0: // Any line not matching the preceding michael@0: // parts of the expression.will match michael@0: // this, and thus be flagged as an error michael@0: michael@0: michael@0: // Extract a regular expression match group into a char * string. michael@0: // The group must contain only invariant characters. michael@0: // Used for script names michael@0: // michael@0: static void extractGroup( michael@0: URegularExpression *e, int32_t group, char *destBuf, int32_t destCapacity, UErrorCode &status) { michael@0: michael@0: UChar ubuf[50]; michael@0: ubuf[0] = 0; michael@0: destBuf[0] = 0; michael@0: int32_t len = uregex_group(e, group, ubuf, 50, &status); michael@0: if (U_FAILURE(status) || len == -1 || len >= destCapacity) { michael@0: return; michael@0: } michael@0: UnicodeString s(FALSE, ubuf, len); // Aliasing constructor michael@0: s.extract(0, len, destBuf, destCapacity, US_INV); michael@0: } michael@0: michael@0: michael@0: michael@0: U_NAMESPACE_BEGIN michael@0: michael@0: // Build the Whole Script Confusable data michael@0: // michael@0: // TODO: Reorganize. Either get rid of the WSConfusableDataBuilder class, michael@0: // because everything is local to this one build function anyhow, michael@0: // OR michael@0: // break this function into more reasonably sized pieces, with michael@0: // state in WSConfusableDataBuilder. michael@0: // michael@0: void buildWSConfusableData(SpoofImpl *spImpl, const char * confusablesWS, michael@0: int32_t confusablesWSLen, UParseError *pe, UErrorCode &status) michael@0: { michael@0: if (U_FAILURE(status)) { michael@0: return; michael@0: } michael@0: URegularExpression *parseRegexp = NULL; michael@0: int32_t inputLen = 0; michael@0: UChar *input = NULL; michael@0: int32_t lineNum = 0; michael@0: michael@0: UVector *scriptSets = NULL; michael@0: uint32_t rtScriptSetsCount = 2; michael@0: michael@0: UTrie2 *anyCaseTrie = NULL; michael@0: UTrie2 *lowerCaseTrie = NULL; michael@0: michael@0: anyCaseTrie = utrie2_open(0, 0, &status); michael@0: lowerCaseTrie = utrie2_open(0, 0, &status); michael@0: michael@0: UnicodeString pattern(parseExp, -1, US_INV); michael@0: michael@0: // The scriptSets vector provides a mapping from TRIE values to the set of scripts. michael@0: // michael@0: // Reserved TRIE values: michael@0: // 0: Code point has no whole script confusables. michael@0: // 1: Code point is of script Common or Inherited. michael@0: // These code points do not participate in whole script confusable detection. michael@0: // (This is logically equivalent to saying that they contain confusables in michael@0: // all scripts) michael@0: // michael@0: // Because Trie values are indexes into the ScriptSets vector, pre-fill michael@0: // vector positions 0 and 1 to avoid conflicts with the reserved values. michael@0: michael@0: scriptSets = new UVector(status); michael@0: if (scriptSets == NULL) { michael@0: status = U_MEMORY_ALLOCATION_ERROR; michael@0: goto cleanup; michael@0: } michael@0: scriptSets->addElement((void *)NULL, status); michael@0: scriptSets->addElement((void *)NULL, status); michael@0: michael@0: // Convert the user input data from UTF-8 to UChar (UTF-16) michael@0: u_strFromUTF8(NULL, 0, &inputLen, confusablesWS, confusablesWSLen, &status); michael@0: if (status != U_BUFFER_OVERFLOW_ERROR) { michael@0: goto cleanup; michael@0: } michael@0: status = U_ZERO_ERROR; michael@0: input = static_cast(uprv_malloc((inputLen+1) * sizeof(UChar))); michael@0: if (input == NULL) { michael@0: status = U_MEMORY_ALLOCATION_ERROR; michael@0: goto cleanup; michael@0: } michael@0: u_strFromUTF8(input, inputLen+1, NULL, confusablesWS, confusablesWSLen, &status); michael@0: michael@0: parseRegexp = uregex_open(pattern.getBuffer(), pattern.length(), 0, NULL, &status); michael@0: michael@0: // Zap any Byte Order Mark at the start of input. Changing it to a space is benign michael@0: // given the syntax of the input. michael@0: if (*input == 0xfeff) { michael@0: *input = 0x20; michael@0: } michael@0: michael@0: // Parse the input, one line per iteration of this loop. michael@0: uregex_setText(parseRegexp, input, inputLen, &status); michael@0: while (uregex_findNext(parseRegexp, &status)) { michael@0: lineNum++; michael@0: if (uregex_start(parseRegexp, 1, &status) >= 0) { michael@0: // this was a blank or comment line. michael@0: continue; michael@0: } michael@0: if (uregex_start(parseRegexp, 8, &status) >= 0) { michael@0: // input file syntax error. michael@0: status = U_PARSE_ERROR; michael@0: goto cleanup; michael@0: } michael@0: if (U_FAILURE(status)) { michael@0: goto cleanup; michael@0: } michael@0: michael@0: // Pick up the start and optional range end code points from the parsed line. michael@0: UChar32 startCodePoint = SpoofImpl::ScanHex( michael@0: input, uregex_start(parseRegexp, 2, &status), uregex_end(parseRegexp, 2, &status), status); michael@0: UChar32 endCodePoint = startCodePoint; michael@0: if (uregex_start(parseRegexp, 3, &status) >=0) { michael@0: endCodePoint = SpoofImpl::ScanHex( michael@0: input, uregex_start(parseRegexp, 3, &status), uregex_end(parseRegexp, 3, &status), status); michael@0: } michael@0: michael@0: // Extract the two script names from the source line. We need these in an 8 bit michael@0: // default encoding (will be EBCDIC on IBM mainframes) in order to pass them on michael@0: // to the ICU u_getPropertyValueEnum() function. Ugh. michael@0: char srcScriptName[20]; michael@0: char targScriptName[20]; michael@0: extractGroup(parseRegexp, 4, srcScriptName, sizeof(srcScriptName), status); michael@0: extractGroup(parseRegexp, 5, targScriptName, sizeof(targScriptName), status); michael@0: UScriptCode srcScript = michael@0: static_cast(u_getPropertyValueEnum(UCHAR_SCRIPT, srcScriptName)); michael@0: UScriptCode targScript = michael@0: static_cast(u_getPropertyValueEnum(UCHAR_SCRIPT, targScriptName)); michael@0: if (U_FAILURE(status)) { michael@0: goto cleanup; michael@0: } michael@0: if (srcScript == USCRIPT_INVALID_CODE || targScript == USCRIPT_INVALID_CODE) { michael@0: status = U_INVALID_FORMAT_ERROR; michael@0: goto cleanup; michael@0: } michael@0: michael@0: // select the table - (A) any case or (L) lower case only michael@0: UTrie2 *table = anyCaseTrie; michael@0: if (uregex_start(parseRegexp, 7, &status) >= 0) { michael@0: table = lowerCaseTrie; michael@0: } michael@0: michael@0: // Build the set of scripts containing confusable characters for michael@0: // the code point(s) specified in this input line. michael@0: // Sanity check that the script of the source code point is the same michael@0: // as the source script indicated in the input file. Failure of this check is michael@0: // an error in the input file. michael@0: // Include the source script in the set (needed for Mixed Script Confusable detection). michael@0: // michael@0: UChar32 cp; michael@0: for (cp=startCodePoint; cp<=endCodePoint; cp++) { michael@0: int32_t setIndex = utrie2_get32(table, cp); michael@0: BuilderScriptSet *bsset = NULL; michael@0: if (setIndex > 0) { michael@0: U_ASSERT(setIndex < scriptSets->size()); michael@0: bsset = static_cast(scriptSets->elementAt(setIndex)); michael@0: } else { michael@0: bsset = new BuilderScriptSet(); michael@0: if (bsset == NULL) { michael@0: status = U_MEMORY_ALLOCATION_ERROR; michael@0: goto cleanup; michael@0: } michael@0: bsset->codePoint = cp; michael@0: bsset->trie = table; michael@0: bsset->sset = new ScriptSet(); michael@0: setIndex = scriptSets->size(); michael@0: bsset->index = setIndex; michael@0: bsset->rindex = 0; michael@0: if (bsset->sset == NULL) { michael@0: status = U_MEMORY_ALLOCATION_ERROR; michael@0: goto cleanup; michael@0: } michael@0: scriptSets->addElement(bsset, status); michael@0: utrie2_set32(table, cp, setIndex, &status); michael@0: } michael@0: bsset->sset->set(targScript, status); michael@0: bsset->sset->set(srcScript, status); michael@0: michael@0: if (U_FAILURE(status)) { michael@0: goto cleanup; michael@0: } michael@0: UScriptCode cpScript = uscript_getScript(cp, &status); michael@0: if (cpScript != srcScript) { michael@0: status = U_INVALID_FORMAT_ERROR; michael@0: goto cleanup; michael@0: } michael@0: } michael@0: } michael@0: michael@0: // Eliminate duplicate script sets. At this point we have a separate michael@0: // script set for every code point that had data in the input file. michael@0: // michael@0: // We eliminate underlying ScriptSet objects, not the BuildScriptSets that wrap them michael@0: // michael@0: // printf("Number of scriptSets: %d\n", scriptSets->size()); michael@0: { michael@0: int32_t duplicateCount = 0; michael@0: rtScriptSetsCount = 2; michael@0: for (int32_t outeri=2; outerisize(); outeri++) { michael@0: BuilderScriptSet *outerSet = static_cast(scriptSets->elementAt(outeri)); michael@0: if (outerSet->index != static_cast(outeri)) { michael@0: // This set was already identified as a duplicate. michael@0: // It will not be allocated a position in the runtime array of ScriptSets. michael@0: continue; michael@0: } michael@0: outerSet->rindex = rtScriptSetsCount++; michael@0: for (int32_t inneri=outeri+1; innerisize(); inneri++) { michael@0: BuilderScriptSet *innerSet = static_cast(scriptSets->elementAt(inneri)); michael@0: if (*(outerSet->sset) == *(innerSet->sset) && outerSet->sset != innerSet->sset) { michael@0: delete innerSet->sset; michael@0: innerSet->scriptSetOwned = FALSE; michael@0: innerSet->sset = outerSet->sset; michael@0: innerSet->index = outeri; michael@0: innerSet->rindex = outerSet->rindex; michael@0: duplicateCount++; michael@0: } michael@0: // But this doesn't get all. We need to fix the TRIE. michael@0: } michael@0: } michael@0: // printf("Number of distinct script sets: %d\n", rtScriptSetsCount); michael@0: } michael@0: michael@0: michael@0: michael@0: // Update the Trie values to be reflect the run time script indexes (after duplicate merging). michael@0: // (Trie Values 0 and 1 are reserved, and the corresponding slots in scriptSets michael@0: // are unused, which is why the loop index starts at 2.) michael@0: { michael@0: for (int32_t i=2; isize(); i++) { michael@0: BuilderScriptSet *bSet = static_cast(scriptSets->elementAt(i)); michael@0: if (bSet->rindex != (uint32_t)i) { michael@0: utrie2_set32(bSet->trie, bSet->codePoint, bSet->rindex, &status); michael@0: } michael@0: } michael@0: } michael@0: michael@0: // For code points with script==Common or script==Inherited, michael@0: // Set the reserved value of 1 into both Tries. These characters do not participate michael@0: // in Whole Script Confusable detection; this reserved value is the means michael@0: // by which they are detected. michael@0: { michael@0: UnicodeSet ignoreSet; michael@0: ignoreSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status); michael@0: UnicodeSet inheritedSet; michael@0: inheritedSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status); michael@0: ignoreSet.addAll(inheritedSet); michael@0: for (int32_t rn=0; rnfSpoofData->fRawData->fAnyCaseTrie = spImpl->fSpoofData->fMemLimit; michael@0: spImpl->fSpoofData->fRawData->fAnyCaseTrieLength = size; michael@0: spImpl->fSpoofData->fAnyCaseTrie = anyCaseTrie; michael@0: void *where = spImpl->fSpoofData->reserveSpace(size, status); michael@0: utrie2_serialize(anyCaseTrie, where, size, &status); michael@0: michael@0: utrie2_freeze(lowerCaseTrie, UTRIE2_16_VALUE_BITS, &status); michael@0: size = utrie2_serialize(lowerCaseTrie, NULL, 0, &status); michael@0: // printf("Lower case Trie size: %d\n", size); michael@0: if (status != U_BUFFER_OVERFLOW_ERROR) { michael@0: goto cleanup; michael@0: } michael@0: status = U_ZERO_ERROR; michael@0: spImpl->fSpoofData->fRawData->fLowerCaseTrie = spImpl->fSpoofData->fMemLimit; michael@0: spImpl->fSpoofData->fRawData->fLowerCaseTrieLength = size; michael@0: spImpl->fSpoofData->fLowerCaseTrie = lowerCaseTrie; michael@0: where = spImpl->fSpoofData->reserveSpace(size, status); michael@0: utrie2_serialize(lowerCaseTrie, where, size, &status); michael@0: michael@0: spImpl->fSpoofData->fRawData->fScriptSets = spImpl->fSpoofData->fMemLimit; michael@0: spImpl->fSpoofData->fRawData->fScriptSetsLength = rtScriptSetsCount; michael@0: ScriptSet *rtScriptSets = static_cast michael@0: (spImpl->fSpoofData->reserveSpace(rtScriptSetsCount * sizeof(ScriptSet), status)); michael@0: uint32_t rindex = 2; michael@0: for (int32_t i=2; isize(); i++) { michael@0: BuilderScriptSet *bSet = static_cast(scriptSets->elementAt(i)); michael@0: if (bSet->rindex < rindex) { michael@0: // We have already copied this script set to the serialized data. michael@0: continue; michael@0: } michael@0: U_ASSERT(rindex == bSet->rindex); michael@0: rtScriptSets[rindex] = *bSet->sset; // Assignment of a ScriptSet just copies the bits. michael@0: rindex++; michael@0: } michael@0: } michael@0: michael@0: // Open new utrie2s from the serialized data. We don't want to keep the ones michael@0: // we just built because we would then have two copies of the data, one internal to michael@0: // the utries that we have already constructed, and one in the serialized data area. michael@0: // An alternative would be to not pre-serialize the Trie data, but that makes the michael@0: // spoof detector data different, depending on how the detector was constructed. michael@0: // It's simpler to keep the data always the same. michael@0: michael@0: spImpl->fSpoofData->fAnyCaseTrie = utrie2_openFromSerialized( michael@0: UTRIE2_16_VALUE_BITS, michael@0: (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fAnyCaseTrie, michael@0: spImpl->fSpoofData->fRawData->fAnyCaseTrieLength, michael@0: NULL, michael@0: &status); michael@0: michael@0: spImpl->fSpoofData->fLowerCaseTrie = utrie2_openFromSerialized( michael@0: UTRIE2_16_VALUE_BITS, michael@0: (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fLowerCaseTrie, michael@0: spImpl->fSpoofData->fRawData->fAnyCaseTrieLength, michael@0: NULL, michael@0: &status); michael@0: michael@0: michael@0: michael@0: cleanup: michael@0: if (U_FAILURE(status)) { michael@0: pe->line = lineNum; michael@0: } michael@0: uregex_close(parseRegexp); michael@0: uprv_free(input); michael@0: michael@0: int32_t i; michael@0: if (scriptSets != NULL) { michael@0: for (i=0; isize(); i++) { michael@0: BuilderScriptSet *bsset = static_cast(scriptSets->elementAt(i)); michael@0: delete bsset; michael@0: } michael@0: delete scriptSets; michael@0: } michael@0: utrie2_close(anyCaseTrie); michael@0: utrie2_close(lowerCaseTrie); michael@0: return; michael@0: } michael@0: michael@0: U_NAMESPACE_END michael@0: michael@0: michael@0: michael@0: BuilderScriptSet::BuilderScriptSet() { michael@0: codePoint = -1; michael@0: trie = NULL; michael@0: sset = NULL; michael@0: index = 0; michael@0: rindex = 0; michael@0: scriptSetOwned = TRUE; michael@0: } michael@0: michael@0: BuilderScriptSet::~BuilderScriptSet() { michael@0: if (scriptSetOwned) { michael@0: delete sset; michael@0: } michael@0: } michael@0: michael@0: #endif michael@0: #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS michael@0: