intl/icu/source/i18n/uspoof_wsconf.cpp

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

     1 /*
     2 ******************************************************************************
     3 *
     4 *   Copyright (C) 2008-2013, International Business Machines
     5 *   Corporation and others.  All Rights Reserved.
     6 *
     7 ******************************************************************************
     8 *   file name:  uspoof_wsconf.cpp
     9 *   encoding:   US-ASCII
    10 *   tab size:   8 (not used)
    11 *   indentation:4
    12 *
    13 *   created on: 2009Jan05  (refactoring earlier files)
    14 *   created by: Andy Heninger
    15 *
    16 *   Internal functions for compiling Whole Script confusable source data
    17 *   into its binary (runtime) form.  The binary data format is described
    18 *   in uspoof_impl.h
    19 */
    21 #include "unicode/utypes.h"
    22 #include "unicode/uspoof.h"
    24 #if !UCONFIG_NO_NORMALIZATION
    26 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 
    28 #include "unicode/unorm.h"
    29 #include "unicode/uregex.h"
    30 #include "unicode/ustring.h"
    31 #include "cmemory.h"
    32 #include "scriptset.h"
    33 #include "uspoof_impl.h"
    34 #include "uhash.h"
    35 #include "uvector.h"
    36 #include "uassert.h"
    37 #include "uspoof_wsconf.h"
    39 U_NAMESPACE_USE
    // Regular expression for parsing a line from the Unicode file confusablesWholeScript.txt
    // Example Lines:
    //   006F          ; Latn; Deva; A #      (o)  LATIN SMALL LETTER O
    //   0048..0049    ; Latn; Grek; A #  [2] (H..I)  LATIN CAPITAL LETTER H..LATIN CAPITAL LETTER I
    //    |               |     |    |
    //    |               |     |    |---- Which table, Any Case or Lower Case (A or L)
    //    |               |     |----------Target script.   We need this.
    //    |               |----------------Src script.  Should match the script of the source
    //    |                                code points.  Beyond checking that, we don't keep it.
    //    |--------------------------------Source code points or range.
    //
    // The expression will match _all_ lines, including erroneous lines.
    // The result of the parse is returned via the contents of the (match) groups.
    //
    // The ".." range separator is escaped so that only two literal dots are accepted
    // between the start and end code points; an unescaped ".." would match any two
    // characters and let malformed ranges through as valid data.
    static const char *parseExp =
            "(?m)"                                         // Multi-line mode
            "^([ \\t]*(?:#.*?)?)$"                         // A blank or comment line.  Matches Group 1.
            "|^(?:"                                        //   OR
            "\\s*([0-9A-F]{4,})(?:\\.\\.([0-9A-F]{4,}))?\\s*;" // Code point range.  Groups 2 and 3.
            "\\s*([A-Za-z]+)\\s*;"                         // The source script.  Group 4.
            "\\s*([A-Za-z]+)\\s*;"                         // The target script.  Group 5.
            "\\s*(?:(A)|(L))"                              // The table A or L.   Group 6 or 7
            "[ \\t]*(?:#.*?)?"                             // Trailing comment
            ")$|"                                          //   OR
            "^(.*?)$";                                     // An error line.      Group 8.
                                                           //    Any line not matching the preceding
                                                           //    parts of the expression will match
                                                           //    this, and thus be flagged as an error
    71 // Extract a regular expression match group into a char * string.
    72 //    The group must contain only invariant characters.
    73 //    Used for script names
    74 // 
    75 static void extractGroup(
    76     URegularExpression *e, int32_t group, char *destBuf, int32_t destCapacity, UErrorCode &status) {
    78     UChar ubuf[50];
    79     ubuf[0] = 0;
    80     destBuf[0] = 0;
    81     int32_t len = uregex_group(e, group, ubuf, 50, &status);
    82     if (U_FAILURE(status) || len == -1 || len >= destCapacity) {
    83         return;
    84     }
    85     UnicodeString s(FALSE, ubuf, len);   // Aliasing constructor
    86     s.extract(0, len, destBuf, destCapacity, US_INV);
    87 }
    91 U_NAMESPACE_BEGIN
    93 //  Build the Whole Script Confusable data
    94 //
    95 //     TODO:  Reorganize.  Either get rid of the WSConfusableDataBuilder class,
    96 //                         because everything is local to this one build function anyhow,
    97 //                           OR
    98 //                         break this function into more reasonably sized pieces, with
    99 //                         state in WSConfusableDataBuilder.
   100 //
   101 void buildWSConfusableData(SpoofImpl *spImpl, const char * confusablesWS,
   102           int32_t confusablesWSLen, UParseError *pe, UErrorCode &status) 
   103 {
   104     if (U_FAILURE(status)) {
   105         return;
   106     }
   107     URegularExpression *parseRegexp = NULL;
   108     int32_t             inputLen    = 0;
   109     UChar              *input       = NULL;
   110     int32_t             lineNum     = 0;
   112     UVector            *scriptSets        = NULL;
   113     uint32_t            rtScriptSetsCount = 2;
   115     UTrie2             *anyCaseTrie   = NULL;
   116     UTrie2             *lowerCaseTrie = NULL;
   118     anyCaseTrie = utrie2_open(0, 0, &status);
   119     lowerCaseTrie = utrie2_open(0, 0, &status);
   121     UnicodeString pattern(parseExp, -1, US_INV);
   123     // The scriptSets vector provides a mapping from TRIE values to the set of scripts.
   124     //
   125     // Reserved TRIE values:
   126     //   0:  Code point has no whole script confusables.
   127     //   1:  Code point is of script Common or Inherited.
   128     //       These code points do not participate in whole script confusable detection.
   129     //       (This is logically equivalent to saying that they contain confusables in
   130     //        all scripts)
   131     //
   132     // Because Trie values are indexes into the ScriptSets vector, pre-fill
   133     // vector positions 0 and 1 to avoid conflicts with the reserved values.
   135     scriptSets = new UVector(status);
   136     if (scriptSets == NULL) {
   137         status = U_MEMORY_ALLOCATION_ERROR;
   138         goto cleanup;
   139     }
   140     scriptSets->addElement((void *)NULL, status);
   141     scriptSets->addElement((void *)NULL, status);
   143     // Convert the user input data from UTF-8 to UChar (UTF-16)
   144     u_strFromUTF8(NULL, 0, &inputLen, confusablesWS, confusablesWSLen, &status);
   145     if (status != U_BUFFER_OVERFLOW_ERROR) {
   146         goto cleanup;
   147     }
   148     status = U_ZERO_ERROR;
   149     input = static_cast<UChar *>(uprv_malloc((inputLen+1) * sizeof(UChar)));
   150     if (input == NULL) {
   151         status = U_MEMORY_ALLOCATION_ERROR;
   152         goto cleanup;
   153     }
   154     u_strFromUTF8(input, inputLen+1, NULL, confusablesWS, confusablesWSLen, &status);
   156     parseRegexp = uregex_open(pattern.getBuffer(), pattern.length(), 0, NULL, &status);
   158     // Zap any Byte Order Mark at the start of input.  Changing it to a space is benign
   159     //   given the syntax of the input.
   160     if (*input == 0xfeff) {
   161         *input = 0x20;
   162     }
   164     // Parse the input, one line per iteration of this loop.
   165     uregex_setText(parseRegexp, input, inputLen, &status);
   166     while (uregex_findNext(parseRegexp, &status)) {
   167         lineNum++;
   168         if (uregex_start(parseRegexp, 1, &status) >= 0) {
   169             // this was a blank or comment line.
   170             continue;
   171         }
   172         if (uregex_start(parseRegexp, 8, &status) >= 0) {
   173             // input file syntax error.
   174             status = U_PARSE_ERROR;
   175             goto cleanup;
   176         }
   177         if (U_FAILURE(status)) {
   178             goto cleanup;
   179         }
   181         // Pick up the start and optional range end code points from the parsed line.
   182         UChar32  startCodePoint = SpoofImpl::ScanHex(
   183             input, uregex_start(parseRegexp, 2, &status), uregex_end(parseRegexp, 2, &status), status);
   184         UChar32  endCodePoint = startCodePoint;
   185         if (uregex_start(parseRegexp, 3, &status) >=0) {
   186             endCodePoint = SpoofImpl::ScanHex(
   187                 input, uregex_start(parseRegexp, 3, &status), uregex_end(parseRegexp, 3, &status), status);
   188         }
   190         // Extract the two script names from the source line.  We need these in an 8 bit
   191         //   default encoding (will be EBCDIC on IBM mainframes) in order to pass them on
   192         //   to the ICU u_getPropertyValueEnum() function.  Ugh.
   193         char  srcScriptName[20];
   194         char  targScriptName[20];
   195         extractGroup(parseRegexp, 4, srcScriptName, sizeof(srcScriptName), status);
   196         extractGroup(parseRegexp, 5, targScriptName, sizeof(targScriptName), status);
   197         UScriptCode srcScript  =
   198             static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, srcScriptName));
   199         UScriptCode targScript =
   200             static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, targScriptName));
   201         if (U_FAILURE(status)) {
   202             goto cleanup;
   203         }
   204         if (srcScript == USCRIPT_INVALID_CODE || targScript == USCRIPT_INVALID_CODE) {
   205             status = U_INVALID_FORMAT_ERROR;
   206             goto cleanup;
   207         }
   209         // select the table - (A) any case or (L) lower case only
   210         UTrie2 *table = anyCaseTrie;
   211         if (uregex_start(parseRegexp, 7, &status) >= 0) {
   212             table = lowerCaseTrie;
   213         }
   215         // Build the set of scripts containing confusable characters for
   216         //   the code point(s) specified in this input line.
   217         // Sanity check that the script of the source code point is the same
   218         //   as the source script indicated in the input file.  Failure of this check is
   219         //   an error in the input file.
   220         // Include the source script in the set (needed for Mixed Script Confusable detection).
   221         //
   222         UChar32 cp;
   223         for (cp=startCodePoint; cp<=endCodePoint; cp++) {
   224             int32_t setIndex = utrie2_get32(table, cp);
   225             BuilderScriptSet *bsset = NULL;
   226             if (setIndex > 0) {
   227                 U_ASSERT(setIndex < scriptSets->size());
   228                 bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(setIndex));
   229             } else {
   230                 bsset = new BuilderScriptSet();
   231                 if (bsset == NULL) {
   232                     status = U_MEMORY_ALLOCATION_ERROR;
   233                     goto cleanup;
   234                 }
   235                 bsset->codePoint = cp;
   236                 bsset->trie = table;
   237                 bsset->sset = new ScriptSet();
   238                 setIndex = scriptSets->size();
   239                 bsset->index = setIndex;
   240                 bsset->rindex = 0;
   241                 if (bsset->sset == NULL) {
   242                     status = U_MEMORY_ALLOCATION_ERROR;
   243                     goto cleanup;
   244                 }
   245                 scriptSets->addElement(bsset, status);
   246                 utrie2_set32(table, cp, setIndex, &status);
   247             }
   248             bsset->sset->set(targScript, status);
   249             bsset->sset->set(srcScript, status);
   251             if (U_FAILURE(status)) {
   252                 goto cleanup;
   253             }
   254             UScriptCode cpScript = uscript_getScript(cp, &status);
   255             if (cpScript != srcScript) {
   256                 status = U_INVALID_FORMAT_ERROR;
   257                 goto cleanup;
   258             }
   259         }
   260     }
   262     // Eliminate duplicate script sets.  At this point we have a separate
   263     // script set for every code point that had data in the input file.
   264     //
   265     // We eliminate underlying ScriptSet objects, not the BuildScriptSets that wrap them
   266     //
   267     // printf("Number of scriptSets: %d\n", scriptSets->size());
   268     {
   269         int32_t duplicateCount = 0;
   270         rtScriptSetsCount = 2;
   271         for (int32_t outeri=2; outeri<scriptSets->size(); outeri++) {
   272             BuilderScriptSet *outerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(outeri));
   273             if (outerSet->index != static_cast<uint32_t>(outeri)) {
   274                 // This set was already identified as a duplicate.
   275                 //   It will not be allocated a position in the runtime array of ScriptSets.
   276                 continue;
   277             }
   278             outerSet->rindex = rtScriptSetsCount++;
   279             for (int32_t inneri=outeri+1; inneri<scriptSets->size(); inneri++) {
   280                 BuilderScriptSet *innerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(inneri));
   281                 if (*(outerSet->sset) == *(innerSet->sset) && outerSet->sset != innerSet->sset) {
   282                     delete innerSet->sset;
   283                     innerSet->scriptSetOwned = FALSE;
   284                     innerSet->sset = outerSet->sset;
   285                     innerSet->index = outeri;
   286                     innerSet->rindex = outerSet->rindex;
   287                     duplicateCount++;
   288                 }
   289                 // But this doesn't get all.  We need to fix the TRIE.
   290             }
   291         }
   292         // printf("Number of distinct script sets: %d\n", rtScriptSetsCount);
   293     }
   297     // Update the Trie values to be reflect the run time script indexes (after duplicate merging).
   298     //    (Trie Values 0 and 1 are reserved, and the corresponding slots in scriptSets
   299     //     are unused, which is why the loop index starts at 2.)
   300     {
   301         for (int32_t i=2; i<scriptSets->size(); i++) {
   302             BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
   303             if (bSet->rindex != (uint32_t)i) {
   304                 utrie2_set32(bSet->trie, bSet->codePoint, bSet->rindex, &status);
   305             }
   306         }
   307     }
   309     // For code points with script==Common or script==Inherited,
   310     //   Set the reserved value of 1 into both Tries.  These characters do not participate
   311     //   in Whole Script Confusable detection; this reserved value is the means
   312     //   by which they are detected.
   313     {
   314         UnicodeSet ignoreSet;
   315         ignoreSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status);
   316         UnicodeSet inheritedSet;
   317         inheritedSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status);
   318         ignoreSet.addAll(inheritedSet);
   319         for (int32_t rn=0; rn<ignoreSet.getRangeCount(); rn++) {
   320             UChar32 rangeStart = ignoreSet.getRangeStart(rn);
   321             UChar32 rangeEnd   = ignoreSet.getRangeEnd(rn);
   322             utrie2_setRange32(anyCaseTrie,   rangeStart, rangeEnd, 1, TRUE, &status);
   323             utrie2_setRange32(lowerCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status);
   324         }
   325     }
   327     // Serialize the data to the Spoof Detector
   328     {
   329         utrie2_freeze(anyCaseTrie,   UTRIE2_16_VALUE_BITS, &status);
   330         int32_t size = utrie2_serialize(anyCaseTrie, NULL, 0, &status);
   331         // printf("Any case Trie size: %d\n", size);
   332         if (status != U_BUFFER_OVERFLOW_ERROR) {
   333             goto cleanup;
   334         }
   335         status = U_ZERO_ERROR;
   336         spImpl->fSpoofData->fRawData->fAnyCaseTrie = spImpl->fSpoofData->fMemLimit;
   337         spImpl->fSpoofData->fRawData->fAnyCaseTrieLength = size;
   338         spImpl->fSpoofData->fAnyCaseTrie = anyCaseTrie;
   339         void *where = spImpl->fSpoofData->reserveSpace(size, status);
   340         utrie2_serialize(anyCaseTrie, where, size, &status);
   342         utrie2_freeze(lowerCaseTrie, UTRIE2_16_VALUE_BITS, &status);
   343         size = utrie2_serialize(lowerCaseTrie, NULL, 0, &status);
   344         // printf("Lower case Trie size: %d\n", size);
   345         if (status != U_BUFFER_OVERFLOW_ERROR) {
   346             goto cleanup;
   347         }
   348         status = U_ZERO_ERROR;
   349         spImpl->fSpoofData->fRawData->fLowerCaseTrie = spImpl->fSpoofData->fMemLimit;
   350         spImpl->fSpoofData->fRawData->fLowerCaseTrieLength = size;
   351         spImpl->fSpoofData->fLowerCaseTrie = lowerCaseTrie;
   352         where = spImpl->fSpoofData->reserveSpace(size, status);
   353         utrie2_serialize(lowerCaseTrie, where, size, &status);
   355         spImpl->fSpoofData->fRawData->fScriptSets = spImpl->fSpoofData->fMemLimit;
   356         spImpl->fSpoofData->fRawData->fScriptSetsLength = rtScriptSetsCount;
   357         ScriptSet *rtScriptSets =  static_cast<ScriptSet *>
   358             (spImpl->fSpoofData->reserveSpace(rtScriptSetsCount * sizeof(ScriptSet), status));
   359         uint32_t rindex = 2;
   360         for (int32_t i=2; i<scriptSets->size(); i++) {
   361             BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
   362             if (bSet->rindex < rindex) {
   363                 // We have already copied this script set to the serialized data.
   364                 continue;
   365             }
   366             U_ASSERT(rindex == bSet->rindex);
   367             rtScriptSets[rindex] = *bSet->sset;   // Assignment of a ScriptSet just copies the bits.
   368             rindex++;
   369         }
   370     }
   372     // Open new utrie2s from the serialized data.  We don't want to keep the ones
   373     //   we just built because we would then have two copies of the data, one internal to
   374     //   the utries that we have already constructed, and one in the serialized data area.
   375     //   An alternative would be to not pre-serialize the Trie data, but that makes the
   376     //   spoof detector data different, depending on how the detector was constructed.
   377     //   It's simpler to keep the data always the same.
   379     spImpl->fSpoofData->fAnyCaseTrie = utrie2_openFromSerialized(
   380             UTRIE2_16_VALUE_BITS,
   381             (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fAnyCaseTrie,
   382             spImpl->fSpoofData->fRawData->fAnyCaseTrieLength,
   383             NULL,
   384             &status);
   386     spImpl->fSpoofData->fLowerCaseTrie = utrie2_openFromSerialized(
   387             UTRIE2_16_VALUE_BITS,
   388             (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fLowerCaseTrie,
   389             spImpl->fSpoofData->fRawData->fAnyCaseTrieLength,
   390             NULL,
   391             &status);
   395 cleanup:
   396     if (U_FAILURE(status)) {
   397         pe->line = lineNum;
   398     }
   399     uregex_close(parseRegexp);
   400     uprv_free(input);
   402     int32_t i;
   403     if (scriptSets != NULL) {
   404         for (i=0; i<scriptSets->size(); i++) {
   405             BuilderScriptSet *bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
   406             delete bsset;
   407         }
   408         delete scriptSets;
   409     }
   410     utrie2_close(anyCaseTrie);
   411     utrie2_close(lowerCaseTrie);
   412     return;
   413 }
   415 U_NAMESPACE_END
   419 BuilderScriptSet::BuilderScriptSet() {
   420     codePoint = -1;
   421     trie = NULL;
   422     sset = NULL;
   423     index = 0;
   424     rindex = 0;
   425     scriptSetOwned = TRUE;
   426 }
   428 BuilderScriptSet::~BuilderScriptSet() {
   429     if (scriptSetOwned) {
   430         delete sset;
   431     }
   432 }
   434 #endif
   435 #endif //  !UCONFIG_NO_REGULAR_EXPRESSIONS 

mercurial