intl/icu/source/i18n/uspoof_wsconf.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/i18n/uspoof_wsconf.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,436 @@
     1.4 +/*
     1.5 +******************************************************************************
     1.6 +*
     1.7 +*   Copyright (C) 2008-2013, International Business Machines
     1.8 +*   Corporation and others.  All Rights Reserved.
     1.9 +*
    1.10 +******************************************************************************
    1.11 +*   file name:  uspoof_wsconf.cpp
    1.12 +*   encoding:   US-ASCII
    1.13 +*   tab size:   8 (not used)
    1.14 +*   indentation:4
    1.15 +*
    1.16 +*   created on: 2009Jan05  (refactoring earlier files)
    1.17 +*   created by: Andy Heninger
    1.18 +*
    1.19 +*   Internal functions for compiling Whole Script confusable source data
    1.20 +*   into its binary (runtime) form.  The binary data format is described
    1.21 +*   in uspoof_impl.h
    1.22 +*/
    1.23 +
    1.24 +#include "unicode/utypes.h"
    1.25 +#include "unicode/uspoof.h"
    1.26 +
    1.27 +#if !UCONFIG_NO_NORMALIZATION
    1.28 +
    1.29 +#if !UCONFIG_NO_REGULAR_EXPRESSIONS 
    1.30 +
    1.31 +#include "unicode/unorm.h"
    1.32 +#include "unicode/uregex.h"
    1.33 +#include "unicode/ustring.h"
    1.34 +#include "cmemory.h"
    1.35 +#include "scriptset.h"
    1.36 +#include "uspoof_impl.h"
    1.37 +#include "uhash.h"
    1.38 +#include "uvector.h"
    1.39 +#include "uassert.h"
    1.40 +#include "uspoof_wsconf.h"
    1.41 +
    1.42 +U_NAMESPACE_USE
    1.43 +
    1.44 +
    1.45 +// Regular expression for parsing a line from the Unicode file confusablesWholeScript.txt
    1.46 +// Example Lines:
    1.47 +//   006F          ; Latn; Deva; A #      (o)  LATIN SMALL LETTER O
    1.48 +//   0048..0049    ; Latn; Grek; A #  [2] (H..I)  LATIN CAPITAL LETTER H..LATIN CAPITAL LETTER I
    1.49 +//    |               |     |    |
    1.50 +//    |               |     |    |---- Which table, Any Case or Lower Case (A or L)
    1.51 +//    |               |     |----------Target script.   We need this.
    1.52 +//    |               |----------------Src script.  Should match the script of the source
    1.53 +//    |                                code points.  Beyond checking that, we don't keep it.
    1.54 +//    |--------------------------------Source code points or range.
    1.55 +//
    1.56 +// The expression will match _all_ lines, including erroneous lines.
    1.57 +// The result of the parse is returned via the contents of the (match) groups.
    1.58 +static const char *parseExp = 
    1.59 +        "(?m)"                                         // Multi-line mode
    1.60 +        "^([ \\t]*(?:#.*?)?)$"                         // A blank or comment line.  Matches Group 1.
    1.61 +        "|^(?:"                                        //   OR
    1.62 +        "\\s*([0-9A-F]{4,})(?:..([0-9A-F]{4,}))?\\s*;" // Code point range.  Groups 2 and 3.
    1.63 +        "\\s*([A-Za-z]+)\\s*;"                         // The source script.  Group 4.
    1.64 +        "\\s*([A-Za-z]+)\\s*;"                         // The target script.  Group 5.
    1.65 +        "\\s*(?:(A)|(L))"                              // The table A or L.   Group 6 or 7
    1.66 +        "[ \\t]*(?:#.*?)?"                             // Trailing commment
    1.67 +        ")$|"                                          //   OR
    1.68 +        "^(.*?)$";                                     // An error line.      Group 8.
    1.69 +                                                       //    Any line not matching the preceding
    1.70 +                                                       //    parts of the expression.will match
    1.71 +                                                       //    this, and thus be flagged as an error
    1.72 +
    1.73 +
    1.74 +// Extract a regular expression match group into a char * string.
    1.75 +//    The group must contain only invariant characters.
    1.76 +//    Used for script names
    1.77 +// 
    1.78 +static void extractGroup(
    1.79 +    URegularExpression *e, int32_t group, char *destBuf, int32_t destCapacity, UErrorCode &status) {
    1.80 +
    1.81 +    UChar ubuf[50];
    1.82 +    ubuf[0] = 0;
    1.83 +    destBuf[0] = 0;
    1.84 +    int32_t len = uregex_group(e, group, ubuf, 50, &status);
    1.85 +    if (U_FAILURE(status) || len == -1 || len >= destCapacity) {
    1.86 +        return;
    1.87 +    }
    1.88 +    UnicodeString s(FALSE, ubuf, len);   // Aliasing constructor
    1.89 +    s.extract(0, len, destBuf, destCapacity, US_INV);
    1.90 +}
    1.91 +
    1.92 +
    1.93 +
    1.94 +U_NAMESPACE_BEGIN
    1.95 +
    1.96 +//  Build the Whole Script Confusable data
    1.97 +//
    1.98 +//     TODO:  Reorganize.  Either get rid of the WSConfusableDataBuilder class,
    1.99 +//                         because everything is local to this one build function anyhow,
   1.100 +//                           OR
   1.101 +//                         break this function into more reasonably sized pieces, with
   1.102 +//                         state in WSConfusableDataBuilder.
   1.103 +//
   1.104 +void buildWSConfusableData(SpoofImpl *spImpl, const char * confusablesWS,
   1.105 +          int32_t confusablesWSLen, UParseError *pe, UErrorCode &status) 
   1.106 +{
   1.107 +    if (U_FAILURE(status)) {
   1.108 +        return;
   1.109 +    }
   1.110 +    URegularExpression *parseRegexp = NULL;
   1.111 +    int32_t             inputLen    = 0;
   1.112 +    UChar              *input       = NULL;
   1.113 +    int32_t             lineNum     = 0;
   1.114 +    
   1.115 +    UVector            *scriptSets        = NULL;
   1.116 +    uint32_t            rtScriptSetsCount = 2;
   1.117 +
   1.118 +    UTrie2             *anyCaseTrie   = NULL;
   1.119 +    UTrie2             *lowerCaseTrie = NULL;
   1.120 +
   1.121 +    anyCaseTrie = utrie2_open(0, 0, &status);
   1.122 +    lowerCaseTrie = utrie2_open(0, 0, &status);
   1.123 +
   1.124 +    UnicodeString pattern(parseExp, -1, US_INV);
   1.125 +
   1.126 +    // The scriptSets vector provides a mapping from TRIE values to the set of scripts.
   1.127 +    //
   1.128 +    // Reserved TRIE values:
   1.129 +    //   0:  Code point has no whole script confusables.
   1.130 +    //   1:  Code point is of script Common or Inherited.
   1.131 +    //       These code points do not participate in whole script confusable detection.
   1.132 +    //       (This is logically equivalent to saying that they contain confusables in
   1.133 +    //        all scripts)
   1.134 +    //
   1.135 +    // Because Trie values are indexes into the ScriptSets vector, pre-fill
   1.136 +    // vector positions 0 and 1 to avoid conflicts with the reserved values.
   1.137 +    
   1.138 +    scriptSets = new UVector(status);
   1.139 +    if (scriptSets == NULL) {
   1.140 +        status = U_MEMORY_ALLOCATION_ERROR;
   1.141 +        goto cleanup;
   1.142 +    }
   1.143 +    scriptSets->addElement((void *)NULL, status);
   1.144 +    scriptSets->addElement((void *)NULL, status);
   1.145 +
   1.146 +    // Convert the user input data from UTF-8 to UChar (UTF-16)
   1.147 +    u_strFromUTF8(NULL, 0, &inputLen, confusablesWS, confusablesWSLen, &status);
   1.148 +    if (status != U_BUFFER_OVERFLOW_ERROR) {
   1.149 +        goto cleanup;
   1.150 +    }
   1.151 +    status = U_ZERO_ERROR;
   1.152 +    input = static_cast<UChar *>(uprv_malloc((inputLen+1) * sizeof(UChar)));
   1.153 +    if (input == NULL) {
   1.154 +        status = U_MEMORY_ALLOCATION_ERROR;
   1.155 +        goto cleanup;
   1.156 +    }
   1.157 +    u_strFromUTF8(input, inputLen+1, NULL, confusablesWS, confusablesWSLen, &status);
   1.158 +
   1.159 +    parseRegexp = uregex_open(pattern.getBuffer(), pattern.length(), 0, NULL, &status);
   1.160 +
   1.161 +    // Zap any Byte Order Mark at the start of input.  Changing it to a space is benign
   1.162 +    //   given the syntax of the input.
   1.163 +    if (*input == 0xfeff) {
   1.164 +        *input = 0x20;
   1.165 +    }
   1.166 +
   1.167 +    // Parse the input, one line per iteration of this loop.
   1.168 +    uregex_setText(parseRegexp, input, inputLen, &status);
   1.169 +    while (uregex_findNext(parseRegexp, &status)) {
   1.170 +        lineNum++;
   1.171 +        if (uregex_start(parseRegexp, 1, &status) >= 0) {
   1.172 +            // this was a blank or comment line.
   1.173 +            continue;
   1.174 +        }
   1.175 +        if (uregex_start(parseRegexp, 8, &status) >= 0) {
   1.176 +            // input file syntax error.
   1.177 +            status = U_PARSE_ERROR;
   1.178 +            goto cleanup;
   1.179 +        }
   1.180 +        if (U_FAILURE(status)) {
   1.181 +            goto cleanup;
   1.182 +        }
   1.183 +
   1.184 +        // Pick up the start and optional range end code points from the parsed line.
   1.185 +        UChar32  startCodePoint = SpoofImpl::ScanHex(
   1.186 +            input, uregex_start(parseRegexp, 2, &status), uregex_end(parseRegexp, 2, &status), status);
   1.187 +        UChar32  endCodePoint = startCodePoint;
   1.188 +        if (uregex_start(parseRegexp, 3, &status) >=0) {
   1.189 +            endCodePoint = SpoofImpl::ScanHex(
   1.190 +                input, uregex_start(parseRegexp, 3, &status), uregex_end(parseRegexp, 3, &status), status);
   1.191 +        }
   1.192 +
   1.193 +        // Extract the two script names from the source line.  We need these in an 8 bit
   1.194 +        //   default encoding (will be EBCDIC on IBM mainframes) in order to pass them on
   1.195 +        //   to the ICU u_getPropertyValueEnum() function.  Ugh.
   1.196 +        char  srcScriptName[20];
   1.197 +        char  targScriptName[20];
   1.198 +        extractGroup(parseRegexp, 4, srcScriptName, sizeof(srcScriptName), status);
   1.199 +        extractGroup(parseRegexp, 5, targScriptName, sizeof(targScriptName), status);
   1.200 +        UScriptCode srcScript  =
   1.201 +            static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, srcScriptName));
   1.202 +        UScriptCode targScript =
   1.203 +            static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, targScriptName));
   1.204 +        if (U_FAILURE(status)) {
   1.205 +            goto cleanup;
   1.206 +        }
   1.207 +        if (srcScript == USCRIPT_INVALID_CODE || targScript == USCRIPT_INVALID_CODE) {
   1.208 +            status = U_INVALID_FORMAT_ERROR;
   1.209 +            goto cleanup;
   1.210 +        }
   1.211 +
   1.212 +        // select the table - (A) any case or (L) lower case only
   1.213 +        UTrie2 *table = anyCaseTrie;
   1.214 +        if (uregex_start(parseRegexp, 7, &status) >= 0) {
   1.215 +            table = lowerCaseTrie;
   1.216 +        }
   1.217 +
   1.218 +        // Build the set of scripts containing confusable characters for
   1.219 +        //   the code point(s) specified in this input line.
   1.220 +        // Sanity check that the script of the source code point is the same
   1.221 +        //   as the source script indicated in the input file.  Failure of this check is
   1.222 +        //   an error in the input file.
   1.223 +        // Include the source script in the set (needed for Mixed Script Confusable detection).
   1.224 +        //
   1.225 +        UChar32 cp;
   1.226 +        for (cp=startCodePoint; cp<=endCodePoint; cp++) {
   1.227 +            int32_t setIndex = utrie2_get32(table, cp);
   1.228 +            BuilderScriptSet *bsset = NULL;
   1.229 +            if (setIndex > 0) {
   1.230 +                U_ASSERT(setIndex < scriptSets->size());
   1.231 +                bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(setIndex));
   1.232 +            } else {
   1.233 +                bsset = new BuilderScriptSet();
   1.234 +                if (bsset == NULL) {
   1.235 +                    status = U_MEMORY_ALLOCATION_ERROR;
   1.236 +                    goto cleanup;
   1.237 +                }
   1.238 +                bsset->codePoint = cp;
   1.239 +                bsset->trie = table;
   1.240 +                bsset->sset = new ScriptSet();
   1.241 +                setIndex = scriptSets->size();
   1.242 +                bsset->index = setIndex;
   1.243 +                bsset->rindex = 0;
   1.244 +                if (bsset->sset == NULL) {
   1.245 +                    status = U_MEMORY_ALLOCATION_ERROR;
   1.246 +                    goto cleanup;
   1.247 +                }
   1.248 +                scriptSets->addElement(bsset, status);
   1.249 +                utrie2_set32(table, cp, setIndex, &status);
   1.250 +            }
   1.251 +            bsset->sset->set(targScript, status);
   1.252 +            bsset->sset->set(srcScript, status);
   1.253 +
   1.254 +            if (U_FAILURE(status)) {
   1.255 +                goto cleanup;
   1.256 +            }
   1.257 +            UScriptCode cpScript = uscript_getScript(cp, &status);
   1.258 +            if (cpScript != srcScript) {
   1.259 +                status = U_INVALID_FORMAT_ERROR;
   1.260 +                goto cleanup;
   1.261 +            }
   1.262 +        }
   1.263 +    }
   1.264 +
   1.265 +    // Eliminate duplicate script sets.  At this point we have a separate
   1.266 +    // script set for every code point that had data in the input file.
   1.267 +    //
   1.268 +    // We eliminate underlying ScriptSet objects, not the BuildScriptSets that wrap them
   1.269 +    //
   1.270 +    // printf("Number of scriptSets: %d\n", scriptSets->size());
   1.271 +    {
   1.272 +        int32_t duplicateCount = 0;
   1.273 +        rtScriptSetsCount = 2;
   1.274 +        for (int32_t outeri=2; outeri<scriptSets->size(); outeri++) {
   1.275 +            BuilderScriptSet *outerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(outeri));
   1.276 +            if (outerSet->index != static_cast<uint32_t>(outeri)) {
   1.277 +                // This set was already identified as a duplicate.
   1.278 +                //   It will not be allocated a position in the runtime array of ScriptSets.
   1.279 +                continue;
   1.280 +            }
   1.281 +            outerSet->rindex = rtScriptSetsCount++;
   1.282 +            for (int32_t inneri=outeri+1; inneri<scriptSets->size(); inneri++) {
   1.283 +                BuilderScriptSet *innerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(inneri));
   1.284 +                if (*(outerSet->sset) == *(innerSet->sset) && outerSet->sset != innerSet->sset) {
   1.285 +                    delete innerSet->sset;
   1.286 +                    innerSet->scriptSetOwned = FALSE;
   1.287 +                    innerSet->sset = outerSet->sset;
   1.288 +                    innerSet->index = outeri;
   1.289 +                    innerSet->rindex = outerSet->rindex;
   1.290 +                    duplicateCount++;
   1.291 +                }
   1.292 +                // But this doesn't get all.  We need to fix the TRIE.
   1.293 +            }
   1.294 +        }
   1.295 +        // printf("Number of distinct script sets: %d\n", rtScriptSetsCount);
   1.296 +    }
   1.297 +
   1.298 +    
   1.299 +
   1.300 +    // Update the Trie values to be reflect the run time script indexes (after duplicate merging).
   1.301 +    //    (Trie Values 0 and 1 are reserved, and the corresponding slots in scriptSets
   1.302 +    //     are unused, which is why the loop index starts at 2.)
   1.303 +    {
   1.304 +        for (int32_t i=2; i<scriptSets->size(); i++) {
   1.305 +            BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
   1.306 +            if (bSet->rindex != (uint32_t)i) {
   1.307 +                utrie2_set32(bSet->trie, bSet->codePoint, bSet->rindex, &status);
   1.308 +            }
   1.309 +        }
   1.310 +    }
   1.311 +
   1.312 +    // For code points with script==Common or script==Inherited,
   1.313 +    //   Set the reserved value of 1 into both Tries.  These characters do not participate
   1.314 +    //   in Whole Script Confusable detection; this reserved value is the means
   1.315 +    //   by which they are detected.
   1.316 +    {
   1.317 +        UnicodeSet ignoreSet;
   1.318 +        ignoreSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status);
   1.319 +        UnicodeSet inheritedSet;
   1.320 +        inheritedSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status);
   1.321 +        ignoreSet.addAll(inheritedSet);
   1.322 +        for (int32_t rn=0; rn<ignoreSet.getRangeCount(); rn++) {
   1.323 +            UChar32 rangeStart = ignoreSet.getRangeStart(rn);
   1.324 +            UChar32 rangeEnd   = ignoreSet.getRangeEnd(rn);
   1.325 +            utrie2_setRange32(anyCaseTrie,   rangeStart, rangeEnd, 1, TRUE, &status);
   1.326 +            utrie2_setRange32(lowerCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status);
   1.327 +        }
   1.328 +    }
   1.329 +
   1.330 +    // Serialize the data to the Spoof Detector
   1.331 +    {
   1.332 +        utrie2_freeze(anyCaseTrie,   UTRIE2_16_VALUE_BITS, &status);
   1.333 +        int32_t size = utrie2_serialize(anyCaseTrie, NULL, 0, &status);
   1.334 +        // printf("Any case Trie size: %d\n", size);
   1.335 +        if (status != U_BUFFER_OVERFLOW_ERROR) {
   1.336 +            goto cleanup;
   1.337 +        }
   1.338 +        status = U_ZERO_ERROR;
   1.339 +        spImpl->fSpoofData->fRawData->fAnyCaseTrie = spImpl->fSpoofData->fMemLimit;
   1.340 +        spImpl->fSpoofData->fRawData->fAnyCaseTrieLength = size;
   1.341 +        spImpl->fSpoofData->fAnyCaseTrie = anyCaseTrie;
   1.342 +        void *where = spImpl->fSpoofData->reserveSpace(size, status);
   1.343 +        utrie2_serialize(anyCaseTrie, where, size, &status);
   1.344 +        
   1.345 +        utrie2_freeze(lowerCaseTrie, UTRIE2_16_VALUE_BITS, &status);
   1.346 +        size = utrie2_serialize(lowerCaseTrie, NULL, 0, &status);
   1.347 +        // printf("Lower case Trie size: %d\n", size);
   1.348 +        if (status != U_BUFFER_OVERFLOW_ERROR) {
   1.349 +            goto cleanup;
   1.350 +        }
   1.351 +        status = U_ZERO_ERROR;
   1.352 +        spImpl->fSpoofData->fRawData->fLowerCaseTrie = spImpl->fSpoofData->fMemLimit;
   1.353 +        spImpl->fSpoofData->fRawData->fLowerCaseTrieLength = size;
   1.354 +        spImpl->fSpoofData->fLowerCaseTrie = lowerCaseTrie;
   1.355 +        where = spImpl->fSpoofData->reserveSpace(size, status);
   1.356 +        utrie2_serialize(lowerCaseTrie, where, size, &status);
   1.357 +
   1.358 +        spImpl->fSpoofData->fRawData->fScriptSets = spImpl->fSpoofData->fMemLimit;
   1.359 +        spImpl->fSpoofData->fRawData->fScriptSetsLength = rtScriptSetsCount;
   1.360 +        ScriptSet *rtScriptSets =  static_cast<ScriptSet *>
   1.361 +            (spImpl->fSpoofData->reserveSpace(rtScriptSetsCount * sizeof(ScriptSet), status));
   1.362 +        uint32_t rindex = 2;
   1.363 +        for (int32_t i=2; i<scriptSets->size(); i++) {
   1.364 +            BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
   1.365 +            if (bSet->rindex < rindex) {
   1.366 +                // We have already copied this script set to the serialized data.
   1.367 +                continue;
   1.368 +            }
   1.369 +            U_ASSERT(rindex == bSet->rindex);
   1.370 +            rtScriptSets[rindex] = *bSet->sset;   // Assignment of a ScriptSet just copies the bits.
   1.371 +            rindex++;
   1.372 +        }
   1.373 +    }
   1.374 +
   1.375 +    // Open new utrie2s from the serialized data.  We don't want to keep the ones
   1.376 +    //   we just built because we would then have two copies of the data, one internal to
   1.377 +    //   the utries that we have already constructed, and one in the serialized data area.
   1.378 +    //   An alternative would be to not pre-serialize the Trie data, but that makes the
   1.379 +    //   spoof detector data different, depending on how the detector was constructed.
   1.380 +    //   It's simpler to keep the data always the same.
   1.381 +    
   1.382 +    spImpl->fSpoofData->fAnyCaseTrie = utrie2_openFromSerialized(
   1.383 +            UTRIE2_16_VALUE_BITS,
   1.384 +            (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fAnyCaseTrie,
   1.385 +            spImpl->fSpoofData->fRawData->fAnyCaseTrieLength,
   1.386 +            NULL,
   1.387 +            &status);
   1.388 +
   1.389 +    spImpl->fSpoofData->fLowerCaseTrie = utrie2_openFromSerialized(
   1.390 +            UTRIE2_16_VALUE_BITS,
   1.391 +            (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fLowerCaseTrie,
   1.392 +            spImpl->fSpoofData->fRawData->fAnyCaseTrieLength,
   1.393 +            NULL,
   1.394 +            &status);
   1.395 +
   1.396 +    
   1.397 +
   1.398 +cleanup:
   1.399 +    if (U_FAILURE(status)) {
   1.400 +        pe->line = lineNum;
   1.401 +    }
   1.402 +    uregex_close(parseRegexp);
   1.403 +    uprv_free(input);
   1.404 +
   1.405 +    int32_t i;
   1.406 +    if (scriptSets != NULL) {
   1.407 +        for (i=0; i<scriptSets->size(); i++) {
   1.408 +            BuilderScriptSet *bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
   1.409 +            delete bsset;
   1.410 +        }
   1.411 +        delete scriptSets;
   1.412 +    }
   1.413 +    utrie2_close(anyCaseTrie);
   1.414 +    utrie2_close(lowerCaseTrie);
   1.415 +    return;
   1.416 +}
   1.417 +
   1.418 +U_NAMESPACE_END
   1.419 +
   1.420 +
   1.421 +
   1.422 +BuilderScriptSet::BuilderScriptSet() {
   1.423 +    codePoint = -1;
   1.424 +    trie = NULL;
   1.425 +    sset = NULL;
   1.426 +    index = 0;
   1.427 +    rindex = 0;
   1.428 +    scriptSetOwned = TRUE;
   1.429 +}
   1.430 +
   1.431 +BuilderScriptSet::~BuilderScriptSet() {
   1.432 +    if (scriptSetOwned) {
   1.433 +        delete sset;
   1.434 +    }
   1.435 +}
   1.436 +
   1.437 +#endif
   1.438 +#endif //  !UCONFIG_NO_REGULAR_EXPRESSIONS 
   1.439 +

mercurial