intl/icu/source/i18n/uspoof_conf.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/i18n/uspoof_conf.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,605 @@
     1.4 +/*
     1.5 +******************************************************************************
     1.6 +*
     1.7 +*   Copyright (C) 2008-2013, International Business Machines
     1.8 +*   Corporation and others.  All Rights Reserved.
     1.9 +*
    1.10 +******************************************************************************
    1.11 +*   file name:  uspoof_conf.cpp
    1.12 +*   encoding:   US-ASCII
    1.13 +*   tab size:   8 (not used)
    1.14 +*   indentation:4
    1.15 +*
    1.16 +*   created on: 2009Jan05  (refactoring earlier files)
    1.17 +*   created by: Andy Heninger
    1.18 +*
    1.19 +*   Internal classes for compiling confusable data into its binary (runtime) form.
    1.20 +*/
    1.21 +
    1.22 +#include "unicode/utypes.h"
    1.23 +#include "unicode/uspoof.h"
    1.24 +#if !UCONFIG_NO_REGULAR_EXPRESSIONS
    1.25 +#if !UCONFIG_NO_NORMALIZATION
    1.26 +
    1.27 +#include "unicode/unorm.h"
    1.28 +#include "unicode/uregex.h"
    1.29 +#include "unicode/ustring.h"
    1.30 +#include "cmemory.h"
    1.31 +#include "uspoof_impl.h"
    1.32 +#include "uhash.h"
    1.33 +#include "uvector.h"
    1.34 +#include "uassert.h"
    1.35 +#include "uarrsort.h"
    1.36 +#include "uspoof_conf.h"
    1.37 +
    1.38 +U_NAMESPACE_USE
    1.39 +
    1.40 +
    1.41 +//---------------------------------------------------------------------
    1.42 +//
    1.43 +//  buildConfusableData   Compile the source confusable data, as defined by
    1.44 +//                        the Unicode data file confusables.txt, into the binary
    1.45 +//                        structures used by the confusable detector.
    1.46 +//
    1.47 +//                        The binary structures are described in uspoof_impl.h
    1.48 +//
    1.49 +//     1.  parse the data, building 4 hash tables, one each for the SL, SA, ML and MA
    1.50 +//         tables.  Each maps from a UChar32 to a String.
    1.51 +//
    1.52 +//     2.  Sort all of the strings encountered by length, since they will need to
    1.53 +//         be stored in that order in the final string table.
    1.54 +//
    1.55 +//     3.  Build a list of keys (UChar32s) from the four mapping tables.  Sort the
    1.56 +//         list because that will be the ordering of our runtime table.
    1.57 +//
    1.58 +//     4.  Generate the run time string table.  This is generated before the key & value
    1.59 +//         tables because we need the string indexes when building those tables.
    1.60 +//
    1.61 +//     5.  Build the run-time key and value tables.  These are parallel tables, and are built
    1.62 +//         at the same time
    1.63 +//
    1.64 +
    1.65 +SPUString::SPUString(UnicodeString *s) {
    1.66 +    fStr = s;
    1.67 +    fStrTableIndex = 0;
    1.68 +}
    1.69 +
    1.70 +
    1.71 +SPUString::~SPUString() {
    1.72 +    delete fStr;
    1.73 +}
    1.74 +
    1.75 +
    1.76 +SPUStringPool::SPUStringPool(UErrorCode &status) : fVec(NULL), fHash(NULL) {
    1.77 +    fVec = new UVector(status);
    1.78 +    fHash = uhash_open(uhash_hashUnicodeString,           // key hash function
    1.79 +                       uhash_compareUnicodeString,        // Key Comparator
    1.80 +                       NULL,                              // Value Comparator
    1.81 +                       &status);
    1.82 +}
    1.83 +
    1.84 +
    1.85 +SPUStringPool::~SPUStringPool() {
    1.86 +    int i;
    1.87 +    for (i=fVec->size()-1; i>=0; i--) {
    1.88 +        SPUString *s = static_cast<SPUString *>(fVec->elementAt(i));
    1.89 +        delete s;
    1.90 +    }
    1.91 +    delete fVec;
    1.92 +    uhash_close(fHash);
    1.93 +}
    1.94 +
    1.95 +
    1.96 +int32_t SPUStringPool::size() {
    1.97 +    return fVec->size();
    1.98 +}
    1.99 +
   1.100 +SPUString *SPUStringPool::getByIndex(int32_t index) {
   1.101 +    SPUString *retString = (SPUString *)fVec->elementAt(index);
   1.102 +    return retString;
   1.103 +}
   1.104 +
   1.105 +
   1.106 +// Comparison function for ordering strings in the string pool.
   1.107 +// Compare by length first, then, within a group of the same length,
   1.108 +// by code point order.
   1.109 +// Conforms to the type signature for a USortComparator in uvector.h
   1.110 +
   1.111 +static int8_t U_CALLCONV SPUStringCompare(UHashTok left, UHashTok right) {
   1.112 +	const SPUString *sL = const_cast<const SPUString *>(
   1.113 +        static_cast<SPUString *>(left.pointer));
   1.114 + 	const SPUString *sR = const_cast<const SPUString *>(
   1.115 + 	    static_cast<SPUString *>(right.pointer));
   1.116 +    int32_t lenL = sL->fStr->length();
   1.117 +    int32_t lenR = sR->fStr->length();
   1.118 +    if (lenL < lenR) {
   1.119 +        return -1;
   1.120 +    } else if (lenL > lenR) {
   1.121 +        return 1;
   1.122 +    } else {
   1.123 +        return sL->fStr->compare(*(sR->fStr));
   1.124 +    }
   1.125 +}
   1.126 +
   1.127 +void SPUStringPool::sort(UErrorCode &status) {
   1.128 +    fVec->sort(SPUStringCompare, status);
   1.129 +}
   1.130 +
   1.131 +
   1.132 +SPUString *SPUStringPool::addString(UnicodeString *src, UErrorCode &status) {
   1.133 +    SPUString *hashedString = static_cast<SPUString *>(uhash_get(fHash, src));
   1.134 +    if (hashedString != NULL) {
   1.135 +        delete src;
   1.136 +    } else {
   1.137 +        hashedString = new SPUString(src);
   1.138 +        uhash_put(fHash, src, hashedString, &status);
   1.139 +        fVec->addElement(hashedString, status);
   1.140 +    }
   1.141 +    return hashedString;
   1.142 +}
   1.143 +
   1.144 +
   1.145 +
   1.146 +ConfusabledataBuilder::ConfusabledataBuilder(SpoofImpl *spImpl, UErrorCode &status) :
   1.147 +    fSpoofImpl(spImpl),
   1.148 +    fInput(NULL),
   1.149 +    fSLTable(NULL),
   1.150 +    fSATable(NULL),
   1.151 +    fMLTable(NULL),
   1.152 +    fMATable(NULL),
   1.153 +    fKeySet(NULL),
   1.154 +    fKeyVec(NULL),
   1.155 +    fValueVec(NULL),
   1.156 +    fStringTable(NULL),
   1.157 +    fStringLengthsTable(NULL),
   1.158 +    stringPool(NULL),
   1.159 +    fParseLine(NULL),
   1.160 +    fParseHexNum(NULL),
   1.161 +    fLineNum(0)
   1.162 +{
   1.163 +    if (U_FAILURE(status)) {
   1.164 +        return;
   1.165 +    }
   1.166 +    fSLTable    = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &status);
   1.167 +    fSATable    = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &status);
   1.168 +    fMLTable    = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &status);
   1.169 +    fMATable    = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &status);
   1.170 +    fKeySet     = new UnicodeSet();
   1.171 +    fKeyVec     = new UVector(status);
   1.172 +    fValueVec   = new UVector(status);
   1.173 +    stringPool = new SPUStringPool(status);
   1.174 +}
   1.175 +
   1.176 +
   1.177 +ConfusabledataBuilder::~ConfusabledataBuilder() {
   1.178 +    uprv_free(fInput);
   1.179 +    uregex_close(fParseLine);
   1.180 +    uregex_close(fParseHexNum);
   1.181 +    uhash_close(fSLTable);
   1.182 +    uhash_close(fSATable);
   1.183 +    uhash_close(fMLTable);
   1.184 +    uhash_close(fMATable);
   1.185 +    delete fKeySet;
   1.186 +    delete fKeyVec;
   1.187 +    delete fStringTable;
   1.188 +    delete fStringLengthsTable;
   1.189 +    delete fValueVec;
   1.190 +    delete stringPool;
   1.191 +}
   1.192 +
   1.193 +
   1.194 +void ConfusabledataBuilder::buildConfusableData(SpoofImpl * spImpl, const char * confusables,
   1.195 +    int32_t confusablesLen, int32_t *errorType, UParseError *pe, UErrorCode &status) {
   1.196 +
   1.197 +    if (U_FAILURE(status)) {
   1.198 +        return;
   1.199 +    }
   1.200 +    ConfusabledataBuilder builder(spImpl, status);
   1.201 +    builder.build(confusables, confusablesLen, status);
   1.202 +    if (U_FAILURE(status) && errorType != NULL) {
   1.203 +        *errorType = USPOOF_SINGLE_SCRIPT_CONFUSABLE;
   1.204 +        pe->line = builder.fLineNum;
   1.205 +    }
   1.206 +}
   1.207 +
   1.208 +
   1.209 +void ConfusabledataBuilder::build(const char * confusables, int32_t confusablesLen,
   1.210 +               UErrorCode &status) {
   1.211 +
   1.212 +    // Convert the user input data from UTF-8 to UChar (UTF-16)
   1.213 +    int32_t inputLen = 0;
   1.214 +    if (U_FAILURE(status)) {
   1.215 +        return;
   1.216 +    }
   1.217 +    u_strFromUTF8(NULL, 0, &inputLen, confusables, confusablesLen, &status);
   1.218 +    if (status != U_BUFFER_OVERFLOW_ERROR) {
   1.219 +        return;
   1.220 +    }
   1.221 +    status = U_ZERO_ERROR;
   1.222 +    fInput = static_cast<UChar *>(uprv_malloc((inputLen+1) * sizeof(UChar)));
   1.223 +    if (fInput == NULL) {
   1.224 +        status = U_MEMORY_ALLOCATION_ERROR;
   1.225 +        return;
   1.226 +    }
   1.227 +    u_strFromUTF8(fInput, inputLen+1, NULL, confusables, confusablesLen, &status);
   1.228 +
   1.229 +
   1.230 +    // Regular Expression to parse a line from Confusables.txt.  The expression will match
   1.231 +    // any line.  What was matched is determined by examining which capture groups have a match.
   1.232 +    //   Capture Group 1:  the source char
   1.233 +    //   Capture Group 2:  the replacement chars
   1.234 +    //   Capture Group 3-6  the table type, SL, SA, ML, or MA
   1.235 +    //   Capture Group 7:  A blank or comment only line.
   1.236 +    //   Capture Group 8:  A syntactically invalid line.  Anything that didn't match before.
   1.237 +    // Example Line from the confusables.txt source file:
   1.238 +    //   "1D702 ;	006E 0329 ;	SL	# MATHEMATICAL ITALIC SMALL ETA ... "
   1.239 +    UnicodeString pattern(
   1.240 +        "(?m)^[ \\t]*([0-9A-Fa-f]+)[ \\t]+;"      // Match the source char
   1.241 +        "[ \\t]*([0-9A-Fa-f]+"                    // Match the replacement char(s)
   1.242 +           "(?:[ \\t]+[0-9A-Fa-f]+)*)[ \\t]*;"    //     (continued)
   1.243 +        "\\s*(?:(SL)|(SA)|(ML)|(MA))"             // Match the table type
   1.244 +        "[ \\t]*(?:#.*?)?$"                       // Match any trailing #comment
   1.245 +        "|^([ \\t]*(?:#.*?)?)$"       // OR match empty lines or lines with only a #comment
   1.246 +        "|^(.*?)$", -1, US_INV);      // OR match any line, which catches illegal lines.
   1.247 +    // TODO: Why are we using the regex C API here? C++ would just take UnicodeString...
   1.248 +    fParseLine = uregex_open(pattern.getBuffer(), pattern.length(), 0, NULL, &status);
   1.249 +
   1.250 +    // Regular expression for parsing a hex number out of a space-separated list of them.
   1.251 +    //   Capture group 1 gets the number, with spaces removed.
   1.252 +    pattern = UNICODE_STRING_SIMPLE("\\s*([0-9A-F]+)");
   1.253 +    fParseHexNum = uregex_open(pattern.getBuffer(), pattern.length(), 0, NULL, &status);
   1.254 +
   1.255 +    // Zap any Byte Order Mark at the start of input.  Changing it to a space is benign
   1.256 +    //   given the syntax of the input.
   1.257 +    if (*fInput == 0xfeff) {
   1.258 +        *fInput = 0x20;
   1.259 +    }
   1.260 +
   1.261 +    // Parse the input, one line per iteration of this loop.
   1.262 +    uregex_setText(fParseLine, fInput, inputLen, &status);
   1.263 +    while (uregex_findNext(fParseLine, &status)) {
   1.264 +        fLineNum++;
   1.265 +        if (uregex_start(fParseLine, 7, &status) >= 0) {
   1.266 +            // this was a blank or comment line.
   1.267 +            continue;
   1.268 +        }
   1.269 +        if (uregex_start(fParseLine, 8, &status) >= 0) {
   1.270 +            // input file syntax error.
   1.271 +            status = U_PARSE_ERROR;
   1.272 +            return;
   1.273 +        }
   1.274 +
   1.275 +        // We have a good input line.  Extract the key character and mapping string, and
   1.276 +        //    put them into the appropriate mapping table.
   1.277 +        UChar32 keyChar = SpoofImpl::ScanHex(fInput, uregex_start(fParseLine, 1, &status),
   1.278 +                          uregex_end(fParseLine, 1, &status), status);
   1.279 +
   1.280 +        int32_t mapStringStart = uregex_start(fParseLine, 2, &status);
   1.281 +        int32_t mapStringLength = uregex_end(fParseLine, 2, &status) - mapStringStart;
   1.282 +        uregex_setText(fParseHexNum, &fInput[mapStringStart], mapStringLength, &status);
   1.283 +
   1.284 +        UnicodeString  *mapString = new UnicodeString();
   1.285 +        if (mapString == NULL) {
   1.286 +            status = U_MEMORY_ALLOCATION_ERROR;
   1.287 +            return;
   1.288 +        }
   1.289 +        while (uregex_findNext(fParseHexNum, &status)) {
   1.290 +            UChar32 c = SpoofImpl::ScanHex(&fInput[mapStringStart], uregex_start(fParseHexNum, 1, &status),
   1.291 +                                 uregex_end(fParseHexNum, 1, &status), status);
   1.292 +            mapString->append(c);
   1.293 +        }
   1.294 +        U_ASSERT(mapString->length() >= 1);
   1.295 +
   1.296 +        // Put the map (value) string into the string pool
   1.297 +        // This a little like a Java intern() - any duplicates will be eliminated.
   1.298 +        SPUString *smapString = stringPool->addString(mapString, status);
   1.299 +
   1.300 +        // Add the UChar32 -> string mapping to the appropriate table.
   1.301 +        UHashtable *table = uregex_start(fParseLine, 3, &status) >= 0 ? fSLTable :
   1.302 +                            uregex_start(fParseLine, 4, &status) >= 0 ? fSATable :
   1.303 +                            uregex_start(fParseLine, 5, &status) >= 0 ? fMLTable :
   1.304 +                            uregex_start(fParseLine, 6, &status) >= 0 ? fMATable :
   1.305 +                            NULL;
   1.306 +        U_ASSERT(table != NULL);
   1.307 +        uhash_iput(table, keyChar, smapString, &status);
   1.308 +        fKeySet->add(keyChar);
   1.309 +        if (U_FAILURE(status)) {
   1.310 +            return;
   1.311 +        }
   1.312 +    }
   1.313 +
   1.314 +    // Input data is now all parsed and collected.
   1.315 +    // Now create the run-time binary form of the data.
   1.316 +    //
   1.317 +    // This is done in two steps.  First the data is assembled into vectors and strings,
   1.318 +    //   for ease of construction, then the contents of these collections are dumped
   1.319 +    //   into the actual raw-bytes data storage.
   1.320 +
   1.321 +    // Build up the string array, and record the index of each string therein
   1.322 +    //  in the (build time only) string pool.
   1.323 +    // Strings of length one are not entered into the strings array.
   1.324 +    // At the same time, build up the string lengths table, which records the
   1.325 +    // position in the string table of the first string of each length >= 4.
   1.326 +    // (Strings in the table are sorted by length)
   1.327 +    stringPool->sort(status);
   1.328 +    fStringTable = new UnicodeString();
   1.329 +    fStringLengthsTable = new UVector(status);
   1.330 +    int32_t previousStringLength = 0;
   1.331 +    int32_t previousStringIndex  = 0;
   1.332 +    int32_t poolSize = stringPool->size();
   1.333 +    int32_t i;
   1.334 +    for (i=0; i<poolSize; i++) {
   1.335 +        SPUString *s = stringPool->getByIndex(i);
   1.336 +        int32_t strLen = s->fStr->length();
   1.337 +        int32_t strIndex = fStringTable->length();
   1.338 +        U_ASSERT(strLen >= previousStringLength);
   1.339 +        if (strLen == 1) {
   1.340 +            // strings of length one do not get an entry in the string table.
   1.341 +            // Keep the single string character itself here, which is the same
   1.342 +            //  convention that is used in the final run-time string table index.
   1.343 +            s->fStrTableIndex = s->fStr->charAt(0);
   1.344 +        } else {
   1.345 +            if ((strLen > previousStringLength) && (previousStringLength >= 4)) {
   1.346 +                fStringLengthsTable->addElement(previousStringIndex, status);
   1.347 +                fStringLengthsTable->addElement(previousStringLength, status);
   1.348 +            }
   1.349 +            s->fStrTableIndex = strIndex;
   1.350 +            fStringTable->append(*(s->fStr));
   1.351 +        }
   1.352 +        previousStringLength = strLen;
   1.353 +        previousStringIndex  = strIndex;
   1.354 +    }
   1.355 +    // Make the final entry to the string lengths table.
   1.356 +    //   (it holds an entry for the _last_ string of each length, so adding the
   1.357 +    //    final one doesn't happen in the main loop because no longer string was encountered.)
   1.358 +    if (previousStringLength >= 4) {
   1.359 +        fStringLengthsTable->addElement(previousStringIndex, status);
   1.360 +        fStringLengthsTable->addElement(previousStringLength, status);
   1.361 +    }
   1.362 +
   1.363 +    // Construct the compile-time Key and Value tables
   1.364 +    //
   1.365 +    // For each key code point, check which mapping tables it applies to,
   1.366 +    //   and create the final data for the key & value structures.
   1.367 +    //
   1.368 +    //   The four logical mapping tables are conflated into one combined table.
   1.369 +    //   If multiple logical tables have the same mapping for some key, they
   1.370 +    //     share a single entry in the combined table.
   1.371 +    //   If more than one mapping exists for the same key code point, multiple
   1.372 +    //     entries will be created in the table
   1.373 +
   1.374 +    for (int32_t range=0; range<fKeySet->getRangeCount(); range++) {
   1.375 +        // It is an oddity of the UnicodeSet API that simply enumerating the contained
   1.376 +        //   code points requires a nested loop.
   1.377 +        for (UChar32 keyChar=fKeySet->getRangeStart(range);
   1.378 +                keyChar <= fKeySet->getRangeEnd(range); keyChar++) {
   1.379 +            addKeyEntry(keyChar, fSLTable, USPOOF_SL_TABLE_FLAG, status);
   1.380 +            addKeyEntry(keyChar, fSATable, USPOOF_SA_TABLE_FLAG, status);
   1.381 +            addKeyEntry(keyChar, fMLTable, USPOOF_ML_TABLE_FLAG, status);
   1.382 +            addKeyEntry(keyChar, fMATable, USPOOF_MA_TABLE_FLAG, status);
   1.383 +        }
   1.384 +    }
   1.385 +
   1.386 +    // Put the assembled data into the flat runtime array
   1.387 +    outputData(status);
   1.388 +
   1.389 +    // All of the intermediate allocated data belongs to the ConfusabledataBuilder
   1.390 +    //  object  (this), and is deleted in the destructor.
   1.391 +    return;
   1.392 +}
   1.393 +
//
// outputData     The confusable data has been compiled and stored in intermediate
//                collections and strings.  Copy it from there to the final flat
//                binary array.
//
//                Four sections are appended, in this order: the key table, the
//                value table, the string table and the string lengths table.
//                For each section, the offset from the start of the raw data and
//                the element count are recorded in the raw data header, and the
//                matching pointer in fSpoofData is set.
//
//                Note that as each section is added to the output data, the
//                expand (reserveSpace()) function will likely relocate it in memory.
//                Be careful with pointers.  This is why rawData is fetched again
//                after every reserveSpace() call.
//
//                status: ICU error code.  The function returns early, leaving the
//                remaining sections unwritten, if any reserveSpace() call fails.
//
void ConfusabledataBuilder::outputData(UErrorCode &status) {

    U_ASSERT(fSpoofImpl->fSpoofData->fDataOwned == TRUE);

    //  The Key Table
    //     While copying the keys to the runtime array,
    //       also sanity check that they are sorted.

    int32_t numKeys = fKeyVec->size();
    int32_t *keys =
        static_cast<int32_t *>(fSpoofImpl->fSpoofData->reserveSpace(numKeys*sizeof(int32_t), status));
    if (U_FAILURE(status)) {
        return;
    }
    int i;
    int32_t previousKey = 0;
    for (i=0; i<numKeys; i++) {
        int32_t key =  fKeyVec->elementAti(i);
        (void)previousKey;         // Suppress unused variable warning on gcc.
        // The low 24 bits hold the key code point; they must not decrease.
        U_ASSERT((key & 0x00ffffff) >= (previousKey & 0x00ffffff));
        // The high 8 bits hold the flags and length field; they are never all zero.
        U_ASSERT((key & 0xff000000) != 0);
        keys[i] = key;
        previousKey = key;
    }
    SpoofDataHeader *rawData = fSpoofImpl->fSpoofData->fRawData;
    rawData->fCFUKeys = (int32_t)((char *)keys - (char *)rawData);
    rawData->fCFUKeysSize = numKeys;
    fSpoofImpl->fSpoofData->fCFUKeys = keys;


    // The Value Table, parallels the key table
    int32_t numValues = fValueVec->size();
    U_ASSERT(numKeys == numValues);
    // Note: the space is sized from numKeys, which is asserted above to equal numValues.
    uint16_t *values =
        static_cast<uint16_t *>(fSpoofImpl->fSpoofData->reserveSpace(numKeys*sizeof(uint16_t), status));
    if (U_FAILURE(status)) {
        return;
    }
    for (i=0; i<numValues; i++) {
        uint32_t value = static_cast<uint32_t>(fValueVec->elementAti(i));
        // Values are stored as 16 bit quantities.
        U_ASSERT(value < 0xffff);
        values[i] = static_cast<uint16_t>(value);
    }
    rawData = fSpoofImpl->fSpoofData->fRawData;
    rawData->fCFUStringIndex = (int32_t)((char *)values - (char *)rawData);
    rawData->fCFUStringIndexSize = numValues;
    fSpoofImpl->fSpoofData->fCFUValues = values;

    // The Strings Table.

    uint32_t stringsLength = fStringTable->length();
    // Reserve an extra space so the string will be nul-terminated.  This is
    // only a convenience, for when debugging; it is not needed otherwise.
    UChar *strings =
        static_cast<UChar *>(fSpoofImpl->fSpoofData->reserveSpace(stringsLength*sizeof(UChar)+2, status));
    if (U_FAILURE(status)) {
        return;
    }
    fStringTable->extract(strings, stringsLength+1, status);
    rawData = fSpoofImpl->fSpoofData->fRawData;
    U_ASSERT(rawData->fCFUStringTable == 0);
    rawData->fCFUStringTable = (int32_t)((char *)strings - (char *)rawData);
    rawData->fCFUStringTableLen = stringsLength;
    fSpoofImpl->fSpoofData->fCFUStrings = strings;

    // The String Lengths Table
    //    While copying into the runtime array do some sanity checks on the values
    //    Each complete entry contains two fields, an offset into the string table
    //    followed by a string length.
    //    Lengths should increase with each entry.
    //    Offsets should be less than the size of the string table.
    int32_t lengthTableLength = fStringLengthsTable->size();
    uint16_t *stringLengths =
        static_cast<uint16_t *>(fSpoofImpl->fSpoofData->reserveSpace(lengthTableLength*sizeof(uint16_t), status));
    if (U_FAILURE(status)) {
        return;
    }
    int32_t destIndex = 0;
    uint32_t previousLength = 0;
    for (i=0; i<lengthTableLength; i+=2) {
        uint32_t offset = static_cast<uint32_t>(fStringLengthsTable->elementAti(i));
        uint32_t length = static_cast<uint32_t>(fStringLengthsTable->elementAti(i+1));
        U_ASSERT(offset < stringsLength);
        U_ASSERT(length < 40);
        (void)previousLength;  // Suppress unused variable warning on gcc.
        U_ASSERT(length > previousLength);
        stringLengths[destIndex++] = static_cast<uint16_t>(offset);
        stringLengths[destIndex++] = static_cast<uint16_t>(length);
        previousLength = length;
    }
    rawData = fSpoofImpl->fSpoofData->fRawData;
    rawData->fCFUStringLengths = (int32_t)((char *)stringLengths - (char *)rawData);
    // Note: StringLengthsSize in the raw data is the number of complete entries,
    //       each consisting of a pair of 16 bit values, hence the divide by 2.
    rawData->fCFUStringLengthsSize = lengthTableLength / 2;
    fSpoofImpl->fSpoofData->fCFUStringLengths =
        reinterpret_cast<SpoofStringLengthsElement *>(stringLengths);
}
   1.500 +
   1.501 +
   1.502 +
   1.503 +//  addKeyEntry   Construction of the confusable Key and Mapping Values tables.
   1.504 +//                This is an intermediate point in the building process.
   1.505 +//                We already have the mappings in the hash tables fSLTable, etc.
   1.506 +//                This function builds corresponding run-time style table entries into
   1.507 +//                  fKeyVec and fValueVec
   1.508 +
   1.509 +void ConfusabledataBuilder::addKeyEntry(
   1.510 +    UChar32     keyChar,     // The key character
   1.511 +    UHashtable *table,       // The table, one of SATable, MATable, etc.
   1.512 +    int32_t     tableFlag,   // One of USPOOF_SA_TABLE_FLAG, etc.
   1.513 +    UErrorCode &status) {
   1.514 +
   1.515 +    SPUString *targetMapping = static_cast<SPUString *>(uhash_iget(table, keyChar));
   1.516 +    if (targetMapping == NULL) {
   1.517 +        // No mapping for this key character.
   1.518 +        //   (This function is called for all four tables for each key char that
   1.519 +        //    is seen anywhere, so this no entry cases are very much expected.)
   1.520 +        return;
   1.521 +    }
   1.522 +
   1.523 +    // Check whether there is already an entry with the correct mapping.
   1.524 +    // If so, simply set the flag in the keyTable saying that the existing entry
   1.525 +    // applies to the table that we're doing now.
   1.526 +
   1.527 +    UBool keyHasMultipleValues = FALSE;
   1.528 +    int32_t i;
   1.529 +    for (i=fKeyVec->size()-1; i>=0 ; i--) {
   1.530 +        int32_t key = fKeyVec->elementAti(i);
   1.531 +        if ((key & 0x0ffffff) != keyChar) {
   1.532 +            // We have now checked all existing key entries for this key char (if any)
   1.533 +            //  without finding one with the same mapping.
   1.534 +            break;
   1.535 +        }
   1.536 +        UnicodeString mapping = getMapping(i);
   1.537 +        if (mapping == *(targetMapping->fStr)) {
   1.538 +            // The run time entry we are currently testing has the correct mapping.
   1.539 +            // Set the flag in it indicating that it applies to the new table also.
   1.540 +            key |= tableFlag;
   1.541 +            fKeyVec->setElementAt(key, i);
   1.542 +            return;
   1.543 +        }
   1.544 +        keyHasMultipleValues = TRUE;
   1.545 +    }
   1.546 +
   1.547 +    // Need to add a new entry to the binary data being built for this mapping.
   1.548 +    // Includes adding entries to both the key table and the parallel values table.
   1.549 +
   1.550 +    int32_t newKey = keyChar | tableFlag;
   1.551 +    if (keyHasMultipleValues) {
   1.552 +        newKey |= USPOOF_KEY_MULTIPLE_VALUES;
   1.553 +    }
   1.554 +    int32_t adjustedMappingLength = targetMapping->fStr->length() - 1;
   1.555 +    if (adjustedMappingLength>3) {
   1.556 +        adjustedMappingLength = 3;
   1.557 +    }
   1.558 +    newKey |= adjustedMappingLength << USPOOF_KEY_LENGTH_SHIFT;
   1.559 +
   1.560 +    int32_t newData = targetMapping->fStrTableIndex;
   1.561 +
   1.562 +    fKeyVec->addElement(newKey, status);
   1.563 +    fValueVec->addElement(newData, status);
   1.564 +
   1.565 +    // If the preceding key entry is for the same key character (but with a different mapping)
   1.566 +    //   set the multiple-values flag on it.
   1.567 +    if (keyHasMultipleValues) {
   1.568 +        int32_t previousKeyIndex = fKeyVec->size() - 2;
   1.569 +        int32_t previousKey = fKeyVec->elementAti(previousKeyIndex);
   1.570 +        previousKey |= USPOOF_KEY_MULTIPLE_VALUES;
   1.571 +        fKeyVec->setElementAt(previousKey, previousKeyIndex);
   1.572 +    }
   1.573 +}
   1.574 +
   1.575 +
   1.576 +
   1.577 +UnicodeString ConfusabledataBuilder::getMapping(int32_t index) {
   1.578 +    int32_t key = fKeyVec->elementAti(index);
   1.579 +    int32_t value = fValueVec->elementAti(index);
   1.580 +    int32_t length = USPOOF_KEY_LENGTH_FIELD(key);
   1.581 +    int32_t lastIndexWithLen;
   1.582 +    switch (length) {
   1.583 +      case 0:
   1.584 +        return UnicodeString(static_cast<UChar>(value));
   1.585 +      case 1:
   1.586 +      case 2:
   1.587 +        return UnicodeString(*fStringTable, value, length+1);
   1.588 +      case 3:
   1.589 +        length = 0;
   1.590 +        int32_t i;
   1.591 +        for (i=0; i<fStringLengthsTable->size(); i+=2) {
   1.592 +            lastIndexWithLen = fStringLengthsTable->elementAti(i);
   1.593 +            if (value <= lastIndexWithLen) {
   1.594 +                length = fStringLengthsTable->elementAti(i+1);
   1.595 +                break;
   1.596 +            }
   1.597 +        }
   1.598 +        U_ASSERT(length>=3);
   1.599 +        return UnicodeString(*fStringTable, value, length);
   1.600 +      default:
   1.601 +        U_ASSERT(FALSE);
   1.602 +    }
   1.603 +    return UnicodeString();
   1.604 +}
   1.605 +
   1.606 +#endif
   1.607 +#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
   1.608 +

mercurial