The Tor Browser: diff intl/icu/source/i18n/ucol

     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/i18n/ucol_tok.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,2446 @@
     1.4 +/*
     1.5 +*******************************************************************************
     1.6 +*
     1.7 +*   Copyright (C) 2001-2012, International Business Machines
     1.8 +*   Corporation and others.  All Rights Reserved.
     1.9 +*
    1.10 +*******************************************************************************
    1.11 +*   file name:  ucol_tok.cpp
    1.12 +*   encoding:   US-ASCII
    1.13 +*   tab size:   8 (not used)
    1.14 +*   indentation:4
    1.15 +*
    1.16 +*   created 02/22/2001
    1.17 +*   created by: Vladimir Weinstein
    1.18 +*
    1.19 +* This module reads a tailoring rule string and produces a list of
    1.20 +* tokens that will be turned into collation elements
    1.21 +*
    1.22 +*/
    1.23 +
    1.24 +#include "unicode/utypes.h"
    1.25 +
    1.26 +#if !UCONFIG_NO_COLLATION
    1.27 +
    1.28 +#include "unicode/uscript.h"
    1.29 +#include "unicode/ustring.h"
    1.30 +#include "unicode/uchar.h"
    1.31 +#include "unicode/uniset.h"
    1.32 +
    1.33 +#include "cmemory.h"
    1.34 +#include "cstring.h"
    1.35 +#include "patternprops.h"
    1.36 +#include "ucol_bld.h"
    1.37 +#include "ucol_tok.h"
    1.38 +#include "ulocimp.h"
    1.39 +#include "uresimp.h"
    1.40 +
    1.41 +// Define this only for debugging.
    1.42 +// #define DEBUG_FOR_COLL_RULES 1
    1.43 +
    1.44 +#ifdef DEBUG_FOR_COLL_RULES
    1.45 +#include <iostream>
    1.46 +#endif
    1.47 +
    1.48 +U_NAMESPACE_USE
    1.49 +
    1.50 +U_CDECL_BEGIN
    1.51 +/**
    1.52 + * uhash hash callback for UColToken keys.
    1.53 + * The top 8 bits of key->source hold the token length and the low 24 bits its
    1.54 + * offset from *key->rulesToParseHdl.  Tokens of 64 or more UChars are sampled
    1.55 + * with a stride (inc) instead of being hashed in full.
    1.56 + * @param k hash key whose pointer member is a UColToken* (may be NULL)
    1.57 + * @return the hash code; 0 for a NULL key or a zero-length token
    1.58 + */
    1.59 +static int32_t U_CALLCONV
    1.60 +uhash_hashTokens(const UHashTok k)
    1.61 +{
    1.62 +    // Accumulate unsigned: hash*37 wraps for longer tokens, and signed
    1.63 +    // overflow would be undefined behavior.
    1.64 +    uint32_t hash = 0;
    1.65 +    UColToken *key = (UColToken *)k.pointer;
    1.66 +    if (key != 0) {
    1.67 +        int32_t len = (key->source & 0xFF000000)>>24;
    1.68 +        int32_t inc = ((len - 32) / 32) + 1;
    1.69 +
    1.70 +        const UChar *p = (key->source & 0x00FFFFFF) + *(key->rulesToParseHdl);
    1.71 +        const UChar *limit = p + len;
    1.72 +
    1.73 +        while (p<limit) {
    1.74 +            hash = (hash * 37) + *p;
    1.75 +            p += inc;
    1.76 +        }
    1.77 +    }
    1.78 +    return (int32_t)hash;
    1.79 +}
    1.71 +
    1.72 +/**
    1.73 + * uhash key comparator for UColToken keys.
    1.74 + * Tokens are equal when they are the same object, carry the same packed
    1.75 + * source value, or have source strings of equal length and content.
    1.76 + * @param key1 hash key whose pointer member is a UColToken*
    1.77 + * @param key2 hash key whose pointer member is a UColToken*
    1.78 + * @return TRUE if the two tokens compare equal, FALSE otherwise
    1.79 + */
    1.80 +static UBool U_CALLCONV
    1.81 +uhash_compareTokens(const UHashTok key1, const UHashTok key2)
    1.82 +{
    1.83 +    UColToken *p1 = (UColToken *)key1.pointer;
    1.84 +    UColToken *p2 = (UColToken *)key2.pointer;
    1.85 +
    1.86 +    // Identity and NULL checks come before anything is dereferenced.
    1.87 +    if (p1 == p2) {
    1.88 +        return TRUE;
    1.89 +    }
    1.90 +    if (p1 == NULL || p2 == NULL) {
    1.91 +        return FALSE;
    1.92 +    }
    1.93 +    if (p1->source == 0 || p2->source == 0) {
    1.94 +        return FALSE;
    1.95 +    }
    1.96 +
    1.97 +    // Length lives in the top 8 bits of source, the rule offset in the low 24.
    1.98 +    uint32_t s1L = ((p1->source & 0xFF000000) >> 24);
    1.99 +    uint32_t s2L = ((p2->source & 0xFF000000) >> 24);
   1.100 +    if(s1L != s2L) {
   1.101 +        return FALSE;
   1.102 +    }
   1.103 +    if(p1->source == p2->source) {
   1.104 +        return TRUE;
   1.105 +    }
   1.106 +
   1.107 +    const UChar *s1 = (p1->source & 0x00FFFFFF) + *(p1->rulesToParseHdl);
   1.108 +    const UChar *s2 = (p2->source & 0x00FFFFFF) + *(p2->rulesToParseHdl);
   1.109 +    // Compare exactly s1L UChars, never touching memory past the token.
   1.110 +    for (uint32_t n = 0; n < s1L; ++n) {
   1.111 +        if (s1[n] != s2[n]) {
   1.112 +            return FALSE;
   1.113 +        }
   1.114 +    }
   1.115 +    return TRUE;
   1.116 +}
   1.107 +U_CDECL_END
   1.108 +
   1.109 +/*
   1.110 + * Debug messages used to pinpoint where a format error occurred.
   1.111 + * A better way is to include context-sensitive information in syntaxError() function.
   1.112 + *
   1.113 + * To turn this debugging on, either uncomment the following line, or use -DDEBUG_FOR_FORMAT_ERROR
   1.114 + * in the compile line.
   1.115 + */
   1.116 +/* #define DEBUG_FOR_FORMAT_ERROR 1 */
   1.117 +
   1.118 +#ifdef DEBUG_FOR_FORMAT_ERROR
   1.119 +#define DBG_FORMAT_ERROR { printf("U_INVALID_FORMAT_ERROR at line %d", __LINE__);}
   1.120 +#else
   1.121 +#define DBG_FORMAT_ERROR
   1.122 +#endif
   1.123 +
   1.124 +
   1.125 +/*
   1.126 + * Controls debug messages so that the output can be compared before and after a
   1.127 + * big change.  Prints the information of every code point that comes out of the
   1.128 + * collation parser and its strength into a file.  When a big change in format
   1.129 + * happens, the files before and after the change should be identical.
   1.130 + *
   1.131 + * To turn this debugging on, either uncomment the following line, or use -DDEBUG_FOR_CODE_POINTS
   1.132 + * in the compile line.
   1.133 + */
   1.134 +// #define DEBUG_FOR_CODE_POINTS 1
   1.135 +
   1.136 +#ifdef DEBUG_FOR_CODE_POINTS
   1.137 +    FILE* dfcp_fp = NULL;
   1.138 +#endif
   1.139 +
   1.140 +
   1.141 +typedef struct {           /* one entry of ucolIndirectBoundaries[] */
   1.142 +    uint32_t startCE;      /* set from start[0] by setIndirectBoundaries() */
   1.143 +    uint32_t startContCE;  /* set from start[1] by setIndirectBoundaries() */
   1.144 +    uint32_t limitCE;      /* set from end[0], or 0 when no end is supplied */
   1.145 +    uint32_t limitContCE;  /* set from end[1], or 0 when no end is supplied */
   1.146 +} indirectBoundaries;
   1.147 +
   1.148 +/* these values are used for finding CE values for indirect positioning. */
   1.149 +/* Indirect positioning is a mechanism for allowing resets on symbolic   */
   1.150 +/* values. It only works for resets and you cannot tailor indirect names */
   1.151 +/* An indirect name can define either an anchor point or a range. An     */
   1.152 +/* anchor point behaves in exactly the same way as a code point in reset */
   1.153 +/* would, except that it cannot be tailored. A range (we currently only  */
   1.154 +/* know of the [top] range) will explicitly set the upper bound for      */
   1.155 +/* generated CEs, thus allowing for better control over how many CEs can */
   1.156 +/* be squeezed between in the range without performance penalty.         */
   1.157 +/* In that respect, we use [top] for tailoring of locales that use CJK   */
   1.158 +/* characters. Other indirect values are currently a pure convenience,   */
   1.159 +/* they can be used to assure that the CEs will be always positioned in  */
   1.160 +/* the same place relative to a point with known properties (e.g. first  */
   1.161 +/* primary ignorable). */
   1.162 +static indirectBoundaries ucolIndirectBoundaries[15];
   1.163 +/*
   1.164 +static indirectBoundaries ucolIndirectBoundaries[11] = {
   1.165 +{ UCOL_RESET_TOP_VALUE,               0,
   1.166 +UCOL_NEXT_TOP_VALUE,                0 },
   1.167 +{ UCOL_FIRST_PRIMARY_IGNORABLE,       0,
   1.168 +0,                                  0 },
   1.169 +{ UCOL_LAST_PRIMARY_IGNORABLE,        UCOL_LAST_PRIMARY_IGNORABLE_CONT,
   1.170 +0,                                  0 },
   1.171 +{ UCOL_FIRST_SECONDARY_IGNORABLE,     0,
   1.172 +0,                                  0 },
   1.173 +{ UCOL_LAST_SECONDARY_IGNORABLE,      0,
   1.174 +0,                                  0 },
   1.175 +{ UCOL_FIRST_TERTIARY_IGNORABLE,      0,
   1.176 +0,                                  0 },
   1.177 +{ UCOL_LAST_TERTIARY_IGNORABLE,       0,
   1.178 +0,                                  0 },
   1.179 +{ UCOL_FIRST_VARIABLE,                0,
   1.180 +0,                                  0 },
   1.181 +{ UCOL_LAST_VARIABLE,                 0,
   1.182 +0,                                  0 },
   1.183 +{ UCOL_FIRST_NON_VARIABLE,            0,
   1.184 +0,                                  0 },
   1.185 +{ UCOL_LAST_NON_VARIABLE,             0,
   1.186 +0,                                  0 },
   1.187 +};
   1.188 +*/
   1.189 +
   1.190 +/* Stores the CE pair start[0..1] in ucolIndirectBoundaries[indexR], together with */
   1.191 +/* the limit pair end[0..1], or zeros for the limit when end is NULL.              */
   1.192 +static void setIndirectBoundaries(uint32_t indexR, uint32_t *start, uint32_t *end) {
   1.193 +    // TODO (carried over): once we have values for all the indirects,
   1.194 +    // they are going to be initialized here.
   1.195 +    indirectBoundaries *entry = &ucolIndirectBoundaries[indexR];
   1.196 +
   1.197 +    entry->startCE = start[0];
   1.198 +    entry->startContCE = start[1];
   1.199 +    entry->limitCE = end ? end[0] : 0;
   1.200 +    entry->limitContCE = end ? end[1] : 0;
   1.201 +}
   1.204 +
   1.205 +
   1.206 +/*
   1.207 + * Fills parseError for a problem at rules[pos]: records the offset (line is
   1.208 + * always 0) and copies NUL-terminated excerpts of the text before pos and of
   1.209 + * the text following rules[pos] into preContext and postContext.
   1.210 + */
   1.211 +static inline
   1.212 +void syntaxError(const UChar* rules,
   1.213 +                 int32_t pos,
   1.214 +                 int32_t rulesLen,
   1.215 +                 UParseError* parseError)
   1.216 +{
   1.217 +    parseError->offset = pos;
   1.218 +    parseError->line = 0; /* line numbers are not used */
   1.219 +
   1.220 +    // pre-context: at most U_PARSE_CONTEXT_LEN-1 UChars ending just before pos
   1.221 +    const int32_t preStart = (pos >= U_PARSE_CONTEXT_LEN) ? (pos - (U_PARSE_CONTEXT_LEN-1)) : 0;
   1.222 +    const int32_t preLen = pos - preStart;
   1.223 +    u_memcpy(parseError->preContext, rules+preStart, preLen);
   1.224 +    parseError->preContext[preLen] = 0;
   1.225 +
   1.226 +    // post-context: starts one past pos and is clipped to the end of the rules
   1.227 +    const int32_t postStart = pos+1;
   1.228 +    const int32_t postStop = ((pos+U_PARSE_CONTEXT_LEN) > rulesLen) ? rulesLen
   1.229 +                                                                    : (pos+(U_PARSE_CONTEXT_LEN-1));
   1.230 +    int32_t postLen = 0;
   1.231 +    if(postStart < postStop) {
   1.232 +        postLen = postStop - postStart;
   1.233 +        u_memcpy(parseError->postContext, rules+postStart, postLen);
   1.234 +    }
   1.235 +    parseError->postContext[postLen] = 0;
   1.236 +}
   1.236 +
   1.237 +/* Copies value into the UColOptionSet field that corresponds to attrib.      */
   1.238 +/* UCOL_ATTRIBUTE_COUNT and any attribute not listed leave opts untouched.    */
   1.239 +static
   1.240 +void ucol_uprv_tok_setOptionInImage(UColOptionSet *opts, UColAttribute attrib, UColAttributeValue value) {
   1.241 +    switch(attrib) {
   1.242 +    case UCOL_STRENGTH:
   1.243 +        opts->strength = value;
   1.244 +        return;
   1.245 +    case UCOL_ALTERNATE_HANDLING:
   1.246 +        opts->alternateHandling = value;
   1.247 +        return;
   1.248 +    case UCOL_FRENCH_COLLATION:
   1.249 +        opts->frenchCollation = value;
   1.250 +        return;
   1.251 +    case UCOL_CASE_FIRST:
   1.252 +        opts->caseFirst = value;
   1.253 +        return;
   1.254 +    case UCOL_CASE_LEVEL:
   1.255 +        opts->caseLevel = value;
   1.256 +        return;
   1.257 +    case UCOL_NORMALIZATION_MODE:
   1.258 +        opts->normalizationMode = value;
   1.259 +        return;
   1.260 +    case UCOL_HIRAGANA_QUATERNARY_MODE:
   1.261 +        opts->hiraganaQ = value;
   1.262 +        return;
   1.263 +    case UCOL_NUMERIC_COLLATION:
   1.264 +        opts->numericCollation = value;
   1.265 +        return;
   1.266 +    case UCOL_ATTRIBUTE_COUNT:
   1.267 +    default:
   1.268 +        return;
   1.269 +    }
   1.270 +}
   1.269 +
   1.270 +#define UTOK_OPTION_COUNT 22 /* number of rows in rulesOptions[] */
   1.271 +
   1.272 +static UBool didInit = FALSE; /* set to TRUE by ucol_uprv_tok_initData() after its U_STRING_INIT calls */
   1.273 +/* we can be strict, or we can be lenient */
   1.274 +/* I'd surely be lenient with the option arguments */
   1.275 +/* maybe even with options */
   1.276 +U_STRING_DECL(suboption_00, "non-ignorable", 13); /* suboption_NN: value keywords referenced by the *Sub tables */
   1.277 +U_STRING_DECL(suboption_01, "shifted",        7);
   1.278 +
   1.279 +U_STRING_DECL(suboption_02, "lower",          5);
   1.280 +U_STRING_DECL(suboption_03, "upper",          5);
   1.281 +U_STRING_DECL(suboption_04, "off",            3);
   1.282 +U_STRING_DECL(suboption_05, "on",             2);
   1.283 +U_STRING_DECL(suboption_06, "1",              1);
   1.284 +U_STRING_DECL(suboption_07, "2",              1);
   1.285 +U_STRING_DECL(suboption_08, "3",              1);
   1.286 +U_STRING_DECL(suboption_09, "4",              1);
   1.287 +U_STRING_DECL(suboption_10, "I",              1);
   1.288 +
   1.289 +U_STRING_DECL(suboption_11, "primary",        7);
   1.290 +U_STRING_DECL(suboption_12, "secondary",      9);
   1.291 +U_STRING_DECL(suboption_13, "tertiary",       8);
   1.292 +U_STRING_DECL(suboption_14, "variable",       8);
   1.293 +U_STRING_DECL(suboption_15, "regular",        7);
   1.294 +U_STRING_DECL(suboption_16, "implicit",       8);
   1.295 +U_STRING_DECL(suboption_17, "trailing",       8);
   1.296 +
   1.297 +
   1.298 +U_STRING_DECL(option_00,    "undefined",      9); /* option_NN: option names referenced by rulesOptions[] */
   1.299 +U_STRING_DECL(option_01,    "rearrange",      9);
   1.300 +U_STRING_DECL(option_02,    "alternate",      9);
   1.301 +U_STRING_DECL(option_03,    "backwards",      9);
   1.302 +U_STRING_DECL(option_04,    "variable top",  12);
   1.303 +U_STRING_DECL(option_05,    "top",            3);
   1.304 +U_STRING_DECL(option_06,    "normalization", 13);
   1.305 +U_STRING_DECL(option_07,    "caseLevel",      9);
   1.306 +U_STRING_DECL(option_08,    "caseFirst",      9);
   1.307 +U_STRING_DECL(option_09,    "scriptOrder",   11);
   1.308 +U_STRING_DECL(option_10,    "charsetname",   11);
   1.309 +U_STRING_DECL(option_11,    "charset",        7);
   1.310 +U_STRING_DECL(option_12,    "before",         6);
   1.311 +U_STRING_DECL(option_13,    "hiraganaQ",      9);
   1.312 +U_STRING_DECL(option_14,    "strength",       8);
   1.313 +U_STRING_DECL(option_15,    "first",          5);
   1.314 +U_STRING_DECL(option_16,    "last",           4);
   1.315 +U_STRING_DECL(option_17,    "optimize",       8);
   1.316 +U_STRING_DECL(option_18,    "suppressContractions",         20);
   1.317 +U_STRING_DECL(option_19,    "numericOrdering",              15);
   1.318 +U_STRING_DECL(option_20,    "import",         6);
   1.319 +U_STRING_DECL(option_21,    "reorder",         7);
   1.320 +
   1.321 +/*
   1.322 +[last variable] last variable value
   1.323 +[last primary ignorable] largest CE for primary ignorable
   1.324 +[last secondary ignorable] largest CE for secondary ignorable
   1.325 +[last tertiary ignorable] largest CE for tertiary ignorable
   1.326 +[top] guaranteed to be above all implicit CEs, for now and in the future (in 1.8)
   1.327 +*/
   1.328 +
   1.329 +
   1.330 +static const ucolTokSuboption alternateSub[2] = { /* values of the "alternate" option */
   1.331 +    {suboption_00, 13, UCOL_NON_IGNORABLE},
   1.332 +    {suboption_01,  7, UCOL_SHIFTED}
   1.333 +};
   1.334 +
   1.335 +static const ucolTokSuboption caseFirstSub[3] = { /* values of the "caseFirst" option */
   1.336 +    {suboption_02, 5, UCOL_LOWER_FIRST},
   1.337 +    {suboption_03,  5, UCOL_UPPER_FIRST},
   1.338 +    {suboption_04,  3, UCOL_OFF},
   1.339 +};
   1.340 +
   1.341 +static const ucolTokSuboption onOffSub[2] = { /* shared by caseLevel, normalization, hiraganaQ and numericOrdering */
   1.342 +    {suboption_04, 3, UCOL_OFF},
   1.343 +    {suboption_05, 2, UCOL_ON}
   1.344 +};
   1.345 +
   1.346 +static const ucolTokSuboption frenchSub[1] = { /* "backwards" accepts only "2", which maps to UCOL_ON */
   1.347 +    {suboption_07, 1, UCOL_ON}
   1.348 +};
   1.349 +
   1.350 +static const ucolTokSuboption beforeSub[3] = { /* values of the "before" option: "1", "2", "3" */
   1.351 +    {suboption_06, 1, UCOL_PRIMARY},
   1.352 +    {suboption_07, 1, UCOL_SECONDARY},
   1.353 +    {suboption_08, 1, UCOL_TERTIARY}
   1.354 +};
   1.355 +
   1.356 +static const ucolTokSuboption strengthSub[5] = { /* values of the "strength" option: "1".."4" and "I" */
   1.357 +    {suboption_06, 1, UCOL_PRIMARY},
   1.358 +    {suboption_07, 1, UCOL_SECONDARY},
   1.359 +    {suboption_08, 1, UCOL_TERTIARY},
   1.360 +    {suboption_09, 1, UCOL_QUATERNARY},
   1.361 +    {suboption_10, 1, UCOL_IDENTICAL},
   1.362 +};
   1.363 +
   1.364 +static const ucolTokSuboption firstLastSub[7] = { /* names for "first"/"last"; NOTE(review): every entry carries UCOL_PRIMARY, presumably a placeholder - confirm it is unused */
   1.365 +    {suboption_11, 7, UCOL_PRIMARY},
   1.366 +    {suboption_12, 9, UCOL_PRIMARY},
   1.367 +    {suboption_13, 8, UCOL_PRIMARY},
   1.368 +    {suboption_14, 8, UCOL_PRIMARY},
   1.369 +    {suboption_15, 7, UCOL_PRIMARY},
   1.370 +    {suboption_16, 8, UCOL_PRIMARY},
   1.371 +    {suboption_17, 8, UCOL_PRIMARY},
   1.372 +};
   1.373 +
   1.374 +enum OptionNumber { /* row indices of rulesOptions[]; the order here must match that table */
   1.375 +    OPTION_ALTERNATE_HANDLING = 0,
   1.376 +    OPTION_FRENCH_COLLATION,
   1.377 +    OPTION_CASE_LEVEL,
   1.378 +    OPTION_CASE_FIRST,
   1.379 +    OPTION_NORMALIZATION_MODE,
   1.380 +    OPTION_HIRAGANA_QUATERNARY,
   1.381 +    OPTION_STRENGTH,
   1.382 +    OPTION_NUMERIC_COLLATION,
   1.383 +    OPTION_NORMAL_OPTIONS_LIMIT = OPTION_NUMERIC_COLLATION, /* alias: last row that names a real UColAttribute in rulesOptions[] */
   1.384 +    OPTION_VARIABLE_TOP,
   1.385 +    OPTION_REARRANGE,
   1.386 +    OPTION_BEFORE,
   1.387 +    OPTION_TOP,
   1.388 +    OPTION_FIRST,
   1.389 +    OPTION_LAST,
   1.390 +    OPTION_OPTIMIZE,
   1.391 +    OPTION_SUPPRESS_CONTRACTIONS,
   1.392 +    OPTION_UNDEFINED,
   1.393 +    OPTION_SCRIPT_ORDER,
   1.394 +    OPTION_CHARSET_NAME,
   1.395 +    OPTION_CHARSET,
   1.396 +    OPTION_IMPORT,
   1.397 +    OPTION_SCRIPTREORDER
   1.398 +} ;
   1.399 +
   1.400 +static const ucolTokOption rulesOptions[UTOK_OPTION_COUNT] = { /* rows: {optionName, optionLen, subopts, subSize, attr}; row order mirrors enum OptionNumber */
   1.401 +    /*00*/ {option_02,  9, alternateSub, 2, UCOL_ALTERNATE_HANDLING}, /*"alternate" */
   1.402 +    /*01*/ {option_03,  9, frenchSub, 1, UCOL_FRENCH_COLLATION}, /*"backwards"      */
   1.403 +    /*02*/ {option_07,  9, onOffSub, 2, UCOL_CASE_LEVEL},  /*"caseLevel"      */
   1.404 +    /*03*/ {option_08,  9, caseFirstSub, 3, UCOL_CASE_FIRST}, /*"caseFirst"   */
   1.405 +    /*04*/ {option_06, 13, onOffSub, 2, UCOL_NORMALIZATION_MODE}, /*"normalization" */
   1.406 +    /*05*/ {option_13, 9, onOffSub, 2, UCOL_HIRAGANA_QUATERNARY_MODE}, /*"hiraganaQ" */
   1.407 +    /*06*/ {option_14, 8, strengthSub, 5, UCOL_STRENGTH}, /*"strength" */
   1.408 +    /*07*/ {option_19, 15, onOffSub, 2, UCOL_NUMERIC_COLLATION},  /*"numericOrdering"*/
   1.409 +    /*08*/ {option_04, 12, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"variable top"   */
   1.410 +    /*09*/ {option_01,  9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"rearrange"      */
   1.411 +    /*10*/ {option_12,  6, beforeSub, 3, UCOL_ATTRIBUTE_COUNT}, /*"before"    */
   1.412 +    /*11*/ {option_05,  3, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"top"            */
   1.413 +    /*12*/ {option_15,  5, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"first" */
   1.414 +    /*13*/ {option_16,  4, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"last" */
   1.415 +    /*14*/ {option_17,  8, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"optimize"      */
   1.416 +    /*15*/ {option_18, 20, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"suppressContractions"      */
   1.417 +    /*16*/ {option_00,  9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"undefined"      */
   1.418 +    /*17*/ {option_09, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"scriptOrder"    */
   1.419 +    /*18*/ {option_10, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"charsetname"    */
   1.420 +    /*19*/ {option_11,  7, NULL, 0, UCOL_ATTRIBUTE_COUNT},  /*"charset"        */
   1.421 +    /*20*/ {option_20,  6, NULL, 0, UCOL_ATTRIBUTE_COUNT},  /*"import"        */
   1.422 +    /*21*/ {option_21,  7, NULL, 0, UCOL_ATTRIBUTE_COUNT}  /*"reorder"        */
   1.423 +};
   1.424 +
   1.425 +/*
   1.426 + * Compares at most n UChars of s1 and s2 after u_tolower() folding.
   1.427 + * Stops at the first difference or at a NUL in s1.
   1.428 + * Returns the difference of the folded UChars at the stopping position,
   1.429 + * or 0 when n <= 0 or all n UChars matched.
   1.430 + */
   1.431 +static
   1.432 +int32_t u_strncmpNoCase(const UChar     *s1,
   1.433 +                        const UChar     *s2,
   1.434 +                        int32_t     n)
   1.435 +{
   1.436 +    for(int32_t idx = 0; idx < n; ++idx) {
   1.437 +        const int32_t diff = (int32_t)u_tolower(s1[idx]) - (int32_t)u_tolower(s2[idx]);
   1.438 +        if(diff != 0 || s1[idx] == 0) {
   1.439 +            return diff;
   1.440 +        }
   1.441 +    }
   1.442 +    return 0;
   1.443 +}
   1.443 +
   1.444 +/*
   1.445 + * One-time initialization of the option and suboption name strings.
   1.446 + * Every U_STRING_INIT must use the same text and length as the matching
   1.447 + * U_STRING_DECL; option_20 used to be initialized from "import " (trailing
   1.448 + * space), which disagreed with its declaration.
   1.449 + * NOTE(review): didInit is a plain static flag and no synchronization is
   1.450 + * visible here - confirm that callers serialize the first call.
   1.451 + */
   1.452 +static
   1.453 +void ucol_uprv_tok_initData() {
   1.454 +    if(!didInit) {
   1.455 +        U_STRING_INIT(suboption_00, "non-ignorable", 13);
   1.456 +        U_STRING_INIT(suboption_01, "shifted",        7);
   1.457 +
   1.458 +        U_STRING_INIT(suboption_02, "lower",          5);
   1.459 +        U_STRING_INIT(suboption_03, "upper",          5);
   1.460 +        U_STRING_INIT(suboption_04, "off",            3);
   1.461 +        U_STRING_INIT(suboption_05, "on",             2);
   1.462 +
   1.463 +        U_STRING_INIT(suboption_06, "1",              1);
   1.464 +        U_STRING_INIT(suboption_07, "2",              1);
   1.465 +        U_STRING_INIT(suboption_08, "3",              1);
   1.466 +        U_STRING_INIT(suboption_09, "4",              1);
   1.467 +        U_STRING_INIT(suboption_10, "I",              1);
   1.468 +
   1.469 +        U_STRING_INIT(suboption_11, "primary",        7);
   1.470 +        U_STRING_INIT(suboption_12, "secondary",      9);
   1.471 +        U_STRING_INIT(suboption_13, "tertiary",       8);
   1.472 +        U_STRING_INIT(suboption_14, "variable",       8);
   1.473 +        U_STRING_INIT(suboption_15, "regular",        7);
   1.474 +        U_STRING_INIT(suboption_16, "implicit",       8);
   1.475 +        U_STRING_INIT(suboption_17, "trailing",       8);
   1.476 +
   1.477 +
   1.478 +        U_STRING_INIT(option_00, "undefined",      9);
   1.479 +        U_STRING_INIT(option_01, "rearrange",      9);
   1.480 +        U_STRING_INIT(option_02, "alternate",      9);
   1.481 +        U_STRING_INIT(option_03, "backwards",      9);
   1.482 +        U_STRING_INIT(option_04, "variable top",  12);
   1.483 +        U_STRING_INIT(option_05, "top",            3);
   1.484 +        U_STRING_INIT(option_06, "normalization", 13);
   1.485 +        U_STRING_INIT(option_07, "caseLevel",      9);
   1.486 +        U_STRING_INIT(option_08, "caseFirst",      9);
   1.487 +        U_STRING_INIT(option_09, "scriptOrder",   11);
   1.488 +        U_STRING_INIT(option_10, "charsetname",   11);
   1.489 +        U_STRING_INIT(option_11, "charset",        7);
   1.490 +        U_STRING_INIT(option_12, "before",         6);
   1.491 +        U_STRING_INIT(option_13, "hiraganaQ",      9);
   1.492 +        U_STRING_INIT(option_14, "strength",       8);
   1.493 +        U_STRING_INIT(option_15, "first",          5);
   1.494 +        U_STRING_INIT(option_16, "last",           4);
   1.495 +        U_STRING_INIT(option_17, "optimize",       8);
   1.496 +        U_STRING_INIT(option_18, "suppressContractions",         20);
   1.497 +        U_STRING_INIT(option_19, "numericOrdering",      15);
   1.498 +        U_STRING_INIT(option_20, "import",         6);
   1.499 +        U_STRING_INIT(option_21, "reorder",        7);
   1.500 +        didInit = TRUE;
   1.501 +    }
   1.502 +}
   1.495 +
   1.496 +
   1.497 +// This function reads basic options to set in the runtime collator
   1.498 +// used by data driven tests. Should not support build time options.
   1.499 +// Every read is bounded by end, so the text need not be NUL-terminated.
   1.500 +// @param start  first UChar to examine; leading white space is skipped
   1.501 +// @param end    one past the last UChar that may be read
   1.502 +// @param attrib set to the attribute of the option once option and value are recognized
   1.503 +// @param value  set to the attribute value of the recognized suboption
   1.504 +// @param status set to U_ILLEGAL_ARGUMENT_ERROR when the text is not a
   1.505 +//               bracketed, known option with a known value and a closing ']'
   1.506 +// @return pointer just past the closing ']'; NULL on error, and also NULL
   1.507 +//         (with status untouched) when only white space is left before end
   1.508 +U_CAPI const UChar * U_EXPORT2
   1.509 +ucol_tok_getNextArgument(const UChar *start, const UChar *end,
   1.510 +                         UColAttribute *attrib, UColAttributeValue *value,
   1.511 +                         UErrorCode *status)
   1.512 +{
   1.513 +    uint32_t i = 0;
   1.514 +    int32_t j=0;
   1.515 +    UBool foundOption = FALSE;
   1.516 +    const UChar *optionArg = NULL;
   1.517 +
   1.518 +    ucol_uprv_tok_initData();
   1.519 +
   1.520 +    while(start < end && PatternProps::isWhiteSpace(*start)) { /* eat whitespace */
   1.521 +        start++;
   1.522 +    }
   1.523 +    if(start >= end) {
   1.524 +        return NULL;
   1.525 +    }
   1.526 +    /* skip opening '[' */
   1.527 +    if(*start == 0x005b) {
   1.528 +        start++;
   1.529 +    } else {
   1.530 +        *status = U_ILLEGAL_ARGUMENT_ERROR; // no opening '['
   1.531 +        return NULL;
   1.532 +    }
   1.533 +
   1.534 +    while(i < UTOK_OPTION_COUNT) {
   1.535 +        /* only compare when the whole option name fits before end */
   1.536 +        if(end - start >= rulesOptions[i].optionLen &&
   1.537 +           u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].optionLen) == 0) {
   1.538 +            foundOption = TRUE;
   1.539 +            if(end - start > rulesOptions[i].optionLen) {
   1.540 +                optionArg = start+rulesOptions[i].optionLen+1; /* start of the options, skip space */
   1.541 +                while(optionArg < end && PatternProps::isWhiteSpace(*optionArg)) { /* eat whitespace */
   1.542 +                    optionArg++;
   1.543 +                }
   1.544 +            }
   1.545 +            break;
   1.546 +        }
   1.547 +        i++;
   1.548 +    }
   1.549 +
   1.550 +    if(!foundOption) {
   1.551 +        *status = U_ILLEGAL_ARGUMENT_ERROR;
   1.552 +        return NULL;
   1.553 +    }
   1.554 +
   1.555 +    if(optionArg) {
   1.556 +        for(j = 0; j<rulesOptions[i].subSize; j++) {
   1.557 +            /* only compare when the whole suboption name fits before end */
   1.558 +            if(end - optionArg >= rulesOptions[i].subopts[j].subLen &&
   1.559 +               u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
   1.560 +                *attrib = rulesOptions[i].attr;
   1.561 +                *value = rulesOptions[i].subopts[j].attrVal;
   1.562 +                optionArg += rulesOptions[i].subopts[j].subLen;
   1.563 +                while(optionArg < end && PatternProps::isWhiteSpace(*optionArg)) { /* eat whitespace */
   1.564 +                    optionArg++;
   1.565 +                }
   1.566 +                if(optionArg < end && *optionArg == 0x005d) {
   1.567 +                    optionArg++;
   1.568 +                    return optionArg;
   1.569 +                } else {
   1.570 +                    *status = U_ILLEGAL_ARGUMENT_ERROR; // no closing ']' before end
   1.571 +                    return NULL;
   1.572 +                }
   1.573 +            }
   1.574 +        }
   1.575 +    }
   1.576 +    *status = U_ILLEGAL_ARGUMENT_ERROR;
   1.577 +    return NULL;
   1.578 +}
   1.567 +
   1.568 +/**
   1.569 + * Locates the bracketed set pattern that begins at the first '[' at or after
   1.570 + * start and opens it with uset_openPattern().
   1.571 + * @param start where to begin looking for the opening '['
   1.572 + * @param end limit of the scan; neither the search for '[' nor the bracket
   1.573 + *            matching goes beyond it
   1.574 + * @param status set to U_ILLEGAL_ARGUMENT_ERROR when no '[' is found before
   1.575 + *               end, the brackets do not balance before end, or no further
   1.576 + *               ']' follows the balanced set
   1.577 + * @return the result of uset_openPattern(), or NULL on the errors above
   1.578 + */
   1.579 +static
   1.580 +USet *ucol_uprv_tok_readAndSetUnicodeSet(const UChar *start, const UChar *end, UErrorCode *status) {
   1.581 +    while(start < end && *start != 0x005b) { /* advance until we find the first '[' */
   1.582 +        start++;
   1.583 +    }
   1.584 +    if(start >= end) { /* ran out of text without seeing a '[' */
   1.585 +        *status = U_ILLEGAL_ARGUMENT_ERROR;
   1.586 +        return NULL;
   1.587 +    }
   1.588 +    // Now we need a balanced run of '[' ... ']': a set can nest several
   1.589 +    // brackets, so count them until the one opened at start is closed.
   1.590 +    int32_t noOpenBraces = 1;
   1.591 +    int32_t current = 1; // skip the opening brace
   1.592 +    while(start+current < end && noOpenBraces != 0) {
   1.593 +        if(start[current] == 0x005b) {
   1.594 +            noOpenBraces++;
   1.595 +        } else if(start[current] == 0x005D) { // closing brace
   1.596 +            noOpenBraces--;
   1.597 +        }
   1.598 +        current++;
   1.599 +    }
   1.600 +
   1.601 +    // Besides the balanced set itself, one more ']' must follow it.
   1.602 +    if(noOpenBraces != 0 || u_strchr(start+current, 0x005d /*']'*/) == NULL) {
   1.603 +        *status = U_ILLEGAL_ARGUMENT_ERROR;
   1.604 +        return NULL;
   1.605 +    }
   1.606 +    return uset_openPattern(start, current, status);
   1.607 +}
   1.592 +
   1.593 +/**
   1.594 + * Reads an option and matches the option name with the predefined options. (Case-insensitive.)
   1.595 + * White space is skipped before the name and after it, never going past end.
   1.596 + * @param start Pointer to the start UChar.
   1.597 + * @param end Pointer to the last valid pointer beyond which the option will not extend.
   1.598 + * @param optionArg Address of the pointer at which the options start (after the option name).
   1.599 + *                  Written only when a name matched and text remains after it.
   1.600 + * @return The index of the option in rulesOptions[], or -1 if the option is not valid.
   1.601 + */
   1.602 +static
   1.603 +int32_t ucol_uprv_tok_readOption(const UChar *start, const UChar *end, const UChar **optionArg) {
   1.604 +    int32_t i = 0;
   1.605 +    ucol_uprv_tok_initData();
   1.606 +
   1.607 +    while(start < end && PatternProps::isWhiteSpace(*start)) { /* eat whitespace */
   1.608 +        start++;
   1.609 +    }
   1.610 +    while(i < UTOK_OPTION_COUNT) {
   1.611 +        if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].optionLen) == 0) {
   1.612 +            if(end - start > rulesOptions[i].optionLen) {
   1.613 +                *optionArg = start+rulesOptions[i].optionLen; /* End of option name; start of the options */
   1.614 +                while(*optionArg < end && PatternProps::isWhiteSpace(**optionArg)) { /* eat whitespace */
   1.615 +                    (*optionArg)++;
   1.616 +                }
   1.617 +            }
   1.618 +            break;
   1.619 +        }
   1.620 +        i++;
   1.621 +    }
   1.622 +    if(i == UTOK_OPTION_COUNT) {
   1.623 +        i = -1; // didn't find an option
   1.624 +    }
   1.625 +    return i;
   1.626 +}
   1.625 +
   1.626 +
   1.627 +/**
   1.628 + * Parses the list of reorder codes found between src->current and the next
   1.629 + * ']' into a newly allocated src->reorderCodes array (presumably the body of
   1.630 + * a "reorder" option - the caller is not visible here).
   1.631 + * Codes are separated by U+0020; each must be 4 to 63 UChars long so that it
   1.632 + * fits, NUL-terminated, into the 64-byte conversion buffer.
   1.633 + * @param src parser state; current and end are read, reorderCodes and
   1.634 + *            reorderCodesLength are written once the list has been validated
   1.635 + * @param status U_INVALID_FORMAT_ERROR for a missing ']', an empty list or a
   1.636 + *               token of unacceptable length; U_MEMORY_ALLOCATION_ERROR when
   1.637 + *               the array cannot be allocated; U_ILLEGAL_ARGUMENT_ERROR when
   1.638 + *               a code is not recognized
   1.639 + */
   1.640 +static
   1.641 +void ucol_tok_parseScriptReorder(UColTokenParser *src, UErrorCode *status) {
   1.642 +    int32_t codeCount = 0;
   1.643 +    int32_t codeIndex = 0;
   1.644 +    char conversion[64];
   1.645 +    int32_t tokenLength = 0;
   1.646 +    const UChar* space;
   1.647 +
   1.648 +    const UChar* current = src->current;
   1.649 +    const UChar* end = u_memchr(src->current, 0x005d, src->end - src->current);
   1.650 +    if (end == NULL) {
   1.651 +        // no closing ']': there is no well-defined list to parse
   1.652 +        *status = U_INVALID_FORMAT_ERROR;
   1.653 +        return;
   1.654 +    }
   1.655 +
   1.656 +    // eat leading whitespace
   1.657 +    while(current < end && u_isWhitespace(*current)) {
   1.658 +        current++;
   1.659 +    }
   1.660 +
   1.661 +    // first pass: validate every token and count them
   1.662 +    while(current < end) {
   1.663 +        space = u_memchr(current, 0x0020, end - current);
   1.664 +        space = space == 0 ? end : space;
   1.665 +        tokenLength = (int32_t)(space - current);
   1.666 +        // too short to be a code, or too long for conversion[] plus its NUL
   1.667 +        if (tokenLength < 4 || tokenLength >= (int32_t)sizeof(conversion)) {
   1.668 +            *status = U_INVALID_FORMAT_ERROR;
   1.669 +            return;
   1.670 +        }
   1.671 +        codeCount++;
   1.672 +        current += tokenLength;
   1.673 +        while(current < end && u_isWhitespace(*current)) { /* eat whitespace */
   1.674 +            ++current;
   1.675 +        }
   1.676 +    }
   1.677 +
   1.678 +    if (codeCount == 0) {
   1.679 +        *status = U_INVALID_FORMAT_ERROR;
   1.680 +        return;
   1.681 +    }
   1.682 +
   1.683 +    int32_t *codes = (int32_t*)uprv_malloc(codeCount * sizeof(int32_t));
   1.684 +    if (codes == NULL) {
   1.685 +        *status = U_MEMORY_ALLOCATION_ERROR;
   1.686 +        return;
   1.687 +    }
   1.688 +    src->reorderCodesLength = codeCount;
   1.689 +    src->reorderCodes = codes;
   1.690 +    current = src->current;
   1.691 +
   1.692 +    // eat leading whitespace
   1.693 +    while(current < end && u_isWhitespace(*current)) {
   1.694 +        current++;
   1.695 +    }
   1.696 +
   1.697 +    // second pass: convert each token and look up its code
   1.698 +    while(current < end) {
   1.699 +        space = u_memchr(current, 0x0020, end - current);
   1.700 +        space = space == 0 ? end : space;
   1.701 +        tokenLength = (int32_t)(space - current);
   1.702 +        // already guaranteed by the first pass; kept as a guard for conversion[]
   1.703 +        if (tokenLength < 4 || tokenLength >= (int32_t)sizeof(conversion)) {
   1.704 +            *status = U_ILLEGAL_ARGUMENT_ERROR;
   1.705 +            return;
   1.706 +        } else {
   1.707 +            u_UCharsToChars(current, conversion, tokenLength);
   1.708 +            conversion[tokenLength] = '\0';
   1.709 +            src->reorderCodes[codeIndex] = ucol_findReorderingEntry(conversion);
   1.710 +            if (src->reorderCodes[codeIndex] == USCRIPT_INVALID_CODE) {
   1.711 +                src->reorderCodes[codeIndex] = u_getPropertyValueEnum(UCHAR_SCRIPT, conversion);
   1.712 +            }
   1.713 +            if (src->reorderCodes[codeIndex] == USCRIPT_INVALID_CODE) {
   1.714 +                // unknown code: flag it, but keep filling the remaining entries
   1.715 +                *status = U_ILLEGAL_ARGUMENT_ERROR;
   1.716 +            }
   1.717 +        }
   1.718 +        codeIndex++;
   1.719 +        current += tokenLength;
   1.720 +        while(current < end && u_isWhitespace(*current)) { /* eat whitespace */
   1.721 +            ++current;
   1.722 +        }
   1.723 +    }
   1.724 +}
   1.696 +
   1.697 +// Reads the option that starts at src->current (the opening '[') and applies it.
   1.698 +// Attribute options are stored with ucol_uprv_tok_setOptionInImage(); the other
   1.699 +// options are reported to the caller through the returned UCOL_TOK_* bits.
   1.700 +// OPTION_OPTIMIZE and OPTION_SUPPRESS_CONTRACTIONS take a bracketed argument, as
   1.701 +// in '[optimize [\uAC00-\uD7FF]]', so their argument is skipped by bracket matching.
   1.702 +// On return src->current is on the ']' that closes the option.  If there is no
   1.703 +// such ']', src->current is src->end and *status is U_ILLEGAL_ARGUMENT_ERROR.
   1.704 +static
   1.705 +uint8_t ucol_uprv_tok_readAndSetOption(UColTokenParser *src, UErrorCode *status) {
   1.706 +    const UChar* start = src->current;
   1.707 +    int32_t i = 0, j = 0;
   1.708 +    const UChar *optionArg = NULL;
   1.709 +    uint8_t result = 0; /* stays 0 for OPTION_SCRIPTREORDER and for every failure */
   1.710 +    start++; /*skip opening '['*/
   1.711 +    i = ucol_uprv_tok_readOption(start, src->end, &optionArg);
   1.712 +    if(optionArg) {
   1.713 +        src->current = optionArg;
   1.714 +    }
   1.715 +
   1.716 +    if(i < 0) {
   1.717 +        *status = U_ILLEGAL_ARGUMENT_ERROR;
   1.718 +    } else {
   1.719 +        int32_t noOpenBraces = 1;
   1.720 +        switch(i) {
   1.721 +    case OPTION_ALTERNATE_HANDLING:
   1.722 +    case OPTION_FRENCH_COLLATION:
   1.723 +    case OPTION_CASE_LEVEL:
   1.724 +    case OPTION_CASE_FIRST:
   1.725 +    case OPTION_NORMALIZATION_MODE:
   1.726 +    case OPTION_HIRAGANA_QUATERNARY:
   1.727 +    case OPTION_STRENGTH:
   1.728 +    case OPTION_NUMERIC_COLLATION:
   1.729 +        if(optionArg) {
   1.730 +            for(j = 0; j<rulesOptions[i].subSize; j++) {
   1.731 +                if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
   1.732 +                    ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr, rulesOptions[i].subopts[j].attrVal);
   1.733 +                    result =  UCOL_TOK_SUCCESS;
   1.734 +                }
   1.735 +            }
   1.736 +        }
   1.737 +        if(result == 0) {
   1.738 +            *status = U_ILLEGAL_ARGUMENT_ERROR;
   1.739 +        }
   1.740 +        break;
   1.741 +    case OPTION_VARIABLE_TOP:
   1.742 +        result = UCOL_TOK_SUCCESS | UCOL_TOK_VARIABLE_TOP;
   1.743 +        break;
   1.744 +    case OPTION_REARRANGE:
   1.745 +        result = UCOL_TOK_SUCCESS;
   1.746 +        break;
   1.747 +    case OPTION_BEFORE:
   1.748 +        if(optionArg) {
   1.749 +            for(j = 0; j<rulesOptions[i].subSize; j++) {
   1.750 +                if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
   1.751 +                    result = UCOL_TOK_SUCCESS | (rulesOptions[i].subopts[j].attrVal + 1);
   1.752 +                }
   1.753 +            }
   1.754 +        }
   1.755 +        if(result == 0) {
   1.756 +            *status = U_ILLEGAL_ARGUMENT_ERROR;
   1.757 +        }
   1.758 +        break;
   1.759 +    case OPTION_TOP: /* limit CEs are kept in an array indexed by src->parsedToken.indirectIndex */
   1.760 +        src->parsedToken.indirectIndex = 0;
   1.761 +        result = UCOL_TOK_SUCCESS | UCOL_TOK_TOP;
   1.762 +        break;
   1.763 +    case OPTION_FIRST:
   1.764 +    case OPTION_LAST: /* first, last */
   1.765 +        for(j = 0; optionArg != NULL && j<rulesOptions[i].subSize; j++) { /* no argument: nothing to compare */
   1.766 +            if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
   1.767 +                // the calculation below assumes that OPTION_FIRST and OPTION_LAST are at i and i+1 and that the first
   1.768 +                // element of indirect boundaries is reserved for top.
   1.769 +                src->parsedToken.indirectIndex = (uint16_t)(i-OPTION_FIRST+1+j*2);
   1.770 +                result =  UCOL_TOK_SUCCESS | UCOL_TOK_TOP;
   1.771 +            }
   1.772 +        }
   1.773 +        if(result == 0) {
   1.774 +            *status = U_ILLEGAL_ARGUMENT_ERROR;
   1.775 +        }
   1.776 +        break;
   1.777 +    case OPTION_OPTIMIZE:
   1.778 +    case OPTION_SUPPRESS_CONTRACTIONS:  // copy and remove are handled before normalization
   1.779 +        // skip the bracketed argument so that the search below finds the option's own ']'
   1.780 +        src->current++; // skip opening brace
   1.781 +        while(src->current < src->end && noOpenBraces != 0) {
   1.782 +            if(*src->current == 0x005b) {
   1.783 +                noOpenBraces++;
   1.784 +            } else if(*src->current == 0x005D) { // closing brace
   1.785 +                noOpenBraces--;
   1.786 +            }
   1.787 +            src->current++;
   1.788 +        }
   1.789 +        result = UCOL_TOK_SUCCESS;
   1.790 +        break;
   1.791 +    case OPTION_SCRIPTREORDER:
   1.792 +        ucol_tok_parseScriptReorder(src, status);
   1.793 +        break;
   1.794 +    default:
   1.795 +        *status = U_UNSUPPORTED_ERROR;
   1.796 +        break;
   1.797 +        }
   1.798 +    }
   1.799 +    const UChar *closing = u_memchr(src->current, 0x005d, (int32_t)(src->end-src->current));
   1.800 +    if(closing == NULL) { // unterminated option: report it instead of leaving src->current NULL
   1.801 +        *status = U_ILLEGAL_ARGUMENT_ERROR;
   1.802 +    }
   1.803 +    src->current = (closing != NULL) ? closing : src->end;
   1.804 +    return result;
   1.805 +}
   1.806 +
   1.807 +
   1.808 +// Appends len UChars from stuff to the extra area at src->extraCurrent.
   1.809 +// When the area is too small, src->source is reallocated (at least doubled) and every
   1.810 +// parser pointer into it is rebased; stuff may itself point into src->source.
   1.811 +// On allocation failure *status is U_MEMORY_ALLOCATION_ERROR and nothing is appended.
   1.812 +inline void ucol_tok_addToExtraCurrent(UColTokenParser *src, const UChar *stuff, int32_t len, UErrorCode *status) {
   1.813 +    if (stuff == NULL || len <= 0) {
   1.814 +        return;
   1.815 +    }
   1.816 +    UnicodeString tempStuff(FALSE, stuff, len);
   1.817 +    if(src->extraCurrent+len >= src->extraEnd) {
   1.818 +        /* reallocate; doubling alone may not make room for a long append */
   1.819 +        int32_t used = (int32_t)(src->extraCurrent - src->source);
   1.820 +        int32_t newCapacity = (int32_t)(src->extraEnd - src->source) * 2;
   1.821 +        if (newCapacity <= used + len) {
   1.822 +            newCapacity = (used + len) * 2;
   1.823 +        }
   1.824 +        if (stuff >= src->source && stuff < src->extraEnd) {
   1.825 +            // stuff lies in the buffer about to move: writing to tempStuff makes it copy the text first.
   1.826 +            tempStuff.setCharAt(0, tempStuff[0]);
   1.827 +        }
   1.828 +        UChar *newSrc = (UChar *)uprv_realloc(src->source, newCapacity*sizeof(UChar));
   1.829 +        if(newSrc != NULL) {
   1.830 +            src->current = newSrc + (src->current - src->source);
   1.831 +            src->extraCurrent = newSrc + (src->extraCurrent - src->source);
   1.832 +            src->end = newSrc + (src->end - src->source);
   1.833 +            src->extraEnd = newSrc + newCapacity;
   1.834 +            src->sourceCurrent = newSrc + (src->sourceCurrent-src->source);
   1.835 +            src->source = newSrc;
   1.836 +        } else {
   1.837 +            *status = U_MEMORY_ALLOCATION_ERROR;
   1.838 +            return;
   1.839 +        }
   1.840 +    }
   1.841 +    u_memcpy(src->extraCurrent, tempStuff.getBuffer(), len);
   1.842 +    src->extraCurrent += len;
   1.843 +}
   1.844 +
   1.845 +inline UBool ucol_tok_doSetTop(UColTokenParser *src, UErrorCode *status) {
   1.846 +    // Appends the marker for the indirect boundary src->parsedToken.indirectIndex to
   1.847 +    // the extra area: 0xFFFE, then startCE as two UChars (high half first) and, when
   1.848 +    // it is non-zero, startContCE in the same form.  parsedToken.charsOffset and
   1.849 +    // charsLen are set to describe the appended units.  Always returns TRUE.
   1.850 +    UChar marker[5];
   1.851 +    int32_t units = 3;
   1.852 +    src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
   1.853 +    marker[0] = 0xFFFE;
   1.854 +    marker[1] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE >> 16);
   1.855 +    marker[2] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE & 0xFFFF);
   1.856 +    if(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE != 0) {
   1.857 +        marker[3] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE >> 16);
   1.858 +        marker[4] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE & 0xFFFF);
   1.859 +        units = 5;
   1.860 +    }
   1.861 +    src->parsedToken.charsLen = units;
   1.862 +    ucol_tok_addToExtraCurrent(src, marker, units, status);
   1.863 +    return TRUE;
   1.864 +}
   1.865 +
   1.866 +static UBool isCharNewLine(UChar c){
   1.867 +    // Tells whether c is one of the six line terminators listed below;
   1.868 +    // any other code unit yields FALSE.
   1.869 +    const UBool isAsciiBreak =
   1.870 +        (UBool)(c == 0x000A /* LF  */ ||
   1.871 +                c == 0x000C /* FF  */ ||
   1.872 +                c == 0x000D /* CR  */);
   1.873 +    const UBool isOtherBreak =
   1.874 +        (UBool)(c == 0x0085 /* NEL */ ||
   1.875 +                c == 0x2028 /* LS  */ ||
   1.876 +                c == 0x2029 /* PS  */);
   1.877 +    return (UBool)(isAsciiBreak || isOtherBreak);
   1.878 +}
   1.879 +
   1.880 +/*
   1.881 + * Emits one code point of a range as the next parsed token.  It is called once
   1.882 + * per code point: src->currentRangeCp is appended to the extra area as UTF-16,
   1.883 + * described in src->parsedToken (charsOffset/charsLen), and then incremented.
   1.884 + * Inputs: src->currentRangeCp (code point to emit) and src->lastRangeCp (last
   1.885 + * code point of the range).
   1.886 + * Requires src->currentRangeCp <= src->lastRangeCp.  Returns src->current.
   1.887 + */
   1.888 +static const UChar*
   1.889 +ucol_tok_processNextCodePointInRange(UColTokenParser *src,
   1.890 +                                     UErrorCode *status)
   1.891 +{
   1.892 +  UChar units[U16_MAX_LENGTH];
   1.893 +  uint32_t written = 0;
   1.894 +  const uint32_t unitCount = U16_LENGTH(src->currentRangeCp);
   1.895 +
   1.896 +  // The token is the text about to be appended at extraCurrent.
   1.897 +  src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
   1.898 +  src->parsedToken.charsLen = unitCount;
   1.899 +  U16_APPEND_UNSAFE(units, written, src->currentRangeCp);
   1.900 +  ucol_tok_addToExtraCurrent(src, units, unitCount, status);
   1.901 +
   1.902 +  src->currentRangeCp++;
   1.903 +  if (src->currentRangeCp <= src->lastRangeCp) {
   1.904 +    // More of the range is left; remember the code point to continue with.
   1.905 +    src->previousCp = src->currentRangeCp;
   1.906 +    return src->current;
   1.907 +  }
   1.908 +  // Range finished; the starred list is finished too once its index passed the last char.
   1.909 +  src->inRange = FALSE;
   1.910 +  if (src->currentStarredCharIndex > src->lastStarredCharIndex) {
   1.911 +    src->isStarred = FALSE;
   1.912 +  }
   1.913 +  return src->current;
   1.914 +}
   1.915 +
   1.916 +/*
   1.917 + * Emits the next code point of a starred list as the parsed token.  It is called
   1.918 + * once per code point of the list.
   1.919 + * Inputs that must be set by the caller:
   1.920 + *   src->currentStarredCharIndex:  Index (in src->source) of the first char of the current code point.
   1.921 + *   src->lastStarredCharIndex: Index of the last character in the list.
   1.922 + * Requires src->currentStarredCharIndex <= src->lastStarredCharIndex.  Returns src->current.
   1.923 + */
   1.924 +static const UChar*
   1.925 +ucol_tok_processNextTokenInStarredList(UColTokenParser *src)
   1.926 +{
   1.927 +  UChar32 codePoint;
   1.928 +  const int32_t tokenStart = src->currentStarredCharIndex;
   1.929 +  src->parsedToken.charsOffset = src->currentStarredCharIndex;
   1.930 +
   1.931 +  // U16_NEXT reads one code point and moves currentStarredCharIndex past it.
   1.932 +  U16_NEXT(src->source, src->currentStarredCharIndex, (uint32_t)(src->end - src->source), codePoint);
   1.933 +  src->parsedToken.charsLen = src->currentStarredCharIndex - tokenStart;
   1.934 +  src->previousCp = codePoint;
   1.935 +
   1.936 +  // Once the index has passed the last list character, go back to normal parsing.
   1.937 +  if (src->currentStarredCharIndex > src->lastStarredCharIndex) {
   1.938 +    src->isStarred = FALSE;
   1.939 +  }
   1.940 +  return src->current;
   1.941 +}
   1.942 +
   1.943 +/*
   1.944 + * Partially parses the next token, keeps the indices in src->parsedToken, and updates the counters.
   1.945 + * Returns src->current past the token, or NULL when no token was read or the rules are malformed.
   1.946 + * This routine parses and separates almost all tokens. The following syntax characters are recognized:
   1.947 + *  # : Comment character
   1.948 + *  & : Reset operator
   1.949 + *  = : Equality
   1.950 + *  < : Primary collation
   1.951 + *  << : Secondary collation
   1.952 + *  <<< : Tertiary collation
   1.953 + *  ; : Secondary collation
   1.954 + *  , : Tertiary collation
   1.955 + *  / : Expansions
   1.956 + *  | : Prefix
   1.957 + *  - : Range
   1.958 + *  \ : Escapes the next character
   1.959 + *  ! : Java Thai modifier, ignored
   1.960 + *  @ : French collation, when it appears before any strength has been read
   1.961 + *
   1.962 + * [] : Options
   1.963 + * '' : Quotes
   1.964 + *
   1.965 + *  Along with operators =, <, <<, <<<, the operator * is supported to indicate a list.  For example, &a<*bcdexyz
   1.966 + *  is equivalent to &a<b<c<d<e<x<y<z.  In lists, ranges can also be given, so &a<*b-ex-z is equivalent to the above.
   1.967 + *  This function does not separate the tokens in a list.  Instead, &a<*b-ex-z is parsed as four tokens - "&a",
   1.968 + *  "<*b", "-ex", "-z".  The strength (< in this case), whether in a list, whether in a range and the previous
   1.969 + *  character are cached so that the calling program can do further splitting.
   1.970 + */
   1.971 +static const UChar*
   1.972 +ucol_tok_parseNextTokenInternal(UColTokenParser *src,
   1.973 +                                UBool startOfRules,
   1.974 +                                UParseError *parseError,
   1.975 +                                UErrorCode *status)
   1.976 +{
   1.977 +    UBool variableTop = FALSE;
   1.978 +    UBool top = FALSE;
   1.979 +    UBool inChars = TRUE;
   1.980 +    UBool inQuote = FALSE;
   1.981 +    UBool wasInQuote = FALSE;
   1.982 +    uint8_t before = 0;
   1.983 +    UBool isEscaped = FALSE;
   1.984 +
   1.985 +    // TODO: replace these variables with src->parsedToken counterparts
   1.986 +    // no need to use them anymore since we have src->parsedToken.
   1.987 +    // Ideally, token parser would be a nice class... Once, when I have
   1.988 +    // more time (around 2020 probably).
   1.989 +    uint32_t newExtensionLen = 0;
   1.990 +    uint32_t extensionOffset = 0;
   1.991 +    uint32_t newStrength = UCOL_TOK_UNSET;
   1.992 +    UChar buff[10];
   1.993 +
   1.994 +    src->parsedToken.charsOffset = 0;  src->parsedToken.charsLen = 0;
   1.995 +    src->parsedToken.prefixOffset = 0; src->parsedToken.prefixLen = 0;
   1.996 +    src->parsedToken.indirectIndex = 0;
   1.997 +
   1.998 +    while (src->current < src->end) {
   1.999 +        UChar ch = *(src->current);
  1.1000 +
  1.1001 +        if (inQuote) {
  1.1002 +            if (ch == 0x0027/*'\''*/) {
  1.1003 +                inQuote = FALSE;
  1.1004 +            } else {
  1.1005 +                if ((src->parsedToken.charsLen == 0) || inChars) {
  1.1006 +                    if(src->parsedToken.charsLen == 0) {
  1.1007 +                        src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
  1.1008 +                    }
  1.1009 +                    src->parsedToken.charsLen++;
  1.1010 +                } else {
  1.1011 +                    if(newExtensionLen == 0) {
  1.1012 +                        extensionOffset = (uint32_t)(src->extraCurrent - src->source);
  1.1013 +                    }
  1.1014 +                    newExtensionLen++;
  1.1015 +                }
  1.1016 +            }
  1.1017 +        }else if(isEscaped){
  1.1018 +            isEscaped =FALSE;
  1.1019 +            if (newStrength == UCOL_TOK_UNSET) {
  1.1020 +                *status = U_INVALID_FORMAT_ERROR;
  1.1021 +                syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
  1.1022 +                DBG_FORMAT_ERROR
  1.1023 +                return NULL;
  1.1024 +                // enabling rules to start with non-tokens a < b
  1.1025 +                // newStrength = UCOL_TOK_RESET;
  1.1026 +            }
  1.1027 +            if(ch != 0x0000  && src->current != src->end) {
  1.1028 +                if (inChars) {
  1.1029 +                    if(src->parsedToken.charsLen == 0) {
  1.1030 +                        src->parsedToken.charsOffset = (uint32_t)(src->current - src->source);
  1.1031 +                    }
  1.1032 +                    src->parsedToken.charsLen++;
  1.1033 +                } else {
  1.1034 +                    if(newExtensionLen == 0) {
  1.1035 +                        extensionOffset = (uint32_t)(src->current - src->source);
  1.1036 +                    }
  1.1037 +                    newExtensionLen++;
  1.1038 +                }
  1.1039 +            }
  1.1040 +        }else {
  1.1041 +            if(!PatternProps::isWhiteSpace(ch)) {
  1.1042 +                /* Sets the strength for this entry */
  1.1043 +                switch (ch) {
  1.1044 +                case 0x003D/*'='*/ :
  1.1045 +                    if (newStrength != UCOL_TOK_UNSET) {
  1.1046 +                        goto EndOfLoop;
  1.1047 +                    }
  1.1048 +
  1.1049 +                    /* if we start with strength, we'll reset to top */
  1.1050 +                    if(startOfRules == TRUE) {
  1.1051 +                        src->parsedToken.indirectIndex = 5;
  1.1052 +                        top = ucol_tok_doSetTop(src, status);
  1.1053 +                        newStrength = UCOL_TOK_RESET;
  1.1054 +                        goto EndOfLoop;
  1.1055 +                    }
  1.1056 +                    newStrength = UCOL_IDENTICAL;
  1.1057 +                    if(src->current+1 < src->end && *(src->current+1) == 0x002A) {/*'*'*/
  1.1058 +                        src->current++;
  1.1059 +                        src->isStarred = TRUE;
  1.1060 +                    }
  1.1061 +                    break;
  1.1062 +
  1.1063 +                case 0x002C/*','*/:
  1.1064 +                    if (newStrength != UCOL_TOK_UNSET) {
  1.1065 +                        goto EndOfLoop;
  1.1066 +                    }
  1.1067 +
  1.1068 +                    /* if we start with strength, we'll reset to top */
  1.1069 +                    if(startOfRules == TRUE) {
  1.1070 +                        src->parsedToken.indirectIndex = 5;
  1.1071 +                        top = ucol_tok_doSetTop(src, status);
  1.1072 +                        newStrength = UCOL_TOK_RESET;
  1.1073 +                        goto EndOfLoop;
  1.1074 +                    }
  1.1075 +                    newStrength = UCOL_TERTIARY;
  1.1076 +                    break;
  1.1077 +
  1.1078 +                case  0x003B/*';'*/:
  1.1079 +                    if (newStrength != UCOL_TOK_UNSET) {
  1.1080 +                        goto EndOfLoop;
  1.1081 +                    }
  1.1082 +
  1.1083 +                    /* if we start with strength, we'll reset to top */
  1.1084 +                    if(startOfRules == TRUE) {
  1.1085 +                        src->parsedToken.indirectIndex = 5;
  1.1086 +                        top = ucol_tok_doSetTop(src, status);
  1.1087 +                        newStrength = UCOL_TOK_RESET;
  1.1088 +                        goto EndOfLoop;
  1.1089 +                    }
  1.1090 +                    newStrength = UCOL_SECONDARY;
  1.1091 +                    break;
  1.1092 +
  1.1093 +                case 0x003C/*'<'*/:
  1.1094 +                    if (newStrength != UCOL_TOK_UNSET) {
  1.1095 +                        goto EndOfLoop;
  1.1096 +                    }
  1.1097 +
  1.1098 +                    /* if we start with strength, we'll reset to top */
  1.1099 +                    if(startOfRules == TRUE) {
  1.1100 +                        src->parsedToken.indirectIndex = 5;
  1.1101 +                        top = ucol_tok_doSetTop(src, status);
  1.1102 +                        newStrength = UCOL_TOK_RESET;
  1.1103 +                        goto EndOfLoop;
  1.1104 +                    }
  1.1105 +                    /* before this, do a scan to verify whether this is */
  1.1106 +                    /* another strength; the lookahead never reads at or past src->end */
  1.1107 +                    if(src->current+1 < src->end && *(src->current+1) == 0x003C) {
  1.1108 +                        src->current++;
  1.1109 +                        if(src->current+1 < src->end && *(src->current+1) == 0x003C) {
  1.1110 +                            src->current++; /* three in a row! */
  1.1111 +                            newStrength = UCOL_TERTIARY;
  1.1112 +                        } else { /* two in a row */
  1.1113 +                            newStrength = UCOL_SECONDARY;
  1.1114 +                        }
  1.1115 +                    } else { /* just one */
  1.1116 +                        newStrength = UCOL_PRIMARY;
  1.1117 +                    }
  1.1118 +                    if(src->current+1 < src->end && *(src->current+1) == 0x002A) {/*'*'*/
  1.1119 +                        src->current++;
  1.1120 +                        src->isStarred = TRUE;
  1.1121 +                    }
  1.1122 +                    break;
  1.1123 +
  1.1124 +                case 0x0026/*'&'*/:
  1.1125 +                    if (newStrength != UCOL_TOK_UNSET) {
  1.1126 +                        /**/
  1.1127 +                        goto EndOfLoop;
  1.1128 +                    }
  1.1129 +
  1.1130 +                    newStrength = UCOL_TOK_RESET; /* PatternEntry::RESET = 0 */
  1.1131 +                    break;
  1.1132 +
  1.1133 +                case 0x005b/*'['*/:
  1.1134 +                    /* options - read an option, analyze it; the ']' is looked for within [current, end) only */
  1.1135 +                    if(u_memchr(src->current, 0x005d /*']'*/, (int32_t)(src->end-src->current)) != NULL) {
  1.1136 +                        uint8_t result = ucol_uprv_tok_readAndSetOption(src, status);
  1.1137 +                        if(U_SUCCESS(*status)) {
  1.1138 +                            if(result & UCOL_TOK_TOP) {
  1.1139 +                                if(newStrength == UCOL_TOK_RESET) {
  1.1140 +                                    top = ucol_tok_doSetTop(src, status);
  1.1141 +                                    if(before) { // This is a combination of before and indirection like '&[before 2][first regular]<b'
  1.1142 +                                        src->parsedToken.charsLen+=2;
  1.1143 +                                        buff[0] = 0x002d;
  1.1144 +                                        buff[1] = before;
  1.1145 +                                        ucol_tok_addToExtraCurrent(src, buff, 2, status);
  1.1146 +                                    }
  1.1147 +
  1.1148 +                                    src->current++;
  1.1149 +                                    goto EndOfLoop;
  1.1150 +                                } else {
  1.1151 +                                    *status = U_INVALID_FORMAT_ERROR;
  1.1152 +                                    syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
  1.1153 +                                    DBG_FORMAT_ERROR
  1.1154 +                                }
  1.1155 +                            } else if(result & UCOL_TOK_VARIABLE_TOP) {
  1.1156 +                                if(newStrength != UCOL_TOK_RESET && newStrength != UCOL_TOK_UNSET) {
  1.1157 +                                    variableTop = TRUE;
  1.1158 +                                    src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
  1.1159 +                                    src->parsedToken.charsLen = 1;
  1.1160 +                                    buff[0] = 0xFFFF;
  1.1161 +                                    ucol_tok_addToExtraCurrent(src, buff, 1, status);
  1.1162 +                                    src->current++;
  1.1163 +                                    goto EndOfLoop;
  1.1164 +                                } else {
  1.1165 +                                    *status = U_INVALID_FORMAT_ERROR;
  1.1166 +                                    syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
  1.1167 +                                    DBG_FORMAT_ERROR
  1.1168 +                                }
  1.1169 +                            } else if (result & UCOL_TOK_BEFORE){
  1.1170 +                                if(newStrength == UCOL_TOK_RESET) {
  1.1171 +                                    before = result & UCOL_TOK_BEFORE;
  1.1172 +                                } else {
  1.1173 +                                    *status = U_INVALID_FORMAT_ERROR;
  1.1174 +                                    syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
  1.1175 +                                    DBG_FORMAT_ERROR
  1.1176 +                                }
  1.1177 +                            }
  1.1178 +                        } else {
  1.1179 +                            *status = U_INVALID_FORMAT_ERROR;
  1.1180 +                            syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
  1.1181 +                            DBG_FORMAT_ERROR
  1.1182 +                            return NULL;
  1.1183 +                        }
  1.1184 +                    }
  1.1185 +                    break;
  1.1186 +                case 0x0021/*! skip java thai modifier reordering*/:
  1.1187 +                    break;
  1.1188 +                case 0x002F/*'/'*/:
  1.1189 +                    wasInQuote = FALSE; /* if we were copying source characters, we want to stop now */
  1.1190 +                    inChars = FALSE; /* we're now processing expansion */
  1.1191 +                    break;
  1.1192 +                case 0x005C /* back slash for escaped chars */:
  1.1193 +                    isEscaped = TRUE;
  1.1194 +                    break;
  1.1195 +                    /* found a quote, we're gonna start copying */
  1.1196 +                case 0x0027/*'\''*/:
  1.1197 +                    if (newStrength == UCOL_TOK_UNSET) { /* quote is illegal until we have a strength */
  1.1198 +                      *status = U_INVALID_FORMAT_ERROR;
  1.1199 +                      syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
  1.1200 +                      DBG_FORMAT_ERROR
  1.1201 +                      return NULL;
  1.1202 +                      // enabling rules to start with a non-token character a < b
  1.1203 +                      // newStrength = UCOL_TOK_RESET;
  1.1204 +                    }
  1.1205 +
  1.1206 +                    inQuote = TRUE;
  1.1207 +
  1.1208 +                    if(inChars) { /* we're doing characters */
  1.1209 +                        if(wasInQuote == FALSE) {
  1.1210 +                            src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
  1.1211 +                        }
  1.1212 +                        if (src->parsedToken.charsLen != 0) {
  1.1213 +                            ucol_tok_addToExtraCurrent(src, src->current - src->parsedToken.charsLen, src->parsedToken.charsLen, status);
  1.1214 +                        }
  1.1215 +                        src->parsedToken.charsLen++;
  1.1216 +                    } else { /* we're doing an expansion */
  1.1217 +                        if(wasInQuote == FALSE) {
  1.1218 +                            extensionOffset = (uint32_t)(src->extraCurrent - src->source);
  1.1219 +                        }
  1.1220 +                        if (newExtensionLen != 0) {
  1.1221 +                            ucol_tok_addToExtraCurrent(src, src->current - newExtensionLen, newExtensionLen, status);
  1.1222 +                        }
  1.1223 +                        newExtensionLen++;
  1.1224 +                    }
  1.1225 +
  1.1226 +                    wasInQuote = TRUE;
  1.1227 +
  1.1228 +                    ch = *(++(src->current));
  1.1229 +                    if(ch == 0x0027) { /* copy the double quote */
  1.1230 +                        ucol_tok_addToExtraCurrent(src, &ch, 1, status);
  1.1231 +                        inQuote = FALSE;
  1.1232 +                    }
  1.1233 +                    break;
  1.1234 +
  1.1235 +                    /* '@' is french only if the strength is not currently set */
  1.1236 +                    /* if it is, it's just a regular character in collation rules */
  1.1237 +                case 0x0040/*'@'*/:
  1.1238 +                    if (newStrength == UCOL_TOK_UNSET) {
  1.1239 +                        src->opts->frenchCollation = UCOL_ON;
  1.1240 +                        break;
  1.1241 +                    }
  1.1242 +                    /* NOTE(review): '@' after a strength falls through into the '|' case below - confirm this is intended */
  1.1243 +                case 0x007C /*|*/: /* this means we have actually been reading prefix part */
  1.1244 +                    // we want to store read characters to the prefix part and continue reading
  1.1245 +                    // the characters (proper way would be to restart reading the chars, but in
  1.1246 +                    // that case we would have to complicate the token hasher, which I do not
  1.1247 +                    // intend to play with. Instead, we will do prefixes when prefixes are due
  1.1248 +                    // (before adding the elements).
  1.1249 +                    src->parsedToken.prefixOffset = src->parsedToken.charsOffset;
  1.1250 +                    src->parsedToken.prefixLen = src->parsedToken.charsLen;
  1.1251 +
  1.1252 +                    if(inChars) { /* we're doing characters */
  1.1253 +                        if(wasInQuote == FALSE) {
  1.1254 +                            src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
  1.1255 +                        }
  1.1256 +                        if (src->parsedToken.charsLen != 0) {
  1.1257 +                            ucol_tok_addToExtraCurrent(src, src->current - src->parsedToken.charsLen, src->parsedToken.charsLen, status);
  1.1258 +                        }
  1.1259 +                        src->parsedToken.charsLen++;
  1.1260 +                    }
  1.1261 +
  1.1262 +                    wasInQuote = TRUE;
  1.1263 +
  1.1264 +                    do {
  1.1265 +                        ch = *(++(src->current));
  1.1266 +                        // skip whitespace between '|' and the character, stopping at src->end
  1.1267 +                    } while (src->current < src->end && PatternProps::isWhiteSpace(ch));
  1.1268 +                    break;
  1.1269 +
  1.1270 +                    //charsOffset = 0;
  1.1271 +                    //newCharsLen = 0;
  1.1272 +                    //break; // We want to store the whole prefix/character sequence. If we break
  1.1273 +                    // the '|' is going to get lost.
  1.1274 +
  1.1275 +                case 0x002D /*-*/: /* A range. */
  1.1276 +                    if (newStrength != UCOL_TOK_UNSET) {
  1.1277 +                      // While processing the pending token, the isStarred field
  1.1278 +                      // is reset, so it needs to be saved for the next
  1.1279 +                      // invocation.
  1.1280 +                      src->savedIsStarred = src->isStarred;
  1.1281 +                      goto EndOfLoop;
  1.1282 +                   }
  1.1283 +                   src->isStarred = src->savedIsStarred;
  1.1284 +
  1.1285 +                   // Ranges are valid only in starred tokens.
  1.1286 +                   if (!src->isStarred) {
  1.1287 +                     *status = U_INVALID_FORMAT_ERROR;
  1.1288 +                     syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
  1.1289 +                     DBG_FORMAT_ERROR
  1.1290 +                     return NULL;
  1.1291 +                   }
  1.1292 +                   newStrength = src->parsedToken.strength;
  1.1293 +                   src->inRange = TRUE;
  1.1294 +                   break;
  1.1295 +
  1.1296 +                case 0x0023 /*#*/: /* this is a comment, skip everything up to the end of line or of the rules */
  1.1297 +                    while (src->current + 1 < src->end && !isCharNewLine(*(src->current + 1))) {
  1.1298 +                        ++(src->current);
  1.1299 +                    }
  1.1300 +                    ch = 0x000A; /* white space, so the quote-copying code below does not append the '#' */
  1.1301 +                    break;
  1.1302 +                default:
  1.1303 +                    if (newStrength == UCOL_TOK_UNSET) {
  1.1304 +                      *status = U_INVALID_FORMAT_ERROR;
  1.1305 +                      syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
  1.1306 +                      DBG_FORMAT_ERROR
  1.1307 +                      return NULL;
  1.1308 +                    }
  1.1309 +
  1.1310 +                    if (ucol_tok_isSpecialChar(ch) && (inQuote == FALSE)) {
  1.1311 +                        *status = U_INVALID_FORMAT_ERROR;
  1.1312 +                        syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
  1.1313 +                        DBG_FORMAT_ERROR
  1.1314 +                        return NULL;
  1.1315 +                    }
  1.1316 +
  1.1317 +                    if(ch == 0x0000 && src->current+1 == src->end) {
  1.1318 +                        break;
  1.1319 +                    }
  1.1320 +
  1.1321 +                    if (inChars) {
  1.1322 +                        if(src->parsedToken.charsLen == 0) {
  1.1323 +                            src->parsedToken.charsOffset = (uint32_t)(src->current - src->source);
  1.1324 +                        }
  1.1325 +                        src->parsedToken.charsLen++;
  1.1326 +                    } else {
  1.1327 +                        if(newExtensionLen == 0) {
  1.1328 +                            extensionOffset = (uint32_t)(src->current - src->source);
  1.1329 +                        }
  1.1330 +                        newExtensionLen++;
  1.1331 +                    }
  1.1332 +
  1.1333 +                    break;
  1.1334 +                }
  1.1335 +            }
  1.1336 +        }
  1.1337 +
  1.1338 +        if(wasInQuote) {
  1.1339 +            if(ch != 0x27) {
  1.1340 +                if(inQuote || !PatternProps::isWhiteSpace(ch)) {
  1.1341 +                    ucol_tok_addToExtraCurrent(src, &ch, 1, status);
  1.1342 +                }
  1.1343 +            }
  1.1344 +        }
  1.1345 +
  1.1346 +        src->current++;
  1.1347 +    }
  1.1348 +
  1.1349 +EndOfLoop:
  1.1350 +    wasInQuote = FALSE;
  1.1351 +    if (newStrength == UCOL_TOK_UNSET) {
  1.1352 +        return NULL;
  1.1353 +    }
  1.1354 +
  1.1355 +    if (src->parsedToken.charsLen == 0 && top == FALSE) {
  1.1356 +        syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
  1.1357 +        *status = U_INVALID_FORMAT_ERROR;
  1.1358 +        DBG_FORMAT_ERROR
  1.1359 +        return NULL;
  1.1360 +    }
  1.1361 +
  1.1362 +    src->parsedToken.strength = newStrength;
  1.1363 +    src->parsedToken.extensionOffset = extensionOffset;
  1.1364 +    src->parsedToken.extensionLen = newExtensionLen;
  1.1365 +    src->parsedToken.flags = (UCOL_TOK_VARIABLE_TOP * (variableTop?1:0)) | (UCOL_TOK_TOP * (top?1:0)) | before;
  1.1366 +
  1.1367 +    return src->current;
  1.1368 +}
  1.1369 +
  1.1370 +/*
  1.1371 + * Parses the next token, keeps the indices in src->parsedToken, and updates the counters.
  1.1372 + * @see ucol_tok_parseNextTokenInternal() for the description of what operators are supported.
  1.1373 + *
  1.1374 + * In addition to what ucol_tok_parseNextTokenInternal() does, this function does the following:
  1.1375 + *  1) ucol_tok_parseNextTokenInternal() returns a range as a single token.  This function separates
  1.1376 + *     it to separate tokens and returns one by one.  In order to do that, the necessary states are
  1.1377 + *     cached as member variables of the token parser.
  1.1378 + *  2) When encountering a range, ucol_tok_parseNextTokenInternal() processes characters up to the
  1.1379 + *     starting character as a single list token (which is separated into individual characters here)
  1.1380 + *     and as another list token starting with the last character in the range.  Before expanding it
  1.1381 + *     as a list of tokens, this function expands the range by filling the intermediate characters and
  1.1382 + *     returns them one by one as separate tokens.
  1.1383 + * Necessary checks are done for invalid combinations.
  1.1384 + */
  1.1385 +U_CAPI const UChar* U_EXPORT2
  1.1386 +ucol_tok_parseNextToken(UColTokenParser *src,
  1.1387 +                        UBool startOfRules,
  1.1388 +                        UParseError *parseError,
  1.1389 +                        UErrorCode *status)
  1.1390 +{
  1.1391 +  const UChar *nextToken;
  1.1392 +
  1.1393 +  if (src->inRange) {
  1.1394 +    // We are not done processing a range.  Continue it.
  1.1395 +    return ucol_tok_processNextCodePointInRange(src, status);
  1.1396 +  } else if (src->isStarred) {
  1.1397 +    // We are not done processing a starred token.  Continue it.
  1.1398 +    return ucol_tok_processNextTokenInStarredList(src);
  1.1399 +  }
  1.1400 +
  1.1401 +  // Get the next token.
  1.1402 +  nextToken = ucol_tok_parseNextTokenInternal(src, startOfRules, parseError, status);
  1.1403 +
  1.1404 +  if (nextToken == NULL) {
  1.1405 +    return NULL;
  1.1406 +  }
  1.1407 +
  1.1408 +  if (src->inRange) {
  1.1409 +    // A new range has started.
  1.1410 +    // Check whether it is a chain of ranges with more than one hyphen.
  1.1411 +    if (src->lastRangeCp > 0 && src->lastRangeCp == src->previousCp) {
  1.1412 +        *status = U_INVALID_FORMAT_ERROR;
  1.1413 +        syntaxError(src->source,src->parsedToken.charsOffset-1,
  1.1414 +                    src->parsedToken.charsOffset+src->parsedToken.charsLen, parseError);
  1.1415 +        DBG_FORMAT_ERROR
  1.1416 +        return NULL;
  1.1417 +    }
  1.1418 +
  1.1419 +    // The current token indicates the second code point of the range.
  1.1420 +    // Process just that, and then proceed with the star.
  1.1421 +    src->currentStarredCharIndex = src->parsedToken.charsOffset;
  1.1422 +    U16_NEXT(src->source, src->currentStarredCharIndex, 
  1.1423 +             (uint32_t)(src->end - src->source), src->lastRangeCp);
  1.1424 +    if (src->lastRangeCp <= src->previousCp) {
  1.1425 +        *status = U_INVALID_FORMAT_ERROR;
  1.1426 +        syntaxError(src->source,src->parsedToken.charsOffset-1,
  1.1427 +                    src->parsedToken.charsOffset+src->parsedToken.charsLen,parseError);
  1.1428 +        DBG_FORMAT_ERROR
  1.1429 +        return NULL;
  1.1430 +    }
  1.1431 +
  1.1432 +    // Set current range code point to process the range loop
  1.1433 +    src->currentRangeCp = src->previousCp + 1;
  1.1434 +
  1.1435 +    src->lastStarredCharIndex = src->parsedToken.charsOffset + src->parsedToken.charsLen - 1;
  1.1436 +
  1.1437 +    return ucol_tok_processNextCodePointInRange(src, status);
  1.1438 + } else if (src->isStarred) {
  1.1439 +    // We define two indices m_currentStarredCharIndex_ and m_lastStarredCharIndex_ so that
  1.1440 +    // [m_currentStarredCharIndex_ .. m_lastStarredCharIndex_], both inclusive, need to be
  1.1441 +    // separated into several tokens and returned.
  1.1442 +    src->currentStarredCharIndex = src->parsedToken.charsOffset;
  1.1443 +    src->lastStarredCharIndex =  src->parsedToken.charsOffset + src->parsedToken.charsLen - 1;
  1.1444 +
  1.1445 +    return ucol_tok_processNextTokenInStarredList(src);
  1.1446 +  } else {
  1.1447 +    // Set previous codepoint
  1.1448 +    U16_GET(src->source, 0, src->parsedToken.charsOffset, (uint32_t)(src->end - src->source), src->previousCp);
  1.1449 +  }
  1.1450 +  return nextToken;
  1.1451 +}
  1.1452 +
  1.1453 +
  1.1454 +/*
  1.1455 +Processing Description
  1.1456 +1 Build a ListList. Each list has a header, which contains two lists (positive
  1.1457 +and negative), a reset token, a baseCE, nextCE, and previousCE. The lists and
  1.1458 +reset may be null.
  1.1459 +2 As you process, you keep a LAST pointer that points to the last token you
  1.1460 +handled.
  1.1461 +
  1.1462 +*/
  1.1463 +
  1.1464 +static UColToken *ucol_tok_initAReset(UColTokenParser *src, const UChar *expand, uint32_t *expandNext,
  1.1465 +                                      UParseError *parseError, UErrorCode *status)
  1.1466 +{
  1.1467 +    if(src->resultLen == src->listCapacity) {
  1.1468 +        // Unfortunately, this won't work, as we store addresses of lhs in token
  1.1469 +        src->listCapacity *= 2;
  1.1470 +        src->lh = (UColTokListHeader *)uprv_realloc(src->lh, src->listCapacity*sizeof(UColTokListHeader));
  1.1471 +        if(src->lh == NULL) {
  1.1472 +            *status = U_MEMORY_ALLOCATION_ERROR;
  1.1473 +            return NULL;
  1.1474 +        }
  1.1475 +    }
  1.1476 +    /* do the reset thing */
  1.1477 +    UColToken *sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken));
  1.1478 +    /* test for NULL */
  1.1479 +    if (sourceToken == NULL) {
  1.1480 +        *status = U_MEMORY_ALLOCATION_ERROR;
  1.1481 +        return NULL;
  1.1482 +    }
  1.1483 +    sourceToken->rulesToParseHdl = &(src->source);
  1.1484 +    sourceToken->source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;
  1.1485 +    sourceToken->expansion = src->parsedToken.extensionLen << 24 | src->parsedToken.extensionOffset;
  1.1486 +
  1.1487 +    sourceToken->debugSource = *(src->source + src->parsedToken.charsOffset);
  1.1488 +    sourceToken->debugExpansion = *(src->source + src->parsedToken.extensionOffset);
  1.1489 +
  1.1490 +    // keep the flags around so that we know about before
  1.1491 +    sourceToken->flags = src->parsedToken.flags;
  1.1492 +
  1.1493 +    if(src->parsedToken.prefixOffset != 0) {
  1.1494 +        // this is a syntax error
  1.1495 +        *status = U_INVALID_FORMAT_ERROR;
  1.1496 +        syntaxError(src->source,src->parsedToken.charsOffset-1,src->parsedToken.charsOffset+src->parsedToken.charsLen,parseError);
  1.1497 +        DBG_FORMAT_ERROR
  1.1498 +        uprv_free(sourceToken);
  1.1499 +        return 0;
  1.1500 +    } else {
  1.1501 +        sourceToken->prefix = 0;
  1.1502 +    }
  1.1503 +
  1.1504 +    sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should also handle reverse */
  1.1505 +    sourceToken->strength = UCOL_TOK_RESET;
  1.1506 +    sourceToken->next = NULL;
  1.1507 +    sourceToken->previous = NULL;
  1.1508 +    sourceToken->noOfCEs = 0;
  1.1509 +    sourceToken->noOfExpCEs = 0;
  1.1510 +    sourceToken->listHeader = &src->lh[src->resultLen];
  1.1511 +
  1.1512 +    src->lh[src->resultLen].first = NULL;
  1.1513 +    src->lh[src->resultLen].last = NULL;
  1.1514 +    src->lh[src->resultLen].first = NULL;
  1.1515 +    src->lh[src->resultLen].last = NULL;
  1.1516 +
  1.1517 +    src->lh[src->resultLen].reset = sourceToken;
  1.1518 +
  1.1519 +    /*
  1.1520 +    3 Consider each item: relation, source, and expansion: e.g. ...< x / y ...
  1.1521 +    First convert all expansions into normal form. Examples:
  1.1522 +    If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c *
  1.1523 +    d * ... into &x * c/y * d * ...
  1.1524 +    Note: reset values can never have expansions, although they can cause the
  1.1525 +    very next item to have one. They may be contractions, if they are found
  1.1526 +    earlier in the list.
  1.1527 +    */
  1.1528 +    *expandNext = 0;
  1.1529 +    if(expand != NULL) {
  1.1530 +        /* check to see if there is an expansion */
  1.1531 +        if(src->parsedToken.charsLen > 1) {
  1.1532 +            uint32_t resetCharsOffset;
  1.1533 +            resetCharsOffset = (uint32_t)(expand - src->source);
  1.1534 +            sourceToken->source = ((resetCharsOffset - src->parsedToken.charsOffset ) << 24) | src->parsedToken.charsOffset;
  1.1535 +            *expandNext = ((src->parsedToken.charsLen + src->parsedToken.charsOffset - resetCharsOffset)<<24) | (resetCharsOffset);
  1.1536 +        }
  1.1537 +    }
  1.1538 +
  1.1539 +    src->resultLen++;
  1.1540 +
  1.1541 +    uhash_put(src->tailored, sourceToken, sourceToken, status);
  1.1542 +
  1.1543 +    return sourceToken;
  1.1544 +}
  1.1545 +
  1.1546 +static
  1.1547 +inline UColToken *getVirginBefore(UColTokenParser *src, UColToken *sourceToken, uint8_t strength, UParseError *parseError, UErrorCode *status) {
  1.1548 +    if(U_FAILURE(*status)) {
  1.1549 +        return NULL;
  1.1550 +    }
  1.1551 +    /* this is a virgin before - we need to fish the anchor from the UCA */
  1.1552 +    collIterate s;
  1.1553 +    uint32_t baseCE = UCOL_NOT_FOUND, baseContCE = UCOL_NOT_FOUND;
  1.1554 +    uint32_t CE, SecondCE;
  1.1555 +    // uint32_t invPos;
  1.1556 +    if(sourceToken != NULL) {
  1.1557 +        uprv_init_collIterate(src->UCA, src->source+((sourceToken->source)&0xFFFFFF), 1, &s, status);
  1.1558 +    } else {
  1.1559 +        uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset /**charsOffset*/, 1, &s, status);
  1.1560 +    }
  1.1561 +    if(U_FAILURE(*status)) {
  1.1562 +        return NULL;
  1.1563 +    }
  1.1564 +
  1.1565 +    baseCE = ucol_getNextCE(src->UCA, &s, status) & 0xFFFFFF3F;
  1.1566 +    baseContCE = ucol_getNextCE(src->UCA, &s, status);
  1.1567 +    if(baseContCE == UCOL_NO_MORE_CES) {
  1.1568 +        baseContCE = 0;
  1.1569 +    }
  1.1570 +
  1.1571 +
  1.1572 +    UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
  1.1573 +    uint32_t ch = 0;
  1.1574 +    uint32_t expandNext = 0;
  1.1575 +    UColToken key;
  1.1576 +
  1.1577 +    if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
  1.1578 +        uint32_t primary = (baseCE & UCOL_PRIMARYMASK) | ((baseContCE & UCOL_PRIMARYMASK) >> 16);
  1.1579 +        uint32_t raw = uprv_uca_getRawFromImplicit(primary);
  1.1580 +        ch = uprv_uca_getCodePointFromRaw(raw-1);
  1.1581 +        uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw-1);
  1.1582 +        CE = (primaryCE & UCOL_PRIMARYMASK) | 0x0505;
  1.1583 +        SecondCE = ((primaryCE << 16) & UCOL_PRIMARYMASK) | UCOL_CONTINUATION_MARKER;
  1.1584 +
  1.1585 +        src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
  1.1586 +        *src->extraCurrent++ = 0xFFFE;
  1.1587 +        *src->extraCurrent++ = (UChar)ch;
  1.1588 +        src->parsedToken.charsLen++;
  1.1589 +
  1.1590 +        key.source = (src->parsedToken.charsLen/**newCharsLen*/ << 24) | src->parsedToken.charsOffset/**charsOffset*/;
  1.1591 +        key.rulesToParseHdl = &(src->source);
  1.1592 +
  1.1593 +        //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key);
  1.1594 +        sourceToken = (UColToken *)uhash_get(src->tailored, &key);
  1.1595 +
  1.1596 +        if(sourceToken == NULL) {
  1.1597 +            src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F;
  1.1598 +            if(isContinuation(SecondCE)) {
  1.1599 +                src->lh[src->resultLen].baseContCE = SecondCE;
  1.1600 +            } else {
  1.1601 +                src->lh[src->resultLen].baseContCE = 0;
  1.1602 +            }
  1.1603 +            src->lh[src->resultLen].nextCE = 0;
  1.1604 +            src->lh[src->resultLen].nextContCE = 0;
  1.1605 +            src->lh[src->resultLen].previousCE = 0;
  1.1606 +            src->lh[src->resultLen].previousContCE = 0;
  1.1607 +
  1.1608 +            src->lh[src->resultLen].indirect = FALSE;
  1.1609 +
  1.1610 +            sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
  1.1611 +        }
  1.1612 +
  1.1613 +    } else {
  1.1614 +        /* invPos = */ ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &SecondCE, strength);
  1.1615 +
  1.1616 +        // we got the previous CE. Now we need to see if the difference between
  1.1617 +        // the two CEs is really of the requested strength.
  1.1618 +        // if it's a bigger difference (we asked for secondary and got primary), we
  1.1619 +        // need to modify the CE.
  1.1620 +        if(ucol_getCEStrengthDifference(baseCE, baseContCE, CE, SecondCE) < strength) {
  1.1621 +            // adjust the strength
  1.1622 +            // now we are in the situation where our baseCE should actually be modified in
  1.1623 +            // order to get the CE in the right position.
  1.1624 +            if(strength == UCOL_SECONDARY) {
  1.1625 +                CE = baseCE - 0x0200;
  1.1626 +            } else { // strength == UCOL_TERTIARY
  1.1627 +                CE = baseCE - 0x02;
  1.1628 +            }
  1.1629 +            if(baseContCE) {
  1.1630 +                if(strength == UCOL_SECONDARY) {
  1.1631 +                    SecondCE = baseContCE - 0x0200;
  1.1632 +                } else { // strength == UCOL_TERTIARY
  1.1633 +                    SecondCE = baseContCE - 0x02;
  1.1634 +                }
  1.1635 +            }
  1.1636 +        }
  1.1637 +
  1.1638 +#if 0
  1.1639 +        // the code below relies on getting a code point from the inverse table, in order to be
  1.1640 +        // able to merge the situations like &x < 9 &[before 1]a < d. This won't work:
  1.1641 +        // 1. There are many code points that have the same CE
  1.1642 +        // 2. The CE to codepoint table (things pointed to by CETable[3*invPos+2] are broken.
  1.1643 +        // Also, in case when there is no equivalent strength before an element, we have to actually
  1.1644 +        // construct one. For example, &[before 2]a << x won't result in x << a, because the element
  1.1645 +        // before a is a primary difference.
  1.1646 +
  1.1647 +        //uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
  1.1648 +
  1.1649 +
  1.1650 +        ch = CETable[3*invPos+2];
  1.1651 +
  1.1652 +        if((ch &  UCOL_INV_SIZEMASK) != 0) {
  1.1653 +            uint16_t *conts = (uint16_t *)((uint8_t *)src->invUCA+src->invUCA->conts);
  1.1654 +            uint32_t offset = (ch & UCOL_INV_OFFSETMASK);
  1.1655 +            ch = conts[offset];
  1.1656 +        }
  1.1657 +
  1.1658 +        *src->extraCurrent++ = (UChar)ch;
  1.1659 +        src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source - 1);
  1.1660 +        src->parsedToken.charsLen = 1;
  1.1661 +
  1.1662 +        // We got an UCA before. However, this might have been tailored.
  1.1663 +        // example:
  1.1664 +        // &\u30ca = \u306a
  1.1665 +        // &[before 3]\u306a<<<\u306a|\u309d
  1.1666 +
  1.1667 +
  1.1668 +        // uint32_t key = (*newCharsLen << 24) | *charsOffset;
  1.1669 +        key.source = (src->parsedToken.charsLen/**newCharsLen*/ << 24) | src->parsedToken.charsOffset/**charsOffset*/;
  1.1670 +        key.rulesToParseHdl = &(src->source);
  1.1671 +
  1.1672 +        //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key);
  1.1673 +        sourceToken = (UColToken *)uhash_get(src->tailored, &key);
  1.1674 +#endif
  1.1675 +
  1.1676 +        // here is how it should be. The situation such as &[before 1]a < x, should be
  1.1677 +        // resolved exactly as if we wrote &a > x.
  1.1678 +        // therefore, I don't really care if the UCA value before a has been changed.
  1.1679 +        // However, I do care if the strength between my element and the previous element
  1.1680 +        // is bigger then I wanted. So, if CE < baseCE and I wanted &[before 2], then i'll
  1.1681 +        // have to construct the base CE.
  1.1682 +
  1.1683 +
  1.1684 +
  1.1685 +        // if we found a tailored thing, we have to use the UCA value and construct
  1.1686 +        // a new reset token with constructed name
  1.1687 +        //if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) {
  1.1688 +        // character to which we want to anchor is already tailored.
  1.1689 +        // We need to construct a new token which will be the anchor
  1.1690 +        // point
  1.1691 +        //*(src->extraCurrent-1) = 0xFFFE;
  1.1692 +        //*src->extraCurrent++ = (UChar)ch;
  1.1693 +        // grab before
  1.1694 +        src->parsedToken.charsOffset -= 10;
  1.1695 +        src->parsedToken.charsLen += 10;
  1.1696 +        src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F;
  1.1697 +        if(isContinuation(SecondCE)) {
  1.1698 +            src->lh[src->resultLen].baseContCE = SecondCE;
  1.1699 +        } else {
  1.1700 +            src->lh[src->resultLen].baseContCE = 0;
  1.1701 +        }
  1.1702 +        src->lh[src->resultLen].nextCE = 0;
  1.1703 +        src->lh[src->resultLen].nextContCE = 0;
  1.1704 +        src->lh[src->resultLen].previousCE = 0;
  1.1705 +        src->lh[src->resultLen].previousContCE = 0;
  1.1706 +
  1.1707 +        src->lh[src->resultLen].indirect = FALSE;
  1.1708 +
  1.1709 +        sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
  1.1710 +        //}
  1.1711 +    }
  1.1712 +
  1.1713 +    return sourceToken;
  1.1714 +
  1.1715 +}
  1.1716 +
  1.1717 +uint32_t ucol_tok_assembleTokenList(UColTokenParser *src, UParseError *parseError, UErrorCode *status) {
  1.1718 +    UColToken *lastToken = NULL;
  1.1719 +    const UChar *parseEnd = NULL;
  1.1720 +    uint32_t expandNext = 0;
  1.1721 +    UBool variableTop = FALSE;
  1.1722 +    UBool top = FALSE;
  1.1723 +    uint16_t specs = 0;
  1.1724 +    UColTokListHeader *ListList = NULL;
  1.1725 +
  1.1726 +    src->parsedToken.strength = UCOL_TOK_UNSET;
  1.1727 +
  1.1728 +    ListList = src->lh;
  1.1729 +
  1.1730 +    if(U_FAILURE(*status)) {
  1.1731 +        return 0;
  1.1732 +    }
  1.1733 +#ifdef DEBUG_FOR_CODE_POINTS
  1.1734 +    char filename[35];
  1.1735 +    sprintf(filename, "/tmp/debug_for_cp_%09d.txt", getpid());
  1.1736 +    dfcp_fp = fopen(filename, "a");
  1.1737 +    fprintf(stdout, "Output is in the file %s.\n", filename);
  1.1738 +#endif
  1.1739 +
  1.1740 +#ifdef DEBUG_FOR_COLL_RULES
  1.1741 +    std::string s3;
  1.1742 +    UnicodeString(src->source).toUTF8String(s3);
  1.1743 +    std::cout << "src->source = " << s3 << std::endl;
  1.1744 +#endif
  1.1745 +
  1.1746 +    while(src->current < src->end || src->isStarred) {
  1.1747 +        src->parsedToken.prefixOffset = 0;
  1.1748 +
  1.1749 +        parseEnd = ucol_tok_parseNextToken(src,
  1.1750 +            (UBool)(lastToken == NULL),
  1.1751 +            parseError,
  1.1752 +            status);
  1.1753 +
  1.1754 +        specs = src->parsedToken.flags;
  1.1755 +
  1.1756 +
  1.1757 +        variableTop = ((specs & UCOL_TOK_VARIABLE_TOP) != 0);
  1.1758 +        top = ((specs & UCOL_TOK_TOP) != 0);
  1.1759 +
  1.1760 +        if(U_SUCCESS(*status) && parseEnd != NULL) {
  1.1761 +            UColToken *sourceToken = NULL;
  1.1762 +            //uint32_t key = 0;
  1.1763 +            uint32_t lastStrength = UCOL_TOK_UNSET;
  1.1764 +
  1.1765 +            if(lastToken != NULL ) {
  1.1766 +                lastStrength = lastToken->strength;
  1.1767 +            }
  1.1768 +
  1.1769 +#ifdef DEBUG_FOR_CODE_POINTS
  1.1770 +            UChar32 cp;
  1.1771 +            U16_GET(src->source, 0, src->parsedToken.charsOffset, (uint32_t)(src->extraEnd - src->source), cp);
  1.1772 +            fprintf(dfcp_fp, "Code point = %x, Strength = %x\n", cp, src->parsedToken.strength);
  1.1773 +#endif
  1.1774 +            //key = newCharsLen << 24 | charsOffset;
  1.1775 +            UColToken key;
  1.1776 +            key.source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;
  1.1777 +            key.rulesToParseHdl = &(src->source);
  1.1778 +
  1.1779 +            /*  4 Lookup each source in the CharsToToken map, and find a sourceToken */
  1.1780 +            sourceToken = (UColToken *)uhash_get(src->tailored, &key);
  1.1781 +
  1.1782 +            if(src->parsedToken.strength != UCOL_TOK_RESET) {
  1.1783 +                if(lastToken == NULL) { /* this means that rules haven't started properly */
  1.1784 +                    *status = U_INVALID_FORMAT_ERROR;
  1.1785 +                    syntaxError(src->source,0,(int32_t)(src->end-src->source),parseError);
  1.1786 +                    DBG_FORMAT_ERROR
  1.1787 +                    return 0;
  1.1788 +                }
  1.1789 +                /*  6 Otherwise (when relation != reset) */
  1.1790 +                if(sourceToken == NULL) {
  1.1791 +                    /* If sourceToken is null, create new one, */
  1.1792 +                    sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken));
  1.1793 +                    /* test for NULL */
  1.1794 +                    if (sourceToken == NULL) {
  1.1795 +                        *status = U_MEMORY_ALLOCATION_ERROR;
  1.1796 +                        return 0;
  1.1797 +                    }
  1.1798 +                    sourceToken->rulesToParseHdl = &(src->source);
  1.1799 +                    sourceToken->source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;
  1.1800 +
  1.1801 +                    sourceToken->debugSource = *(src->source + src->parsedToken.charsOffset);
  1.1802 +
  1.1803 +                    sourceToken->prefix = src->parsedToken.prefixLen << 24 | src->parsedToken.prefixOffset;
  1.1804 +                    sourceToken->debugPrefix = *(src->source + src->parsedToken.prefixOffset);
  1.1805 +
  1.1806 +                    sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should also handle reverse */
  1.1807 +                    sourceToken->next = NULL;
  1.1808 +                    sourceToken->previous = NULL;
  1.1809 +                    sourceToken->noOfCEs = 0;
  1.1810 +                    sourceToken->noOfExpCEs = 0;
  1.1811 +                    // keep the flags around so that we know about before
  1.1812 +                    sourceToken->flags = src->parsedToken.flags;
  1.1813 +                    uhash_put(src->tailored, sourceToken, sourceToken, status);
  1.1814 +                    if(U_FAILURE(*status)) {
  1.1815 +                        return 0;
  1.1816 +                    }
  1.1817 +                } else {
  1.1818 +                    /* we could have fished out a reset here */
  1.1819 +                    if(sourceToken->strength != UCOL_TOK_RESET && lastToken != sourceToken) {
  1.1820 +                        /* otherwise remove sourceToken from where it was. */
  1.1821 +                        if(sourceToken->next != NULL) {
  1.1822 +                            if(sourceToken->next->strength > sourceToken->strength) {
  1.1823 +                                sourceToken->next->strength = sourceToken->strength;
  1.1824 +                            }
  1.1825 +                            sourceToken->next->previous = sourceToken->previous;
  1.1826 +                        } else {
  1.1827 +                            sourceToken->listHeader->last = sourceToken->previous;
  1.1828 +                        }
  1.1829 +
  1.1830 +                        if(sourceToken->previous != NULL) {
  1.1831 +                            sourceToken->previous->next = sourceToken->next;
  1.1832 +                        } else {
  1.1833 +                            sourceToken->listHeader->first = sourceToken->next;
  1.1834 +                        }
  1.1835 +                        sourceToken->next = NULL;
  1.1836 +                        sourceToken->previous = NULL;
  1.1837 +                    }
  1.1838 +                }
  1.1839 +
  1.1840 +                sourceToken->strength = src->parsedToken.strength;
  1.1841 +                sourceToken->listHeader = lastToken->listHeader;
  1.1842 +
  1.1843 +                /*
  1.1844 +                1.  Find the strongest strength in each list, and set strongestP and strongestN
  1.1845 +                accordingly in the headers.
  1.1846 +                */
  1.1847 +                if(lastStrength == UCOL_TOK_RESET
  1.1848 +                    || sourceToken->listHeader->first == 0) {
  1.1849 +                        /* If LAST is a reset
  1.1850 +                        insert sourceToken in the list. */
  1.1851 +                        if(sourceToken->listHeader->first == 0) {
  1.1852 +                            sourceToken->listHeader->first = sourceToken;
  1.1853 +                            sourceToken->listHeader->last = sourceToken;
  1.1854 +                        } else { /* we need to find a place for us */
  1.1855 +                            /* and we'll get in front of the same strength */
  1.1856 +                            if(sourceToken->listHeader->first->strength <= sourceToken->strength) {
  1.1857 +                                sourceToken->next = sourceToken->listHeader->first;
  1.1858 +                                sourceToken->next->previous = sourceToken;
  1.1859 +                                sourceToken->listHeader->first = sourceToken;
  1.1860 +                                sourceToken->previous = NULL;
  1.1861 +                            } else {
  1.1862 +                                lastToken = sourceToken->listHeader->first;
  1.1863 +                                while(lastToken->next != NULL && lastToken->next->strength > sourceToken->strength) {
  1.1864 +                                    lastToken = lastToken->next;
  1.1865 +                                }
  1.1866 +                                if(lastToken->next != NULL) {
  1.1867 +                                    lastToken->next->previous = sourceToken;
  1.1868 +                                } else {
  1.1869 +                                    sourceToken->listHeader->last = sourceToken;
  1.1870 +                                }
  1.1871 +                                sourceToken->previous = lastToken;
  1.1872 +                                sourceToken->next = lastToken->next;
  1.1873 +                                lastToken->next = sourceToken;
  1.1874 +                            }
  1.1875 +                        }
  1.1876 +                    } else {
  1.1877 +                        /* Otherwise (when LAST is not a reset)
  1.1878 +                        if polarity (LAST) == polarity(relation), insert sourceToken after LAST,
  1.1879 +                        otherwise insert before.
  1.1880 +                        when inserting after or before, search to the next position with the same
  1.1881 +                        strength in that direction. (This is called postpone insertion).         */
  1.1882 +                        if(sourceToken != lastToken) {
  1.1883 +                            if(lastToken->polarity == sourceToken->polarity) {
  1.1884 +                                while(lastToken->next != NULL && lastToken->next->strength > sourceToken->strength) {
  1.1885 +                                    lastToken = lastToken->next;
  1.1886 +                                }
  1.1887 +                                sourceToken->previous = lastToken;
  1.1888 +                                if(lastToken->next != NULL) {
  1.1889 +                                    lastToken->next->previous = sourceToken;
  1.1890 +                                } else {
  1.1891 +                                    sourceToken->listHeader->last = sourceToken;
  1.1892 +                                }
  1.1893 +
  1.1894 +                                sourceToken->next = lastToken->next;
  1.1895 +                                lastToken->next = sourceToken;
  1.1896 +                            } else {
  1.1897 +                                while(lastToken->previous != NULL && lastToken->previous->strength > sourceToken->strength) {
  1.1898 +                                    lastToken = lastToken->previous;
  1.1899 +                                }
  1.1900 +                                sourceToken->next = lastToken;
  1.1901 +                                if(lastToken->previous != NULL) {
  1.1902 +                                    lastToken->previous->next = sourceToken;
  1.1903 +                                } else {
  1.1904 +                                    sourceToken->listHeader->first = sourceToken;
  1.1905 +                                }
  1.1906 +                                sourceToken->previous = lastToken->previous;
  1.1907 +                                lastToken->previous = sourceToken;
  1.1908 +                            }
  1.1909 +                        } else { /* repeated one thing twice in rules, stay with the stronger strength */
  1.1910 +                            if(lastStrength < sourceToken->strength) {
  1.1911 +                                sourceToken->strength = lastStrength;
  1.1912 +                            }
  1.1913 +                        }
  1.1914 +                    }
  1.1915 +
  1.1916 +                    /* if the token was a variable top, we're gonna put it in */
  1.1917 +                    if(variableTop == TRUE && src->varTop == NULL) {
  1.1918 +                        variableTop = FALSE;
  1.1919 +                        src->varTop = sourceToken;
  1.1920 +                    }
  1.1921 +
  1.1922 +                    // Treat the expansions.
  1.1923 +                    // There are two types of expansions: explicit (x / y) and reset based propagating expansions
  1.1924 +                    // (&abc * d * e <=> &ab * d / c * e / c)
  1.1925 +                    // if both of them are in effect for a token, they are combined.
  1.1926 +
  1.1927 +                    sourceToken->expansion = src->parsedToken.extensionLen << 24 | src->parsedToken.extensionOffset;
  1.1928 +
  1.1929 +                    if(expandNext != 0) {
  1.1930 +                        if(sourceToken->strength == UCOL_PRIMARY) { /* primary strength kills off the implicit expansion */
  1.1931 +                            expandNext = 0;
  1.1932 +                        } else if(sourceToken->expansion == 0) { /* if there is no expansion, implicit is just added to the token */
  1.1933 +                            sourceToken->expansion = expandNext;
  1.1934 +                        } else { /* there is both explicit and implicit expansion. We need to make a combination */
  1.1935 +                            uprv_memcpy(src->extraCurrent, src->source + (expandNext & 0xFFFFFF), (expandNext >> 24)*sizeof(UChar));
  1.1936 +                            uprv_memcpy(src->extraCurrent+(expandNext >> 24), src->source + src->parsedToken.extensionOffset, src->parsedToken.extensionLen*sizeof(UChar));
  1.1937 +                            sourceToken->expansion = (uint32_t)(((expandNext >> 24) + src->parsedToken.extensionLen)<<24 | (uint32_t)(src->extraCurrent - src->source));
  1.1938 +                            src->extraCurrent += (expandNext >> 24) + src->parsedToken.extensionLen;
  1.1939 +                        }
  1.1940 +                    }
  1.1941 +
  1.1942 +                    // This is just for debugging purposes
  1.1943 +                    if(sourceToken->expansion != 0) {
  1.1944 +                        sourceToken->debugExpansion = *(src->source + src->parsedToken.extensionOffset);
  1.1945 +                    } else {
  1.1946 +                        sourceToken->debugExpansion = 0;
  1.1947 +                    }
  1.1948 +                    // if the previous token was a reset before, the strength of this
  1.1949 +                    // token must match the strength of before. Otherwise we have an
  1.1950 +                    // undefined situation.
  1.1951 +                    // In other words, we currently have a cludge which we use to
  1.1952 +                    // represent &a >> x. This is written as &[before 2]a << x.
  1.1953 +                    if((lastToken->flags & UCOL_TOK_BEFORE) != 0) {
  1.1954 +                        uint8_t beforeStrength = (lastToken->flags & UCOL_TOK_BEFORE) - 1;
  1.1955 +                        if(beforeStrength != sourceToken->strength) {
  1.1956 +                            *status = U_INVALID_FORMAT_ERROR;
  1.1957 +                            syntaxError(src->source,0,(int32_t)(src->end-src->source),parseError);
  1.1958 +                            DBG_FORMAT_ERROR
  1.1959 +                            return 0;
  1.1960 +                        }
  1.1961 +                    }
  1.1962 +            } else {
  1.1963 +                if(lastToken != NULL && lastStrength == UCOL_TOK_RESET) {
  1.1964 +                    /* if the previous token was also a reset, */
  1.1965 +                    /*this means that we have two consecutive resets */
  1.1966 +                    /* and we want to remove the previous one if empty*/
  1.1967 +                    if(src->resultLen > 0 && ListList[src->resultLen-1].first == NULL) {
  1.1968 +                        src->resultLen--;
  1.1969 +                    }
  1.1970 +                }
  1.1971 +
  1.1972 +                if(sourceToken == NULL) { /* this is a reset, but it might still be somewhere in the tailoring, in shorter form */
  1.1973 +                    uint32_t searchCharsLen = src->parsedToken.charsLen;
  1.1974 +                    while(searchCharsLen > 1 && sourceToken == NULL) {
  1.1975 +                        searchCharsLen--;
  1.1976 +                        //key = searchCharsLen << 24 | charsOffset;
  1.1977 +                        UColToken key;
  1.1978 +                        key.source = searchCharsLen << 24 | src->parsedToken.charsOffset;
  1.1979 +                        key.rulesToParseHdl = &(src->source);
  1.1980 +                        sourceToken = (UColToken *)uhash_get(src->tailored, &key);
  1.1981 +                    }
  1.1982 +                    if(sourceToken != NULL) {
  1.1983 +                        expandNext = (src->parsedToken.charsLen - searchCharsLen) << 24 | (src->parsedToken.charsOffset + searchCharsLen);
  1.1984 +                    }
  1.1985 +                }
  1.1986 +
  1.1987 +                if((specs & UCOL_TOK_BEFORE) != 0) { /* we're doing before */
  1.1988 +                    if(top == FALSE) { /* there is no indirection */
  1.1989 +                        uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1;
  1.1990 +                        if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) {
  1.1991 +                            /* this is a before that is already ordered in the UCA - so we need to get the previous with good strength */
  1.1992 +                            while(sourceToken->strength > strength && sourceToken->previous != NULL) {
  1.1993 +                                sourceToken = sourceToken->previous;
  1.1994 +                            }
  1.1995 +                            /* here, either we hit the strength or NULL */
  1.1996 +                            if(sourceToken->strength == strength) {
  1.1997 +                                if(sourceToken->previous != NULL) {
  1.1998 +                                    sourceToken = sourceToken->previous;
  1.1999 +                                } else { /* start of list */
  1.2000 +                                    sourceToken = sourceToken->listHeader->reset;
  1.2001 +                                }
  1.2002 +                            } else { /* we hit NULL */
  1.2003 +                                /* we should be doing the else part */
  1.2004 +                                sourceToken = sourceToken->listHeader->reset;
  1.2005 +                                sourceToken = getVirginBefore(src, sourceToken, strength, parseError, status);
  1.2006 +                            }
  1.2007 +                        } else {
  1.2008 +                            sourceToken = getVirginBefore(src, sourceToken, strength, parseError, status);
  1.2009 +                        }
  1.2010 +                    } else { /* this is both before and indirection */
  1.2011 +                        top = FALSE;
  1.2012 +                        ListList[src->resultLen].previousCE = 0;
  1.2013 +                        ListList[src->resultLen].previousContCE = 0;
  1.2014 +                        ListList[src->resultLen].indirect = TRUE;
  1.2015 +                        /* we need to do slightly more work. we need to get the baseCE using the */
  1.2016 +                        /* inverse UCA & getPrevious. The next bound is not set, and will be decided */
  1.2017 +                        /* in ucol_bld */
  1.2018 +                        uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1;
  1.2019 +                        uint32_t baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE;
  1.2020 +                        uint32_t baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE;//&0xFFFFFF3F;
  1.2021 +                        uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND;
  1.2022 +
  1.2023 +                        UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
  1.2024 +                        if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && 
  1.2025 +                           (baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
  1.2026 +                            uint32_t primary = (baseCE & UCOL_PRIMARYMASK) | ((baseContCE & UCOL_PRIMARYMASK) >> 16);
  1.2027 +                            uint32_t raw = uprv_uca_getRawFromImplicit(primary);
  1.2028 +                            uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw-1);
  1.2029 +                            CE = (primaryCE & UCOL_PRIMARYMASK) | 0x0505;
  1.2030 +                            SecondCE = ((primaryCE << 16) & UCOL_PRIMARYMASK) | UCOL_CONTINUATION_MARKER;
  1.2031 +                        } else {
  1.2032 +                            /*int32_t invPos = ucol_inv_getPrevCE(baseCE, baseContCE, &CE, &SecondCE, strength);*/
  1.2033 +                            ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &SecondCE, strength);
  1.2034 +                        }
  1.2035 +
  1.2036 +                        ListList[src->resultLen].baseCE = CE;
  1.2037 +                        ListList[src->resultLen].baseContCE = SecondCE;
  1.2038 +                        ListList[src->resultLen].nextCE = 0;
  1.2039 +                        ListList[src->resultLen].nextContCE = 0;
  1.2040 +
  1.2041 +                        sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
  1.2042 +                    }
  1.2043 +                }
  1.2044 +
  1.2045 +
  1.2046 +                /*  5 If the relation is a reset:
  1.2047 +                If sourceToken is null
  1.2048 +                Create new list, create new sourceToken, make the baseCE from source, put
  1.2049 +                the sourceToken in ListHeader of the new list */
  1.2050 +                if(sourceToken == NULL) {
  1.2051 +                    /*
  1.2052 +                    3 Consider each item: relation, source, and expansion: e.g. ...< x / y ...
  1.2053 +                    First convert all expansions into normal form. Examples:
  1.2054 +                    If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c *
  1.2055 +                    d * ... into &x * c/y * d * ...
  1.2056 +                    Note: reset values can never have expansions, although they can cause the
  1.2057 +                    very next item to have one. They may be contractions, if they are found
  1.2058 +                    earlier in the list.
  1.2059 +                    */
  1.2060 +                    if(top == FALSE) {
  1.2061 +                        collIterate s;
  1.2062 +                        uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND;
  1.2063 +
  1.2064 +                        uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset, src->parsedToken.charsLen, &s, status);
  1.2065 +
  1.2066 +                        CE = ucol_getNextCE(src->UCA, &s, status);
  1.2067 +                        const UChar *expand = s.pos;
  1.2068 +                        SecondCE = ucol_getNextCE(src->UCA, &s, status);
  1.2069 +
  1.2070 +                        ListList[src->resultLen].baseCE = CE & 0xFFFFFF3F;
  1.2071 +                        if(isContinuation(SecondCE)) {
  1.2072 +                            ListList[src->resultLen].baseContCE = SecondCE;
  1.2073 +                        } else {
  1.2074 +                            ListList[src->resultLen].baseContCE = 0;
  1.2075 +                        }
  1.2076 +                        ListList[src->resultLen].nextCE = 0;
  1.2077 +                        ListList[src->resultLen].nextContCE = 0;
  1.2078 +                        ListList[src->resultLen].previousCE = 0;
  1.2079 +                        ListList[src->resultLen].previousContCE = 0;
  1.2080 +                        ListList[src->resultLen].indirect = FALSE;
  1.2081 +                        sourceToken = ucol_tok_initAReset(src, expand, &expandNext, parseError, status);
  1.2082 +                    } else { /* top == TRUE */
  1.2083 +                        /* just use the supplied values */
  1.2084 +                        top = FALSE;
  1.2085 +                        ListList[src->resultLen].previousCE = 0;
  1.2086 +                        ListList[src->resultLen].previousContCE = 0;
  1.2087 +                        ListList[src->resultLen].indirect = TRUE;
  1.2088 +                        ListList[src->resultLen].baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE;
  1.2089 +                        ListList[src->resultLen].baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE;
  1.2090 +                        ListList[src->resultLen].nextCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitCE;
  1.2091 +                        ListList[src->resultLen].nextContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitContCE;
  1.2092 +
  1.2093 +                        sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
  1.2094 +
  1.2095 +                    }
  1.2096 +                } else { /* reset to something already in rules */
  1.2097 +                    top = FALSE;
  1.2098 +                }
  1.2099 +            }
  1.2100 +            /*  7 After all this, set LAST to point to sourceToken, and goto step 3. */
  1.2101 +            lastToken = sourceToken;
  1.2102 +        } else {
  1.2103 +            if(U_FAILURE(*status)) {
  1.2104 +                return 0;
  1.2105 +            }
  1.2106 +        }
  1.2107 +    }
  1.2108 +#ifdef DEBUG_FOR_CODE_POINTS
  1.2109 +    fclose(dfcp_fp);
  1.2110 +#endif
  1.2111 +
  1.2112 +
  1.2113 +    if(src->resultLen > 0 && ListList[src->resultLen-1].first == NULL) {
  1.2114 +        src->resultLen--;
  1.2115 +    }
  1.2116 +    return src->resultLen;
  1.2117 +}
  1.2118 +
  1.2119 +const UChar* ucol_tok_getRulesFromBundle(
  1.2120 +    void* /*context*/,
  1.2121 +    const char* locale,
  1.2122 +    const char* type,
  1.2123 +    int32_t* pLength,
  1.2124 +    UErrorCode* status)
  1.2125 +{
  1.2126 +    // Opens the U_ICUDATA_COLL bundle for locale and returns the "Sequence"
  1.2127 +    // string found under collations/<type>, storing its length in *pLength.
  1.2128 +    // If any step leaves a failure in *status, NULL is returned and *pLength is 0.
  1.2129 +    const UChar* sequence = NULL;
  1.2130 +    *pLength = 0;
  1.2131 +
  1.2132 +    UResourceBundle* localeBundle = ures_open(U_ICUDATA_COLL, locale, status);
  1.2133 +    if(U_FAILURE(*status)) {
  1.2134 +        ures_close(localeBundle);
  1.2135 +        return NULL;
  1.2136 +    }
  1.2137 +    UResourceBundle* allCollations = ures_getByKey(localeBundle, "collations", NULL, status);
  1.2138 +    if(U_SUCCESS(*status)) {
  1.2139 +        UResourceBundle* wanted = ures_getByKey(allCollations, type, NULL, status);
  1.2140 +        if(U_SUCCESS(*status)) {
  1.2141 +            sequence = ures_getStringByKey(wanted, "Sequence", pLength, status);
  1.2142 +            if(U_FAILURE(*status)) {
  1.2143 +                sequence = NULL;
  1.2144 +                *pLength = 0;
  1.2145 +            }
  1.2146 +            ures_close(wanted);
  1.2147 +        }
  1.2148 +        ures_close(allCollations);
  1.2149 +    }
  1.2150 +    ures_close(localeBundle);
  1.2151 +
  1.2152 +    return sequence;
  1.2153 +}
  1.2154 +
  1.2155 +void ucol_tok_initTokenList(
  1.2156 +    UColTokenParser *src,
  1.2157 +    const UChar *rules,
  1.2158 +    uint32_t rulesLength,
  1.2159 +    const UCollator *UCA,
  1.2160 +    GetCollationRulesFunction importFunc,
  1.2161 +    void* context, 
  1.2162 +    UErrorCode *status) {
  1.2163 +    // Prepares *src for parsing the tailoring held in rules (rulesLength UChars):
  1.2164 +    //  - zeroes *src, then scans the rules for OPTION_OPTIMIZE,
  1.2165 +    //    OPTION_SUPPRESS_CONTRACTIONS and OPTION_IMPORT options; the first two are
  1.2166 +    //    accumulated in src->copySet and src->removeSet, an import option is replaced
  1.2167 +    //    by the rules that importFunc(context, locale, type, ...) returns;
  1.2168 +    //  - NFD-normalizes the resulting rules into the newly allocated src->source;
  1.2169 +    //  - allocates src->tailored, src->opts (a copy of UCA->options) and src->lh and
  1.2170 +    //    calls setIndirectBoundaries for indices 0..14 with the UCA constants.
  1.2171 +    // Does nothing if *status is already a failure. On error *status is set and the
  1.2172 +    // function returns early, leaving the members of src allocated so far in place.
  1.2173 +    U_NAMESPACE_USE
  1.2174 +
  1.2175 +    uint32_t nSize = 0;
  1.2176 +    uint32_t estimatedSize = (2*rulesLength+UCOL_TOK_EXTRA_RULE_SPACE_SIZE);
  1.2177 +
  1.2178 +    // true once rules points at a buffer allocated here rather than the caller's
  1.2179 +    bool needToDeallocRules = false;
  1.2180 +
  1.2181 +    if(U_FAILURE(*status)) {
  1.2182 +        return;
  1.2183 +    }
  1.2184 +
  1.2185 +    // set everything to zero, so that we can clean up gracefully
  1.2186 +    uprv_memset(src, 0, sizeof(UColTokenParser));
  1.2187 +
  1.2188 +    // first we need to find options that don't like to be normalized,
  1.2189 +    // like copy and remove...
  1.2190 +    // Any error breaks out of the loop; the check after it releases rules if needed.
  1.2191 +    int32_t optionNumber = -1;
  1.2192 +    const UChar *setStart = NULL;
  1.2193 +    uint32_t i = 0;
  1.2194 +    while(i < rulesLength) {
  1.2195 +        if(rules[i] == 0x005B) {    // '[': start of an option
  1.2196 +            /* Gets the following:
  1.2197 +               optionNumber: The index of the option.
  1.2198 +               setStart: The pointer at which the option arguments start.
  1.2199 +             */
  1.2200 +            optionNumber = ucol_uprv_tok_readOption(rules+i+1, rules+rulesLength, &setStart);
  1.2201 +
  1.2202 +            if(optionNumber == OPTION_OPTIMIZE) { /* copy - parts of UCA to tailoring */
  1.2203 +                // [optimize]
  1.2204 +                USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rules+rulesLength, status);
  1.2205 +                if(U_SUCCESS(*status)) {
  1.2206 +                    if(src->copySet == NULL) {
  1.2207 +                        src->copySet = newSet;
  1.2208 +                    } else {
  1.2209 +                        uset_addAll(src->copySet, newSet);
  1.2210 +                        uset_close(newSet);
  1.2211 +                    }
  1.2212 +                } else {
  1.2213 +                    break;
  1.2214 +                }
  1.2215 +            } else if(optionNumber == OPTION_SUPPRESS_CONTRACTIONS) {
  1.2216 +                USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rules+rulesLength, status);
  1.2217 +                if(U_SUCCESS(*status)) {
  1.2218 +                    if(src->removeSet == NULL) {
  1.2219 +                        src->removeSet = newSet;
  1.2220 +                    } else {
  1.2221 +                        uset_addAll(src->removeSet, newSet);
  1.2222 +                        uset_close(newSet);
  1.2223 +                    }
  1.2224 +                } else {
  1.2225 +                    break;
  1.2226 +                }
  1.2227 +            } else if(optionNumber == OPTION_IMPORT){
  1.2228 +                // [import <collation-name>]
  1.2229 +
  1.2230 +                // Find the closing ]. The search is bounded by the end of the rules because
  1.2231 +                // they are not guaranteed to be NUL-terminated (a spliced copy never is).
  1.2232 +                UChar* import_end = (setStart == NULL) ? NULL :
  1.2233 +                    u_memchr(setStart, 0x005D, (int32_t)(rules+rulesLength-setStart));
  1.2234 +                if(import_end == NULL) {
  1.2235 +                    *status = U_ILLEGAL_ARGUMENT_ERROR;
  1.2236 +                    break;
  1.2237 +                }
  1.2238 +                int32_t optionEndOffset = (int32_t)(import_end + 1 - rules);
  1.2239 +                // Ignore trailing whitespace, without backing up past the argument start.
  1.2240 +                while(import_end > setStart && PatternProps::isWhiteSpace(*(import_end-1))) {
  1.2241 +                    --import_end;
  1.2242 +                }
  1.2243 +
  1.2244 +                int32_t optionLength = (int32_t)(import_end - setStart);
  1.2245 +                char option[50];
  1.2246 +                if(optionLength >= (int32_t)sizeof(option)) {
  1.2247 +                    *status = U_ILLEGAL_ARGUMENT_ERROR;
  1.2248 +                    break;
  1.2249 +                }
  1.2250 +                u_UCharsToChars(setStart, option, optionLength);
  1.2251 +                option[optionLength] = 0;
  1.2252 +
  1.2253 +                *status = U_ZERO_ERROR;
  1.2254 +                char locale[50];
  1.2255 +                int32_t templ;
  1.2256 +                uloc_forLanguageTag(option, locale, (int32_t)sizeof(locale), &templ, status);
  1.2257 +                if(U_FAILURE(*status)) {
  1.2258 +                    *status = U_ILLEGAL_ARGUMENT_ERROR;
  1.2259 +                    break;
  1.2260 +                }
  1.2261 +
  1.2262 +                char type[50];
  1.2263 +                if (uloc_getKeywordValue(locale, "collation", type, (int32_t)sizeof(type), status) <= 0 ||
  1.2264 +                    U_FAILURE(*status)
  1.2265 +                ) {
  1.2266 +                    *status = U_ZERO_ERROR;
  1.2267 +                    uprv_strcpy(type, "standard");
  1.2268 +                }
  1.2269 +
  1.2270 +                // TODO: Use public functions when available, see ticket #8134.
  1.2271 +                char *keywords = (char *)locale_getKeywordsStart(locale);
  1.2272 +                if(keywords != NULL) {
  1.2273 +                    *keywords = 0;
  1.2274 +                }
  1.2275 +
  1.2276 +                int32_t importRulesLength = 0;
  1.2277 +                const UChar* importRules = importFunc(context, locale, type, &importRulesLength, status);
  1.2278 +                if(U_FAILURE(*status)) {
  1.2279 +                    // Report the failed import instead of splicing in nothing; a later
  1.2280 +                    // [import] would otherwise reset *status and hide the error.
  1.2281 +                    break;
  1.2282 +                }
  1.2283 +
  1.2284 +#ifdef DEBUG_FOR_COLL_RULES
  1.2285 +                std::string s;
  1.2286 +                UnicodeString(importRules).toUTF8String(s);
  1.2287 +                std::cout << "Import rules = " << s << std::endl;
  1.2288 +#endif
  1.2289 +
  1.2290 +                // Add the length of the imported rules to length of the original rules,
  1.2291 +                // and subtract the length of the import option.
  1.2292 +                uint32_t newRulesLength = rulesLength + importRulesLength - (optionEndOffset - i);
  1.2293 +
  1.2294 +                UChar* newRules = (UChar*)uprv_malloc(newRulesLength*sizeof(UChar));
  1.2295 +                if(newRules == NULL) {
  1.2296 +                    *status = U_MEMORY_ALLOCATION_ERROR;
  1.2297 +                    break;
  1.2298 +                }
  1.2299 +
  1.2300 +#ifdef DEBUG_FOR_COLL_RULES
  1.2301 +                std::string s1;
  1.2302 +                UnicodeString(rules).toUTF8String(s1);
  1.2303 +                std::cout << "Original rules = " << s1 << std::endl;
  1.2304 +#endif
  1.2305 +
  1.2306 +                // Copy the section of the original rules leading up to the import
  1.2307 +                uprv_memcpy(newRules, rules, i*sizeof(UChar));
  1.2308 +                // Copy the imported rules
  1.2309 +                uprv_memcpy(newRules+i, importRules, importRulesLength*sizeof(UChar));
  1.2310 +                // Copy the rest of the original rules (minus the import option itself)
  1.2311 +                uprv_memcpy(newRules+i+importRulesLength,
  1.2312 +                            rules+optionEndOffset,
  1.2313 +                            (rulesLength-optionEndOffset)*sizeof(UChar));
  1.2314 +
  1.2315 +#ifdef DEBUG_FOR_COLL_RULES
  1.2316 +                std::string s2;
  1.2317 +                UnicodeString(newRules).toUTF8String(s2);
  1.2318 +                std::cout << "Resulting rules = " << s2 << std::endl;
  1.2319 +#endif
  1.2320 +
  1.2321 +                if(needToDeallocRules){
  1.2322 +                    // if needToDeallocRules is set, then we allocated rules, so it's safe to cast and free
  1.2323 +                    uprv_free((void*)rules);
  1.2324 +                }
  1.2325 +                needToDeallocRules = true;
  1.2326 +                rules = newRules;
  1.2327 +                rulesLength = newRulesLength;
  1.2328 +
  1.2329 +                estimatedSize += importRulesLength*2;
  1.2330 +
  1.2331 +                // First character of the new rules needs to be processed
  1.2332 +                i--;
  1.2333 +            }
  1.2334 +        }
  1.2335 +        i++;
  1.2336 +    }
  1.2337 +
  1.2338 +    if(U_FAILURE(*status)) {
  1.2339 +        // An option could not be processed: release the spliced copy of the rules, if any.
  1.2340 +        if(needToDeallocRules) {
  1.2341 +            uprv_free((void*)rules);
  1.2342 +        }
  1.2343 +        return;
  1.2344 +    }
  1.2345 +
  1.2346 +    // Normalize into src->source. outOfMemory defers the error return until the
  1.2347 +    // spliced copy of the rules has been released below.
  1.2348 +    bool outOfMemory = false;
  1.2349 +    src->source = (UChar *)uprv_malloc(estimatedSize*sizeof(UChar));
  1.2350 +    if (src->source == NULL) {
  1.2351 +        outOfMemory = true;
  1.2352 +    } else {
  1.2353 +        uprv_memset(src->source, 0, estimatedSize*sizeof(UChar));
  1.2354 +        nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, estimatedSize, status);
  1.2355 +        if(nSize > estimatedSize || *status == U_BUFFER_OVERFLOW_ERROR) {
  1.2356 +            *status = U_ZERO_ERROR;
  1.2357 +            // Track the real capacity: src->extraEnd below is derived from estimatedSize.
  1.2358 +            estimatedSize = nSize+UCOL_TOK_EXTRA_RULE_SPACE_SIZE;
  1.2359 +            // Grow through a temporary so that src->source keeps owning the old
  1.2360 +            // buffer if the reallocation fails.
  1.2361 +            UChar *grown = (UChar *)uprv_realloc(src->source, estimatedSize*sizeof(UChar));
  1.2362 +            if (grown == NULL) {
  1.2363 +                outOfMemory = true;
  1.2364 +            } else {
  1.2365 +                src->source = grown;
  1.2366 +                nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, estimatedSize, status);
  1.2367 +            }
  1.2368 +        }
  1.2369 +    }
  1.2370 +    if(needToDeallocRules){
  1.2371 +        // if needToDeallocRules is set, then we allocated rules, so it's safe to cast and free
  1.2372 +        uprv_free((void*)rules);
  1.2373 +    }
  1.2374 +    if(outOfMemory) {
  1.2375 +        *status = U_MEMORY_ALLOCATION_ERROR;
  1.2376 +        return;
  1.2377 +    }
  1.2378 +
  1.2379 +    src->current = src->source;
  1.2380 +    src->end = src->source+nSize;
  1.2381 +    src->sourceCurrent = src->source;
  1.2382 +    src->extraCurrent = src->end+1; // Preserve terminating zero in the rule string so that option scanning works correctly
  1.2383 +    src->extraEnd = src->source+estimatedSize; // estimatedSize is the capacity of src->source, also after a regrow
  1.2384 +    src->varTop = NULL;
  1.2385 +    src->UCA = UCA;
  1.2386 +    src->invUCA = ucol_initInverseUCA(status);
  1.2387 +    src->parsedToken.charsLen = 0;
  1.2388 +    src->parsedToken.charsOffset = 0;
  1.2389 +    src->parsedToken.extensionLen = 0;
  1.2390 +    src->parsedToken.extensionOffset = 0;
  1.2391 +    src->parsedToken.prefixLen = 0;
  1.2392 +    src->parsedToken.prefixOffset = 0;
  1.2393 +    src->parsedToken.flags = 0;
  1.2394 +    src->parsedToken.strength = UCOL_TOK_UNSET;
  1.2395 +    src->buildCCTabFlag = FALSE;
  1.2396 +    src->isStarred = FALSE;
  1.2397 +    src->inRange = FALSE;
  1.2398 +    src->lastRangeCp = 0;
  1.2399 +    src->previousCp = 0;
  1.2400 +
  1.2401 +    if(U_FAILURE(*status)) {
  1.2402 +        return;
  1.2403 +    }
  1.2404 +    src->tailored = uhash_open(uhash_hashTokens, uhash_compareTokens, NULL, status);
  1.2405 +    if(U_FAILURE(*status)) {
  1.2406 +        return;
  1.2407 +    }
  1.2408 +    uhash_setValueDeleter(src->tailored, uprv_free);
  1.2409 +
  1.2410 +    src->opts = (UColOptionSet *)uprv_malloc(sizeof(UColOptionSet));
  1.2411 +    if (src->opts == NULL) {
  1.2412 +        *status = U_MEMORY_ALLOCATION_ERROR;
  1.2413 +        return;
  1.2414 +    }
  1.2415 +
  1.2416 +    uprv_memcpy(src->opts, UCA->options, sizeof(UColOptionSet));
  1.2417 +
  1.2418 +    src->listCapacity = 1024;
  1.2419 +    src->lh = (UColTokListHeader *)uprv_malloc(src->listCapacity*sizeof(UColTokListHeader));
  1.2420 +    if (src->lh == NULL) {
  1.2421 +        *status = U_MEMORY_ALLOCATION_ERROR;
  1.2422 +        return;
  1.2423 +    }
  1.2424 +    uprv_memset(src->lh, 0, src->listCapacity*sizeof(UColTokListHeader));
  1.2425 +    src->resultLen = 0;
  1.2426 +
  1.2427 +    UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
  1.2428 +
  1.2429 +    // UCOL_RESET_TOP_VALUE
  1.2430 +    setIndirectBoundaries(0, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IMPLICIT);
  1.2431 +    // UCOL_FIRST_PRIMARY_IGNORABLE
  1.2432 +    setIndirectBoundaries(1, consts->UCA_FIRST_PRIMARY_IGNORABLE, 0);
  1.2433 +    // UCOL_LAST_PRIMARY_IGNORABLE
  1.2434 +    setIndirectBoundaries(2, consts->UCA_LAST_PRIMARY_IGNORABLE, 0);
  1.2435 +    // UCOL_FIRST_SECONDARY_IGNORABLE
  1.2436 +    setIndirectBoundaries(3, consts->UCA_FIRST_SECONDARY_IGNORABLE, 0);
  1.2437 +    // UCOL_LAST_SECONDARY_IGNORABLE
  1.2438 +    setIndirectBoundaries(4, consts->UCA_LAST_SECONDARY_IGNORABLE, 0);
  1.2439 +    // UCOL_FIRST_TERTIARY_IGNORABLE
  1.2440 +    setIndirectBoundaries(5, consts->UCA_FIRST_TERTIARY_IGNORABLE, 0);
  1.2441 +    // UCOL_LAST_TERTIARY_IGNORABLE
  1.2442 +    setIndirectBoundaries(6, consts->UCA_LAST_TERTIARY_IGNORABLE, 0);
  1.2443 +    // UCOL_FIRST_VARIABLE
  1.2444 +    setIndirectBoundaries(7, consts->UCA_FIRST_VARIABLE, 0);
  1.2445 +    // UCOL_LAST_VARIABLE
  1.2446 +    setIndirectBoundaries(8, consts->UCA_LAST_VARIABLE, 0);
  1.2447 +    // UCOL_FIRST_NON_VARIABLE
  1.2448 +    setIndirectBoundaries(9, consts->UCA_FIRST_NON_VARIABLE, 0);
  1.2449 +    // UCOL_LAST_NON_VARIABLE
  1.2450 +    setIndirectBoundaries(10, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IMPLICIT);
  1.2451 +    // UCOL_FIRST_IMPLICIT
  1.2452 +    setIndirectBoundaries(11, consts->UCA_FIRST_IMPLICIT, 0);
  1.2453 +    // UCOL_LAST_IMPLICIT
  1.2454 +    setIndirectBoundaries(12, consts->UCA_LAST_IMPLICIT, consts->UCA_FIRST_TRAILING);
  1.2455 +    // UCOL_FIRST_TRAILING
  1.2456 +    setIndirectBoundaries(13, consts->UCA_FIRST_TRAILING, 0);
  1.2457 +    // UCOL_LAST_TRAILING
  1.2458 +    setIndirectBoundaries(14, consts->UCA_LAST_TRAILING, 0);
  1.2459 +    ucolIndirectBoundaries[14].limitCE = (consts->UCA_PRIMARY_SPECIAL_MIN<<24);
  1.2460 +}
  1.2423 +
  1.2424 +
  1.2425 +void ucol_tok_closeTokenList(UColTokenParser *src) {
  1.2426 +    // Releases the members of src that are still set. src itself is not freed
  1.2427 +    // and none of its pointers are reset.
  1.2428 +
  1.2429 +    // The two sets and the token hash have dedicated close functions.
  1.2430 +    if(src->copySet != NULL)   { uset_close(src->copySet); }
  1.2431 +    if(src->removeSet != NULL) { uset_close(src->removeSet); }
  1.2432 +    if(src->tailored != NULL)  { uhash_close(src->tailored); }
  1.2433 +
  1.2434 +    // The remaining members go to uprv_free, in the order listed.
  1.2435 +    void *heapBlocks[] = {
  1.2436 +        src->lh,
  1.2437 +        src->source,
  1.2438 +        src->opts,
  1.2439 +        src->reorderCodes
  1.2440 +    };
  1.2441 +    const int32_t blockCount = (int32_t)(sizeof(heapBlocks)/sizeof(heapBlocks[0]));
  1.2442 +    for(int32_t k = 0; k < blockCount; ++k) {
  1.2443 +        if(heapBlocks[k] != NULL) {
  1.2444 +            uprv_free(heapBlocks[k]);
  1.2445 +        }
  1.2446 +    }
  1.2447 +}
  1.2448 +
  1.2449 +#endif /* #if !UCONFIG_NO_COLLATION */
The Tor Browser / file diff

diff: intl/icu/source/i18n/ucol_tok.cpp

intl/icu/source/i18n/ucol_tok.cpp