intl/icu/source/i18n/ucol_tok.cpp

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

     1 /*
     2 *******************************************************************************
     3 *
     4 *   Copyright (C) 2001-2012, International Business Machines
     5 *   Corporation and others.  All Rights Reserved.
     6 *
     7 *******************************************************************************
     8 *   file name:  ucol_tok.cpp
     9 *   encoding:   US-ASCII
    10 *   tab size:   8 (not used)
    11 *   indentation:4
    12 *
    13 *   created 02/22/2001
    14 *   created by: Vladimir Weinstein
    15 *
    16 * This module reads a tailoring rule string and produces a list of
    17 * tokens that will be turned into collation elements
    18 *
    19 */
    21 #include "unicode/utypes.h"
    23 #if !UCONFIG_NO_COLLATION
    25 #include "unicode/uscript.h"
    26 #include "unicode/ustring.h"
    27 #include "unicode/uchar.h"
    28 #include "unicode/uniset.h"
    30 #include "cmemory.h"
    31 #include "cstring.h"
    32 #include "patternprops.h"
    33 #include "ucol_bld.h"
    34 #include "ucol_tok.h"
    35 #include "ulocimp.h"
    36 #include "uresimp.h"
    38 // Define this only for debugging.
    39 // #define DEBUG_FOR_COLL_RULES 1
    41 #ifdef DEBUG_FOR_COLL_RULES
    42 #include <iostream>
    43 #endif
    45 U_NAMESPACE_USE
    47 U_CDECL_BEGIN
    48 static int32_t U_CALLCONV
    49 uhash_hashTokens(const UHashTok k)
    50 {
    51     int32_t hash = 0;
    52     //uint32_t key = (uint32_t)k.integer;
    53     UColToken *key = (UColToken *)k.pointer;
    54     if (key != 0) {
    55         int32_t len = (key->source & 0xFF000000)>>24;
    56         int32_t inc = ((len - 32) / 32) + 1;
    58         const UChar *p = (key->source & 0x00FFFFFF) + *(key->rulesToParseHdl);
    59         const UChar *limit = p + len;
    61         while (p<limit) {
    62             hash = (hash * 37) + *p;
    63             p += inc;
    64         }
    65     }
    66     return hash;
    67 }
    69 static UBool U_CALLCONV
    70 uhash_compareTokens(const UHashTok key1, const UHashTok key2)
    71 {
    72     //uint32_t p1 = (uint32_t) key1.integer;
    73     //uint32_t p2 = (uint32_t) key2.integer;
    74     UColToken *p1 = (UColToken *)key1.pointer;
    75     UColToken *p2 = (UColToken *)key2.pointer;
    76     const UChar *s1 = (p1->source & 0x00FFFFFF) + *(p1->rulesToParseHdl);
    77     const UChar *s2 = (p2->source & 0x00FFFFFF) + *(p2->rulesToParseHdl);
    78     uint32_t s1L = ((p1->source & 0xFF000000) >> 24);
    79     uint32_t s2L = ((p2->source & 0xFF000000) >> 24);
    80     const UChar *end = s1+s1L-1;
    82     if (p1 == p2) {
    83         return TRUE;
    84     }
    85     if (p1->source == 0 || p2->source == 0) {
    86         return FALSE;
    87     }
    88     if(s1L != s2L) {
    89         return FALSE;
    90     }
    91     if(p1->source == p2->source) {
    92         return TRUE;
    93     }
    94     while((s1 < end) && *s1 == *s2) {
    95         ++s1;
    96         ++s2;
    97     }
    98     if(*s1 == *s2) {
    99         return TRUE;
   100     } else {
   101         return FALSE;
   102     }
   103 }
   104 U_CDECL_END
   106 /*
   107  * Debug messages used to pinpoint where a format error occurred.
   108  * A better way is to include context-sensitive information in syntaxError() function.
   109  *
   110  * To turn this debugging on, either uncomment the following line or pass -DDEBUG_FOR_FORMAT_ERROR
   111  * on the compile line.
   112  */
   113 /* #define DEBUG_FOR_FORMAT_ERROR 1 */
   115 #ifdef DEBUG_FOR_FORMAT_ERROR
   116 #define DBG_FORMAT_ERROR { printf("U_INVALID_FORMAT_ERROR at line %d", __LINE__);}
   117 #else
   118 #define DBG_FORMAT_ERROR
   119 #endif
   122 /*
   123  * Controls debug messages so that the output can be compared before and after a
   124  * big change.  Prints the information of every code point that comes out of the
   125  * collation parser and its strength into a file.  When a big change in format
   126  * happens, the files before and after the change should be identical.
   127  *
   128  * To turn this debugging on, either uncomment the following line or pass -DDEBUG_FOR_CODE_POINTS
   129  * on the compile line.
   130  */
   131 // #define DEBUG_FOR_CODE_POINTS 1
   133 #ifdef DEBUG_FOR_CODE_POINTS
   134     FILE* dfcp_fp = NULL;
   135 #endif
   138 typedef struct { /* one entry of the indirect-positioning table filled by setIndirectBoundaries() */
   139     uint32_t startCE;     /* start[0] as passed to setIndirectBoundaries() */
   140     uint32_t startContCE; /* start[1] as passed to setIndirectBoundaries() */
   141     uint32_t limitCE;     /* end[0], or 0 when no end was supplied */
   142     uint32_t limitContCE; /* end[1], or 0 when no end was supplied */
   143 } indirectBoundaries;
   145 /* these values are used for finding CE values for indirect positioning. */
   146 /* Indirect positioning is a mechanism for allowing resets on symbolic   */
   147 /* values. It only works for resets and you cannot tailor indirect names */
   148 /* An indirect name can define either an anchor point or a range. An     */
   149 /* anchor point behaves in exactly the same way as a code point in reset */
   150 /* would, except that it cannot be tailored. A range (currently only the */
   151 /* [top] range is known) will explicitly set the upper bound for         */
   152 /* generated CEs, thus allowing for better control over how many CEs can */
   153 /* be squeezed between in the range without performance penalty.         */
   154 /* In that respect, we use [top] for tailoring of locales that use CJK   */
   155 /* characters. Other indirect values are currently a pure convenience,   */
   156 /* they can be used to assure that the CEs will be always positioned in  */
   157 /* the same place relative to a point with known properties (e.g. first  */
   158 /* primary ignorable). */
   159 static indirectBoundaries ucolIndirectBoundaries[15]; /* slot 0 is used for [top]; ucol_uprv_tok_readAndSetOption() maps [first X]/[last X] to slots 1..14 */
   160 /*
   161 static indirectBoundaries ucolIndirectBoundaries[11] = {
   162 { UCOL_RESET_TOP_VALUE,               0,
   163 UCOL_NEXT_TOP_VALUE,                0 },
   164 { UCOL_FIRST_PRIMARY_IGNORABLE,       0,
   165 0,                                  0 },
   166 { UCOL_LAST_PRIMARY_IGNORABLE,        UCOL_LAST_PRIMARY_IGNORABLE_CONT,
   167 0,                                  0 },
   168 { UCOL_FIRST_SECONDARY_IGNORABLE,     0,
   169 0,                                  0 },
   170 { UCOL_LAST_SECONDARY_IGNORABLE,      0,
   171 0,                                  0 },
   172 { UCOL_FIRST_TERTIARY_IGNORABLE,      0,
   173 0,                                  0 },
   174 { UCOL_LAST_TERTIARY_IGNORABLE,       0,
   175 0,                                  0 },
   176 { UCOL_FIRST_VARIABLE,                0,
   177 0,                                  0 },
   178 { UCOL_LAST_VARIABLE,                 0,
   179 0,                                  0 },
   180 { UCOL_FIRST_NON_VARIABLE,            0,
   181 0,                                  0 },
   182 { UCOL_LAST_NON_VARIABLE,             0,
   183 0,                                  0 },
   184 };
   185 */
   187 static void setIndirectBoundaries(uint32_t indexR, uint32_t *start, uint32_t *end) {
   189     // Set values for the top - TODO: once we have values for all the indirects, we are going
   190     // to initalize here.
   191     ucolIndirectBoundaries[indexR].startCE = start[0];
   192     ucolIndirectBoundaries[indexR].startContCE = start[1];
   193     if(end) {
   194         ucolIndirectBoundaries[indexR].limitCE = end[0];
   195         ucolIndirectBoundaries[indexR].limitContCE = end[1];
   196     } else {
   197         ucolIndirectBoundaries[indexR].limitCE = 0;
   198         ucolIndirectBoundaries[indexR].limitContCE = 0;
   199     }
   200 }
   203 static inline
   204 void syntaxError(const UChar* rules,
   205                  int32_t pos,
   206                  int32_t rulesLen,
   207                  UParseError* parseError)
   208 {
   209     parseError->offset = pos;
   210     parseError->line = 0 ; /* we are not using line numbers */
   212     // for pre-context
   213     int32_t start = (pos < U_PARSE_CONTEXT_LEN)? 0 : (pos - (U_PARSE_CONTEXT_LEN-1));
   214     int32_t stop  = pos;
   216     u_memcpy(parseError->preContext,rules+start,stop-start);
   217     //null terminate the buffer
   218     parseError->preContext[stop-start] = 0;
   220     //for post-context
   221     start = pos+1;
   222     stop  = ((pos+U_PARSE_CONTEXT_LEN)<= rulesLen )? (pos+(U_PARSE_CONTEXT_LEN-1)) :
   223     rulesLen;
   225     if(start < stop) {
   226         u_memcpy(parseError->postContext,rules+start,stop-start);
   227         //null terminate the buffer
   228         parseError->postContext[stop-start]= 0;
   229     } else {
   230         parseError->postContext[0] = 0;
   231     }
   232 }
   234 static
   235 void ucol_uprv_tok_setOptionInImage(UColOptionSet *opts, UColAttribute attrib, UColAttributeValue value) {
   236     switch(attrib) {
   237     case UCOL_HIRAGANA_QUATERNARY_MODE:
   238         opts->hiraganaQ = value;
   239         break;
   240     case UCOL_FRENCH_COLLATION:
   241         opts->frenchCollation = value;
   242         break;
   243     case UCOL_ALTERNATE_HANDLING:
   244         opts->alternateHandling = value;
   245         break;
   246     case UCOL_CASE_FIRST:
   247         opts->caseFirst = value;
   248         break;
   249     case UCOL_CASE_LEVEL:
   250         opts->caseLevel = value;
   251         break;
   252     case UCOL_NORMALIZATION_MODE:
   253         opts->normalizationMode = value;
   254         break;
   255     case UCOL_STRENGTH:
   256         opts->strength = value;
   257         break;
   258     case UCOL_NUMERIC_COLLATION:
   259         opts->numericCollation = value;
   260         break;
   261     case UCOL_ATTRIBUTE_COUNT:
   262     default:
   263         break;
   264     }
   265 }
   267 #define UTOK_OPTION_COUNT 22 /* number of entries in rulesOptions[] */
   269 static UBool didInit = FALSE; /* set to TRUE once ucol_uprv_tok_initData() has initialized the strings below */
   270 /* we can be strict, or we can be lenient */
   271 /* I'd surely be lenient with the option arguments */
   272 /* maybe even with options */
   273 U_STRING_DECL(suboption_00, "non-ignorable", 13); /* suboption_NN: argument keywords referenced by the *Sub tables */
   274 U_STRING_DECL(suboption_01, "shifted",        7);
   276 U_STRING_DECL(suboption_02, "lower",          5);
   277 U_STRING_DECL(suboption_03, "upper",          5);
   278 U_STRING_DECL(suboption_04, "off",            3);
   279 U_STRING_DECL(suboption_05, "on",             2);
   280 U_STRING_DECL(suboption_06, "1",              1);
   281 U_STRING_DECL(suboption_07, "2",              1);
   282 U_STRING_DECL(suboption_08, "3",              1);
   283 U_STRING_DECL(suboption_09, "4",              1);
   284 U_STRING_DECL(suboption_10, "I",              1);
   286 U_STRING_DECL(suboption_11, "primary",        7);
   287 U_STRING_DECL(suboption_12, "secondary",      9);
   288 U_STRING_DECL(suboption_13, "tertiary",       8);
   289 U_STRING_DECL(suboption_14, "variable",       8);
   290 U_STRING_DECL(suboption_15, "regular",        7);
   291 U_STRING_DECL(suboption_16, "implicit",       8);
   292 U_STRING_DECL(suboption_17, "trailing",       8);
   295 U_STRING_DECL(option_00,    "undefined",      9); /* option_NN: option names referenced by rulesOptions[] */
   296 U_STRING_DECL(option_01,    "rearrange",      9);
   297 U_STRING_DECL(option_02,    "alternate",      9);
   298 U_STRING_DECL(option_03,    "backwards",      9);
   299 U_STRING_DECL(option_04,    "variable top",  12);
   300 U_STRING_DECL(option_05,    "top",            3);
   301 U_STRING_DECL(option_06,    "normalization", 13);
   302 U_STRING_DECL(option_07,    "caseLevel",      9);
   303 U_STRING_DECL(option_08,    "caseFirst",      9);
   304 U_STRING_DECL(option_09,    "scriptOrder",   11);
   305 U_STRING_DECL(option_10,    "charsetname",   11);
   306 U_STRING_DECL(option_11,    "charset",        7);
   307 U_STRING_DECL(option_12,    "before",         6);
   308 U_STRING_DECL(option_13,    "hiraganaQ",      9);
   309 U_STRING_DECL(option_14,    "strength",       8);
   310 U_STRING_DECL(option_15,    "first",          5);
   311 U_STRING_DECL(option_16,    "last",           4);
   312 U_STRING_DECL(option_17,    "optimize",       8);
   313 U_STRING_DECL(option_18,    "suppressContractions",         20);
   314 U_STRING_DECL(option_19,    "numericOrdering",              15);
   315 U_STRING_DECL(option_20,    "import",         6);
   316 U_STRING_DECL(option_21,    "reorder",         7);
   318 /*
   319 [last variable] last variable value
   320 [last primary ignorable] largest CE for primary ignorable
   321 [last secondary ignorable] largest CE for secondary ignorable
   322 [last tertiary ignorable] largest CE for tertiary ignorable
   323 [top] guaranteed to be above all implicit CEs, for now and in the future (in 1.8)
   324 */
   327 static const ucolTokSuboption alternateSub[2] = { /* arguments of "alternate"; each entry is {keyword, keyword length, value} */
   328     {suboption_00, 13, UCOL_NON_IGNORABLE}, /* "non-ignorable" */
   329     {suboption_01,  7, UCOL_SHIFTED} /* "shifted" */
   330 };
   332 static const ucolTokSuboption caseFirstSub[3] = { /* arguments of "caseFirst" */
   333     {suboption_02, 5, UCOL_LOWER_FIRST}, /* "lower" */
   334     {suboption_03,  5, UCOL_UPPER_FIRST}, /* "upper" */
   335     {suboption_04,  3, UCOL_OFF}, /* "off" */
   336 };
   338 static const ucolTokSuboption onOffSub[2] = { /* shared by the on/off options in rulesOptions[] */
   339     {suboption_04, 3, UCOL_OFF}, /* "off" */
   340     {suboption_05, 2, UCOL_ON} /* "on" */
   341 };
   343 static const ucolTokSuboption frenchSub[1] = { /* "backwards" accepts only "2" */
   344     {suboption_07, 1, UCOL_ON} /* "2" */
   345 };
   347 static const ucolTokSuboption beforeSub[3] = { /* arguments of "before" */
   348     {suboption_06, 1, UCOL_PRIMARY}, /* "1" */
   349     {suboption_07, 1, UCOL_SECONDARY}, /* "2" */
   350     {suboption_08, 1, UCOL_TERTIARY} /* "3" */
   351 };
   353 static const ucolTokSuboption strengthSub[5] = { /* arguments of "strength" */
   354     {suboption_06, 1, UCOL_PRIMARY}, /* "1" */
   355     {suboption_07, 1, UCOL_SECONDARY}, /* "2" */
   356     {suboption_08, 1, UCOL_TERTIARY}, /* "3" */
   357     {suboption_09, 1, UCOL_QUATERNARY}, /* "4" */
   358     {suboption_10, 1, UCOL_IDENTICAL}, /* "I" */
   359 };
   361 static const ucolTokSuboption firstLastSub[7] = { /* arguments of "first"/"last"; ucol_uprv_tok_readAndSetOption() uses the entry's position, not its value */
   362     {suboption_11, 7, UCOL_PRIMARY}, /* "primary" */
   363     {suboption_12, 9, UCOL_PRIMARY}, /* "secondary" */
   364     {suboption_13, 8, UCOL_PRIMARY}, /* "tertiary" */
   365     {suboption_14, 8, UCOL_PRIMARY}, /* "variable" */
   366     {suboption_15, 7, UCOL_PRIMARY}, /* "regular" */
   367     {suboption_16, 8, UCOL_PRIMARY}, /* "implicit" */
   368     {suboption_17, 8, UCOL_PRIMARY}, /* "trailing" */
   369 };
   371 enum OptionNumber { /* each value is the index of the matching entry in rulesOptions[]; keep both in the same order */
   372     OPTION_ALTERNATE_HANDLING = 0, /* "alternate" */
   373     OPTION_FRENCH_COLLATION, /* "backwards" */
   374     OPTION_CASE_LEVEL, /* "caseLevel" */
   375     OPTION_CASE_FIRST, /* "caseFirst" */
   376     OPTION_NORMALIZATION_MODE, /* "normalization" */
   377     OPTION_HIRAGANA_QUATERNARY, /* "hiraganaQ" */
   378     OPTION_STRENGTH, /* "strength" */
   379     OPTION_NUMERIC_COLLATION, /* "numericOrdering" */
   380     OPTION_NORMAL_OPTIONS_LIMIT = OPTION_NUMERIC_COLLATION, /* alias of the last option that maps directly to a UColAttribute */
   381     OPTION_VARIABLE_TOP, /* "variable top" */
   382     OPTION_REARRANGE, /* "rearrange" */
   383     OPTION_BEFORE, /* "before" */
   384     OPTION_TOP, /* "top" */
   385     OPTION_FIRST, /* "first"; must be immediately followed by OPTION_LAST (indirectIndex computation) */
   386     OPTION_LAST, /* "last" */
   387     OPTION_OPTIMIZE, /* "optimize" */
   388     OPTION_SUPPRESS_CONTRACTIONS, /* "suppressContractions" */
   389     OPTION_UNDEFINED, /* "undefined" */
   390     OPTION_SCRIPT_ORDER, /* "scriptOrder" */
   391     OPTION_CHARSET_NAME, /* "charsetname" */
   392     OPTION_CHARSET, /* "charset" */
   393     OPTION_IMPORT, /* "import" */
   394     OPTION_SCRIPTREORDER /* "reorder" */
   395 } ;
   397 static const ucolTokOption rulesOptions[UTOK_OPTION_COUNT] = { /* {name, name length, suboption table, suboption count, attribute}; indexed by enum OptionNumber */
   398     /*00*/ {option_02,  9, alternateSub, 2, UCOL_ALTERNATE_HANDLING}, /*"alternate" */
   399     /*01*/ {option_03,  9, frenchSub, 1, UCOL_FRENCH_COLLATION}, /*"backwards"      */
   400     /*02*/ {option_07,  9, onOffSub, 2, UCOL_CASE_LEVEL},  /*"caseLevel"      */
   401     /*03*/ {option_08,  9, caseFirstSub, 3, UCOL_CASE_FIRST}, /*"caseFirst"   */
   402     /*04*/ {option_06, 13, onOffSub, 2, UCOL_NORMALIZATION_MODE}, /*"normalization" */
   403     /*05*/ {option_13, 9, onOffSub, 2, UCOL_HIRAGANA_QUATERNARY_MODE}, /*"hiraganaQ" */
   404     /*06*/ {option_14, 8, strengthSub, 5, UCOL_STRENGTH}, /*"strength" */
   405     /*07*/ {option_19, 15, onOffSub, 2, UCOL_NUMERIC_COLLATION},  /*"numericOrdering"*/
   406     /*08*/ {option_04, 12, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"variable top"   */
   407     /*09*/ {option_01,  9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"rearrange"      */
   408     /*10*/ {option_12,  6, beforeSub, 3, UCOL_ATTRIBUTE_COUNT}, /*"before"    */
   409     /*11*/ {option_05,  3, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"top"            */
   410     /*12*/ {option_15,  5, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"first" */
   411     /*13*/ {option_16,  4, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"last" */
   412     /*14*/ {option_17,  8, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"optimize"      */
   413     /*15*/ {option_18, 20, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"suppressContractions"      */
   414     /*16*/ {option_00,  9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"undefined"      */
   415     /*17*/ {option_09, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"scriptOrder"    */
   416     /*18*/ {option_10, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"charsetname"    */
   417     /*19*/ {option_11,  7, NULL, 0, UCOL_ATTRIBUTE_COUNT},  /*"charset"        */
   418     /*20*/ {option_20,  6, NULL, 0, UCOL_ATTRIBUTE_COUNT},  /*"import"        */
   419     /*21*/ {option_21,  7, NULL, 0, UCOL_ATTRIBUTE_COUNT}  /*"reorder"        */
   420 };
   422 static
   423 int32_t u_strncmpNoCase(const UChar     *s1,
   424                         const UChar     *s2,
   425                         int32_t     n)
   426 {
   427     if(n > 0) {
   428         int32_t rc;
   429         for(;;) {
   430             rc = (int32_t)u_tolower(*s1) - (int32_t)u_tolower(*s2);
   431             if(rc != 0 || *s1 == 0 || --n == 0) {
   432                 return rc;
   433             }
   434             ++s1;
   435             ++s2;
   436         }
   437     }
   438     return 0;
   439 }
   441 static
   442 void ucol_uprv_tok_initData() {
   443     if(!didInit) {
   444         U_STRING_INIT(suboption_00, "non-ignorable", 13);
   445         U_STRING_INIT(suboption_01, "shifted",        7);
   447         U_STRING_INIT(suboption_02, "lower",          5);
   448         U_STRING_INIT(suboption_03, "upper",          5);
   449         U_STRING_INIT(suboption_04, "off",            3);
   450         U_STRING_INIT(suboption_05, "on",             2);
   452         U_STRING_INIT(suboption_06, "1",              1);
   453         U_STRING_INIT(suboption_07, "2",              1);
   454         U_STRING_INIT(suboption_08, "3",              1);
   455         U_STRING_INIT(suboption_09, "4",              1);
   456         U_STRING_INIT(suboption_10, "I",              1);
   458         U_STRING_INIT(suboption_11, "primary",        7);
   459         U_STRING_INIT(suboption_12, "secondary",      9);
   460         U_STRING_INIT(suboption_13, "tertiary",       8);
   461         U_STRING_INIT(suboption_14, "variable",       8);
   462         U_STRING_INIT(suboption_15, "regular",        7);
   463         U_STRING_INIT(suboption_16, "implicit",       8);
   464         U_STRING_INIT(suboption_17, "trailing",       8);
   467         U_STRING_INIT(option_00, "undefined",      9);
   468         U_STRING_INIT(option_01, "rearrange",      9);
   469         U_STRING_INIT(option_02, "alternate",      9);
   470         U_STRING_INIT(option_03, "backwards",      9);
   471         U_STRING_INIT(option_04, "variable top",  12);
   472         U_STRING_INIT(option_05, "top",            3);
   473         U_STRING_INIT(option_06, "normalization", 13);
   474         U_STRING_INIT(option_07, "caseLevel",      9);
   475         U_STRING_INIT(option_08, "caseFirst",      9);
   476         U_STRING_INIT(option_09, "scriptOrder",   11);
   477         U_STRING_INIT(option_10, "charsetname",   11);
   478         U_STRING_INIT(option_11, "charset",        7);
   479         U_STRING_INIT(option_12, "before",         6);
   480         U_STRING_INIT(option_13, "hiraganaQ",      9);
   481         U_STRING_INIT(option_14, "strength",       8);
   482         U_STRING_INIT(option_15, "first",          5);
   483         U_STRING_INIT(option_16, "last",           4);
   484         U_STRING_INIT(option_17, "optimize",       8);
   485         U_STRING_INIT(option_18, "suppressContractions",         20);
   486         U_STRING_INIT(option_19, "numericOrdering",      15);
   487         U_STRING_INIT(option_20, "import ",        6);
   488         U_STRING_INIT(option_21, "reorder",        7);
   489         didInit = TRUE;
   490     }
   491 }
   494 // This function reads basic options to set in the runtime collator
   495 // used by data driven tests. Should not support build time options
   496 U_CAPI const UChar * U_EXPORT2
   497 ucol_tok_getNextArgument(const UChar *start, const UChar *end,
   498                          UColAttribute *attrib, UColAttributeValue *value,
   499                          UErrorCode *status)
   500 {
   501     uint32_t i = 0;
   502     int32_t j=0;
   503     UBool foundOption = FALSE;
   504     const UChar *optionArg = NULL;
   506     ucol_uprv_tok_initData();
   508     while(start < end && PatternProps::isWhiteSpace(*start)) { /* eat whitespace */
   509         start++;
   510     }
   511     if(start >= end) {
   512         return NULL;
   513     }
   514     /* skip opening '[' */
   515     if(*start == 0x005b) {
   516         start++;
   517     } else {
   518         *status = U_ILLEGAL_ARGUMENT_ERROR; // no opening '['
   519         return NULL;
   520     }
   522     while(i < UTOK_OPTION_COUNT) {
   523         if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].optionLen) == 0) {
   524             foundOption = TRUE;
   525             if(end - start > rulesOptions[i].optionLen) {
   526                 optionArg = start+rulesOptions[i].optionLen+1; /* start of the options, skip space */
   527                 while(PatternProps::isWhiteSpace(*optionArg)) { /* eat whitespace */
   528                     optionArg++;
   529                 }
   530             }
   531             break;
   532         }
   533         i++;
   534     }
   536     if(!foundOption) {
   537         *status = U_ILLEGAL_ARGUMENT_ERROR;
   538         return NULL;
   539     }
   541     if(optionArg) {
   542         for(j = 0; j<rulesOptions[i].subSize; j++) {
   543             if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
   544                 //ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr, rulesOptions[i].subopts[j].attrVal);
   545                 *attrib = rulesOptions[i].attr;
   546                 *value = rulesOptions[i].subopts[j].attrVal;
   547                 optionArg += rulesOptions[i].subopts[j].subLen;
   548                 while(PatternProps::isWhiteSpace(*optionArg)) { /* eat whitespace */
   549                     optionArg++;
   550                 }
   551                 if(*optionArg == 0x005d) {
   552                     optionArg++;
   553                     return optionArg;
   554                 } else {
   555                     *status = U_ILLEGAL_ARGUMENT_ERROR;
   556                     return NULL;
   557                 }
   558             }
   559         }
   560     }
   561     *status = U_ILLEGAL_ARGUMENT_ERROR;
   562     return NULL;
   563 }
   565 static
   566 USet *ucol_uprv_tok_readAndSetUnicodeSet(const UChar *start, const UChar *end, UErrorCode *status) {
   567     while(*start != 0x005b) { /* advance while we find the first '[' */
   568         start++;
   569     }
   570     // now we need to get a balanced set of '[]'. The problem is that a set can have
   571     // many, and *end point to the first closing '['
   572     int32_t noOpenBraces = 1;
   573     int32_t current = 1; // skip the opening brace
   574     while(start+current < end && noOpenBraces != 0) {
   575         if(start[current] == 0x005b) {
   576             noOpenBraces++;
   577         } else if(start[current] == 0x005D) { // closing brace
   578             noOpenBraces--;
   579         }
   580         current++;
   581     }
   583     if(noOpenBraces != 0 || u_strchr(start+current, 0x005d /*']'*/) == NULL) {
   584         *status = U_ILLEGAL_ARGUMENT_ERROR;
   585         return NULL;
   586     }
   587     return uset_openPattern(start, current, status);
   588 }
   590 /**
   591  * Reads an option and matches the option name with the predefined options. (Case-insensitive.)
   592  * @param start Pointer to the start UChar.
   593  * @param end Pointer to the last valid pointer beyond which the option will not extend.
   594  * @param optionArg Address of the pointer at which the options start (after the option name)
   595  * @return The index of the option, or -1 if the option is not valid.
   596  */
   597 static
   598 int32_t ucol_uprv_tok_readOption(const UChar *start, const UChar *end, const UChar **optionArg) {
   599     int32_t i = 0;
   600     ucol_uprv_tok_initData();
   602     while(PatternProps::isWhiteSpace(*start)) { /* eat whitespace */
   603         start++;
   604     }
   605     while(i < UTOK_OPTION_COUNT) {
   606         if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].optionLen) == 0) {
   607             if(end - start > rulesOptions[i].optionLen) {
   608                 *optionArg = start+rulesOptions[i].optionLen; /* End of option name; start of the options */
   609                 while(PatternProps::isWhiteSpace(**optionArg)) { /* eat whitespace */
   610                     (*optionArg)++;
   611                 }
   612             }
   613             break;
   614         }
   615         i++;
   616     }
   617     if(i == UTOK_OPTION_COUNT) {
   618         i = -1; // didn't find an option
   619     }
   620     return i;
   621 }
   624 static
   625 void ucol_tok_parseScriptReorder(UColTokenParser *src, UErrorCode *status) {
   626     int32_t codeCount = 0;
   627     int32_t codeIndex = 0;
   628     char conversion[64];
   629     int32_t tokenLength = 0;
   630     const UChar* space;
   632     const UChar* current = src->current;
   633     const UChar* end = u_memchr(src->current, 0x005d, src->end - src->current);
   635     // eat leading whitespace
   636     while(current < end && u_isWhitespace(*current)) {
   637         current++;
   638     }
   640     while(current < end) {    
   641         space = u_memchr(current, 0x0020, end - current);
   642         space = space == 0 ? end : space;
   643         tokenLength = space - current;
   644         if (tokenLength < 4) {
   645             *status = U_INVALID_FORMAT_ERROR;
   646             return;
   647         }
   648         codeCount++;
   649         current += tokenLength;
   650         while(current < end && u_isWhitespace(*current)) { /* eat whitespace */
   651             ++current;
   652         }
   653     }
   655     if (codeCount == 0) {
   656         *status = U_INVALID_FORMAT_ERROR;
   657     }
   659     src->reorderCodesLength = codeCount;
   660     src->reorderCodes = (int32_t*)uprv_malloc(codeCount * sizeof(int32_t));
   661     current = src->current;
   663     // eat leading whitespace
   664     while(current < end && u_isWhitespace(*current)) {
   665         current++;
   666     }
   668     while(current < end) {    
   669         space = u_memchr(current, 0x0020, end - current);
   670         space = space == 0 ? end : space;
   671         tokenLength = space - current;
   672         if (tokenLength < 4) {
   673             *status = U_ILLEGAL_ARGUMENT_ERROR;
   674             return;
   675         } else {
   676             u_UCharsToChars(current, conversion, tokenLength);
   677             conversion[tokenLength] = '\0';
   678             src->reorderCodes[codeIndex] = ucol_findReorderingEntry(conversion);
   679             if (src->reorderCodes[codeIndex] == USCRIPT_INVALID_CODE) {
   680                 src->reorderCodes[codeIndex] = u_getPropertyValueEnum(UCHAR_SCRIPT, conversion);
   681             }
   682             if (src->reorderCodes[codeIndex] == USCRIPT_INVALID_CODE) {
   683                 *status = U_ILLEGAL_ARGUMENT_ERROR;
   684             }
   685         }
   686         codeIndex++;
   687         current += tokenLength;
   688         while(current < end && u_isWhitespace(*current)) { /* eat whitespace */
   689             ++current;
   690         }
   691     }
   692 }
   694 // reads and conforms to various options in rules
   695 // end is the position of the first closing ']'
   696 // However, some of the options take an UnicodeSet definition
   697 // which needs to duplicate the closing ']'
   698 // for example: '[copy [\uAC00-\uD7FF]]'
   699 // These options will move end to the second ']' and the
   700 // caller will set the current to it.
   701 static
   702 uint8_t ucol_uprv_tok_readAndSetOption(UColTokenParser *src, UErrorCode *status) {
   703     const UChar* start = src->current;
   704     int32_t i = 0;
   705     int32_t j=0;
   706     const UChar *optionArg = NULL;
   708     uint8_t result = 0;
   710     start++; /*skip opening '['*/
   711     i = ucol_uprv_tok_readOption(start, src->end, &optionArg);
   712     if(optionArg) {
   713         src->current = optionArg;
   714     }
   716     if(i < 0) {
   717         *status = U_ILLEGAL_ARGUMENT_ERROR;
   718     } else {
   719         int32_t noOpenBraces = 1;
   720         switch(i) {
   721     case OPTION_ALTERNATE_HANDLING:
   722     case OPTION_FRENCH_COLLATION:
   723     case OPTION_CASE_LEVEL:
   724     case OPTION_CASE_FIRST:
   725     case OPTION_NORMALIZATION_MODE:
   726     case OPTION_HIRAGANA_QUATERNARY:
   727     case OPTION_STRENGTH:
   728     case OPTION_NUMERIC_COLLATION:
   729         if(optionArg) {
   730             for(j = 0; j<rulesOptions[i].subSize; j++) {
   731                 if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
   732                     ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr, rulesOptions[i].subopts[j].attrVal);
   733                     result =  UCOL_TOK_SUCCESS;
   734                 }
   735             }
   736         }
   737         if(result == 0) {
   738             *status = U_ILLEGAL_ARGUMENT_ERROR;
   739         }
   740         break;
   741     case OPTION_VARIABLE_TOP:
   742         result = UCOL_TOK_SUCCESS | UCOL_TOK_VARIABLE_TOP;
   743         break;
   744     case OPTION_REARRANGE:
   745         result = UCOL_TOK_SUCCESS;
   746         break;
   747     case OPTION_BEFORE:
   748         if(optionArg) {
   749             for(j = 0; j<rulesOptions[i].subSize; j++) {
   750                 if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
   751                     result = UCOL_TOK_SUCCESS | (rulesOptions[i].subopts[j].attrVal + 1);
   752                 }
   753             }
   754         }
   755         if(result == 0) {
   756             *status = U_ILLEGAL_ARGUMENT_ERROR;
   757         }
   758         break;
   759     case OPTION_TOP: /* we are going to have an array with structures of limit CEs */
   760         /* index to this array will be src->parsedToken.indirectIndex*/
   761         src->parsedToken.indirectIndex = 0;
   762         result = UCOL_TOK_SUCCESS | UCOL_TOK_TOP;
   763         break;
   764     case OPTION_FIRST:
   765     case OPTION_LAST: /* first, last */
   766         for(j = 0; j<rulesOptions[i].subSize; j++) {
   767             if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
   768                 // the calculation below assumes that OPTION_FIRST and OPTION_LAST are at i and i+1 and that the first
   769                 // element of indirect boundaries is reserved for top.
   770                 src->parsedToken.indirectIndex = (uint16_t)(i-OPTION_FIRST+1+j*2);
   771                 result =  UCOL_TOK_SUCCESS | UCOL_TOK_TOP;;
   772             }
   773         }
   774         if(result == 0) {
   775             *status = U_ILLEGAL_ARGUMENT_ERROR;
   776         }
   777         break;
   778     case OPTION_OPTIMIZE:
   779     case OPTION_SUPPRESS_CONTRACTIONS:  // copy and remove are handled before normalization
   780         // we need to move end here
   781         src->current++; // skip opening brace
   782         while(src->current < src->end && noOpenBraces != 0) {
   783             if(*src->current == 0x005b) {
   784                 noOpenBraces++;
   785             } else if(*src->current == 0x005D) { // closing brace
   786                 noOpenBraces--;
   787             }
   788             src->current++;
   789         }
   790         result = UCOL_TOK_SUCCESS;
   791         break;
   792     case OPTION_SCRIPTREORDER:
   793         ucol_tok_parseScriptReorder(src, status);
   794         break;
   795     default:
   796         *status = U_UNSUPPORTED_ERROR;
   797         break;
   798         }
   799     }
   800     src->current = u_memchr(src->current, 0x005d, (int32_t)(src->end-src->current));
   801     return result;
   802 }
   805 inline void ucol_tok_addToExtraCurrent(UColTokenParser *src, const UChar *stuff, int32_t len, UErrorCode *status) {
   806     if (stuff == NULL || len <= 0) {
   807         return;
   808     }
   809     UnicodeString tempStuff(FALSE, stuff, len);
   810     if(src->extraCurrent+len >= src->extraEnd) {
   811         /* reallocate */
   812         if (stuff >= src->source && stuff <= src->end) {
   813             // Copy the "stuff" contents into tempStuff's own buffer.
   814             // UnicodeString is copy-on-write.
   815             if (len > 0) {
   816                 tempStuff.setCharAt(0, tempStuff[0]);
   817             } else {
   818                 tempStuff.remove();
   819             }
   820         }
   821         UChar *newSrc = (UChar *)uprv_realloc(src->source, (src->extraEnd-src->source)*2*sizeof(UChar));
   822         if(newSrc != NULL) {
   823             src->current = newSrc + (src->current - src->source);
   824             src->extraCurrent = newSrc + (src->extraCurrent - src->source);
   825             src->end = newSrc + (src->end - src->source);
   826             src->extraEnd = newSrc + (src->extraEnd-src->source)*2;
   827             src->sourceCurrent = newSrc + (src->sourceCurrent-src->source);
   828             src->source = newSrc;
   829         } else {
   830             *status = U_MEMORY_ALLOCATION_ERROR;
   831             return;
   832         }
   833     }
   834     if(len == 1) {
   835         *src->extraCurrent++ = tempStuff[0];
   836     } else {
   837         u_memcpy(src->extraCurrent, tempStuff.getBuffer(), len);
   838         src->extraCurrent += len;
   839     }
   840 }
   842 inline UBool ucol_tok_doSetTop(UColTokenParser *src, UErrorCode *status) {
   843     /*
   844     top = TRUE;
   845     */
   846     UChar buff[5];
   847     src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
   848     buff[0] = 0xFFFE;
   849     buff[1] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE >> 16);
   850     buff[2] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE & 0xFFFF);
   851     if(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE == 0) {
   852         src->parsedToken.charsLen = 3;
   853         ucol_tok_addToExtraCurrent(src, buff, 3, status);
   854     } else {
   855         buff[3] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE >> 16);
   856         buff[4] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE & 0xFFFF);
   857         src->parsedToken.charsLen = 5;
   858         ucol_tok_addToExtraCurrent(src, buff, 5, status);
   859     }
   860     return TRUE;
   861 }
   863 static UBool isCharNewLine(UChar c){
   864     switch(c){
   865     case 0x000A: /* LF  */
   866     case 0x000D: /* CR  */
   867     case 0x000C: /* FF  */
   868     case 0x0085: /* NEL */
   869     case 0x2028: /* LS  */
   870     case 0x2029: /* PS  */
   871         return TRUE;
   872     default:
   873         return FALSE;
   874     }
   875 }
   877 /*
   878  * This function is called several times when a range is processed.  Each time, the next code point
   879  * is processed.
   880  * The following variables must be set before calling this function:
   881  *   src->currentRangeCp:  The current code point to process.
   882  *   src->lastRangeCp: The last code point in the range.
   883  * Pre-requisite: src->currentRangeCp <= src->lastRangeCp.
   884  */
   885 static const UChar*
   886 ucol_tok_processNextCodePointInRange(UColTokenParser *src,
   887                                      UErrorCode *status)
   888 {
   889   // Append current code point to source
   890   UChar buff[U16_MAX_LENGTH];
   891   uint32_t i = 0;
   893   uint32_t nChars = U16_LENGTH(src->currentRangeCp);
   894   src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
   895   src->parsedToken.charsLen = nChars;
   897   U16_APPEND_UNSAFE(buff, i, src->currentRangeCp);
   898   ucol_tok_addToExtraCurrent(src, buff, nChars, status);
   900   ++src->currentRangeCp;
   901   if (src->currentRangeCp > src->lastRangeCp) {
   902     src->inRange = FALSE;
   904     if (src->currentStarredCharIndex > src->lastStarredCharIndex) {
   905       src->isStarred = FALSE;
   906     }
   907   } else {
   908     src->previousCp = src->currentRangeCp;
   909   }
   910   return src->current;
   911 }
   913 /*
   914  * This function is called several times when a starred list is processed.  Each time, the next code point
   915  * in the list is processed.
   916  * The following variables must be set before calling this function:
   917  *   src->currentStarredCharIndex:  Index (in src->source) of the first char of the current code point.
   918  *   src->lastStarredCharIndex: Index to the last character in the list.
   919  * Pre-requisite: src->currentStarredCharIndex <= src->lastStarredCharIndex.
   920  */
   921 static const UChar*
   922 ucol_tok_processNextTokenInStarredList(UColTokenParser *src)
   923 {
   924   // Extract the characters corresponding to the next code point.
   925   UChar32 cp;
   926   src->parsedToken.charsOffset = src->currentStarredCharIndex;
   927   int32_t prev = src->currentStarredCharIndex;
   928   U16_NEXT(src->source, src->currentStarredCharIndex, (uint32_t)(src->end - src->source), cp);
   929   src->parsedToken.charsLen = src->currentStarredCharIndex - prev;
   931   // When we are done parsing the starred string, turn the flag off so that
   932   // the normal processing is restored.
   933   if (src->currentStarredCharIndex > src->lastStarredCharIndex) {
   934     src->isStarred = FALSE;
   935   }
   936   src->previousCp = cp;
   937   return src->current;
   938 }
   940 /*
   941  * Partially parses the next token, keeps the indices in src->parsedToken, and updates the counters.
   942  *
   943  * This routine parses and separates almost all tokens. The following are the syntax characters recognized.
   944  *  # : Comment character
   945  *  & : Reset operator
   946  *  = : Equality
   947  *  < : Primary collation
   948  *  << : Secondary collation
   949  *  <<< : Tertiary collation
   950  *  ; : Secondary collation
   951  *  , : Tertiary collation
   952  *  / : Expansions
   953  *  | : Prefix
   954  *  - : Range
   956  *  ! : Java Thai modifier, ignored
   957  *  @ : French only
   959  * [] : Options
   960  * '' : Quotes
   961  *
   962  *  Along with operators =, <, <<, <<<, the operator * is supported to indicate a list.  For example, &a<*bcdexyz
   963  *  is equivalent to &a<b<c<d<e<x<y<z.  In lists, ranges also can be given, so &a*b-ex-z is equivalent to the above.
   964  *  This function do not separate the tokens in a list.  Instead, &a<*b-ex-z is parsed as three tokens - "&a",
   965  *  "<*b", "-ex", "-z".  The strength (< in this case), whether in a list, whether in a range and the previous
   966  *  character returned as cached so that the calling program can do further splitting.
   967  */
   968 static const UChar*
   969 ucol_tok_parseNextTokenInternal(UColTokenParser *src,
   970                                 UBool startOfRules,
   971                                 UParseError *parseError,
   972                                 UErrorCode *status)
   973 {
   974     UBool variableTop = FALSE;
   975     UBool top = FALSE;
   976     UBool inChars = TRUE;
   977     UBool inQuote = FALSE;
   978     UBool wasInQuote = FALSE;
   979     uint8_t before = 0;
   980     UBool isEscaped = FALSE;
   982     // TODO: replace these variables with src->parsedToken counterparts
   983     // no need to use them anymore since we have src->parsedToken.
   984     // Ideally, token parser would be a nice class... Once, when I have
   985     // more time (around 2020 probably).
   986     uint32_t newExtensionLen = 0;
   987     uint32_t extensionOffset = 0;
   988     uint32_t newStrength = UCOL_TOK_UNSET;
   989     UChar buff[10];
   991     src->parsedToken.charsOffset = 0;  src->parsedToken.charsLen = 0;
   992     src->parsedToken.prefixOffset = 0; src->parsedToken.prefixLen = 0;
   993     src->parsedToken.indirectIndex = 0;
   995     while (src->current < src->end) {
   996         UChar ch = *(src->current);
   998         if (inQuote) {
   999             if (ch == 0x0027/*'\''*/) {
  1000                 inQuote = FALSE;
  1001             } else {
  1002                 if ((src->parsedToken.charsLen == 0) || inChars) {
  1003                     if(src->parsedToken.charsLen == 0) {
  1004                         src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
  1006                     src->parsedToken.charsLen++;
  1007                 } else {
  1008                     if(newExtensionLen == 0) {
  1009                         extensionOffset = (uint32_t)(src->extraCurrent - src->source);
  1011                     newExtensionLen++;
  1014         }else if(isEscaped){
  1015             isEscaped =FALSE;
  1016             if (newStrength == UCOL_TOK_UNSET) {
  1017                 *status = U_INVALID_FORMAT_ERROR;
  1018                 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
  1019                 DBG_FORMAT_ERROR
  1020                 return NULL;
  1021                 // enabling rules to start with non-tokens a < b
  1022                 // newStrength = UCOL_TOK_RESET;
  1024             if(ch != 0x0000  && src->current != src->end) {
  1025                 if (inChars) {
  1026                     if(src->parsedToken.charsLen == 0) {
  1027                         src->parsedToken.charsOffset = (uint32_t)(src->current - src->source);
  1029                     src->parsedToken.charsLen++;
  1030                 } else {
  1031                     if(newExtensionLen == 0) {
  1032                         extensionOffset = (uint32_t)(src->current - src->source);
  1034                     newExtensionLen++;
  1037         }else {
  1038             if(!PatternProps::isWhiteSpace(ch)) {
  1039                 /* Sets the strength for this entry */
  1040                 switch (ch) {
  1041                 case 0x003D/*'='*/ :
  1042                     if (newStrength != UCOL_TOK_UNSET) {
  1043                         goto EndOfLoop;
  1046                     /* if we start with strength, we'll reset to top */
  1047                     if(startOfRules == TRUE) {
  1048                         src->parsedToken.indirectIndex = 5;
  1049                         top = ucol_tok_doSetTop(src, status);
  1050                         newStrength = UCOL_TOK_RESET;
  1051                         goto EndOfLoop;
  1053                     newStrength = UCOL_IDENTICAL;
  1054                     if(*(src->current+1) == 0x002A) {/*'*'*/
  1055                         src->current++;
  1056                         src->isStarred = TRUE;
  1058                     break;
  1060                 case 0x002C/*','*/:
  1061                     if (newStrength != UCOL_TOK_UNSET) {
  1062                         goto EndOfLoop;
  1065                     /* if we start with strength, we'll reset to top */
  1066                     if(startOfRules == TRUE) {
  1067                         src->parsedToken.indirectIndex = 5;
  1068                         top = ucol_tok_doSetTop(src, status);
  1069                         newStrength = UCOL_TOK_RESET;
  1070                         goto EndOfLoop;
  1072                     newStrength = UCOL_TERTIARY;
  1073                     break;
  1075                 case  0x003B/*';'*/:
  1076                     if (newStrength != UCOL_TOK_UNSET) {
  1077                         goto EndOfLoop;
  1080                     /* if we start with strength, we'll reset to top */
  1081                     if(startOfRules == TRUE) {
  1082                         src->parsedToken.indirectIndex = 5;
  1083                         top = ucol_tok_doSetTop(src, status);
  1084                         newStrength = UCOL_TOK_RESET;
  1085                         goto EndOfLoop;
  1087                     newStrength = UCOL_SECONDARY;
  1088                     break;
  1090                 case 0x003C/*'<'*/:
  1091                     if (newStrength != UCOL_TOK_UNSET) {
  1092                         goto EndOfLoop;
  1095                     /* if we start with strength, we'll reset to top */
  1096                     if(startOfRules == TRUE) {
  1097                         src->parsedToken.indirectIndex = 5;
  1098                         top = ucol_tok_doSetTop(src, status);
  1099                         newStrength = UCOL_TOK_RESET;
  1100                         goto EndOfLoop;
  1102                     /* before this, do a scan to verify whether this is */
  1103                     /* another strength */
  1104                     if(*(src->current+1) == 0x003C) {
  1105                         src->current++;
  1106                         if(*(src->current+1) == 0x003C) {
  1107                             src->current++; /* three in a row! */
  1108                             newStrength = UCOL_TERTIARY;
  1109                         } else { /* two in a row */
  1110                             newStrength = UCOL_SECONDARY;
  1112                     } else { /* just one */
  1113                         newStrength = UCOL_PRIMARY;
  1115                     if(*(src->current+1) == 0x002A) {/*'*'*/
  1116                         src->current++;
  1117                         src->isStarred = TRUE;
  1119                     break;
  1121                 case 0x0026/*'&'*/:
  1122                     if (newStrength != UCOL_TOK_UNSET) {
  1123                         /**/
  1124                         goto EndOfLoop;
  1127                     newStrength = UCOL_TOK_RESET; /* PatternEntry::RESET = 0 */
  1128                     break;
  1130                 case 0x005b/*'['*/:
  1131                     /* options - read an option, analyze it */
  1132                     if(u_strchr(src->current, 0x005d /*']'*/) != NULL) {
  1133                         uint8_t result = ucol_uprv_tok_readAndSetOption(src, status);
  1134                         if(U_SUCCESS(*status)) {
  1135                             if(result & UCOL_TOK_TOP) {
  1136                                 if(newStrength == UCOL_TOK_RESET) {
  1137                                     top = ucol_tok_doSetTop(src, status);
  1138                                     if(before) { // This is a combination of before and indirection like '&[before 2][first regular]<b'
  1139                                         src->parsedToken.charsLen+=2;
  1140                                         buff[0] = 0x002d;
  1141                                         buff[1] = before;
  1142                                         ucol_tok_addToExtraCurrent(src, buff, 2, status);
  1145                                     src->current++;
  1146                                     goto EndOfLoop;
  1147                                 } else {
  1148                                     *status = U_INVALID_FORMAT_ERROR;
  1149                                     syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
  1150                                     DBG_FORMAT_ERROR
  1152                             } else if(result & UCOL_TOK_VARIABLE_TOP) {
  1153                                 if(newStrength != UCOL_TOK_RESET && newStrength != UCOL_TOK_UNSET) {
  1154                                     variableTop = TRUE;
  1155                                     src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
  1156                                     src->parsedToken.charsLen = 1;
  1157                                     buff[0] = 0xFFFF;
  1158                                     ucol_tok_addToExtraCurrent(src, buff, 1, status);
  1159                                     src->current++;
  1160                                     goto EndOfLoop;
  1161                                 } else {
  1162                                     *status = U_INVALID_FORMAT_ERROR;
  1163                                     syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
  1164                                     DBG_FORMAT_ERROR
  1166                             } else if (result & UCOL_TOK_BEFORE){
  1167                                 if(newStrength == UCOL_TOK_RESET) {
  1168                                     before = result & UCOL_TOK_BEFORE;
  1169                                 } else {
  1170                                     *status = U_INVALID_FORMAT_ERROR;
  1171                                     syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
  1172                                     DBG_FORMAT_ERROR
  1175                         } else {
  1176                             *status = U_INVALID_FORMAT_ERROR;
  1177                             syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
  1178                             DBG_FORMAT_ERROR
  1179                             return NULL;
  1182                     break;
  1183                 case 0x0021/*! skip java thai modifier reordering*/:
  1184                     break;
  1185                 case 0x002F/*'/'*/:
  1186                     wasInQuote = FALSE; /* if we were copying source characters, we want to stop now */
  1187                     inChars = FALSE; /* we're now processing expansion */
  1188                     break;
  1189                 case 0x005C /* back slash for escaped chars */:
  1190                     isEscaped = TRUE;
  1191                     break;
  1192                     /* found a quote, we're gonna start copying */
  1193                 case 0x0027/*'\''*/:
  1194                     if (newStrength == UCOL_TOK_UNSET) { /* quote is illegal until we have a strength */
  1195                       *status = U_INVALID_FORMAT_ERROR;
  1196                       syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
  1197                       DBG_FORMAT_ERROR
  1198                       return NULL;
  1199                       // enabling rules to start with a non-token character a < b
  1200                       // newStrength = UCOL_TOK_RESET;
  1203                     inQuote = TRUE;
  1205                     if(inChars) { /* we're doing characters */
  1206                         if(wasInQuote == FALSE) {
  1207                             src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
  1209                         if (src->parsedToken.charsLen != 0) {
  1210                             ucol_tok_addToExtraCurrent(src, src->current - src->parsedToken.charsLen, src->parsedToken.charsLen, status);
  1212                         src->parsedToken.charsLen++;
  1213                     } else { /* we're doing an expansion */
  1214                         if(wasInQuote == FALSE) {
  1215                             extensionOffset = (uint32_t)(src->extraCurrent - src->source);
  1217                         if (newExtensionLen != 0) {
  1218                             ucol_tok_addToExtraCurrent(src, src->current - newExtensionLen, newExtensionLen, status);
  1220                         newExtensionLen++;
  1223                     wasInQuote = TRUE;
  1225                     ch = *(++(src->current));
  1226                     if(ch == 0x0027) { /* copy the double quote */
  1227                         ucol_tok_addToExtraCurrent(src, &ch, 1, status);
  1228                         inQuote = FALSE;
  1230                     break;
  1232                     /* '@' is french only if the strength is not currently set */
  1233                     /* if it is, it's just a regular character in collation rules */
  1234                 case 0x0040/*'@'*/:
  1235                     if (newStrength == UCOL_TOK_UNSET) {
  1236                         src->opts->frenchCollation = UCOL_ON;
  1237                         break;
  1240                 case 0x007C /*|*/: /* this means we have actually been reading prefix part */
  1241                     // we want to store read characters to the prefix part and continue reading
  1242                     // the characters (proper way would be to restart reading the chars, but in
  1243                     // that case we would have to complicate the token hasher, which I do not
  1244                     // intend to play with. Instead, we will do prefixes when prefixes are due
  1245                     // (before adding the elements).
  1246                     src->parsedToken.prefixOffset = src->parsedToken.charsOffset;
  1247                     src->parsedToken.prefixLen = src->parsedToken.charsLen;
  1249                     if(inChars) { /* we're doing characters */
  1250                         if(wasInQuote == FALSE) {
  1251                             src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
  1253                         if (src->parsedToken.charsLen != 0) {
  1254                             ucol_tok_addToExtraCurrent(src, src->current - src->parsedToken.charsLen, src->parsedToken.charsLen, status);
  1256                         src->parsedToken.charsLen++;
  1259                     wasInQuote = TRUE;
  1261                     do {
  1262                         ch = *(++(src->current));
  1263                         // skip whitespace between '|' and the character
  1264                     } while (PatternProps::isWhiteSpace(ch));
  1265                     break;
  1267                     //charsOffset = 0;
  1268                     //newCharsLen = 0;
  1269                     //break; // We want to store the whole prefix/character sequence. If we break
  1270                     // the '|' is going to get lost.
  1272                 case 0x002D /*-*/: /* A range. */
  1273                     if (newStrength != UCOL_TOK_UNSET) {
  1274                       // While processing the pending token, the isStarred field
  1275                       // is reset, so it needs to be saved for the next
  1276                       // invocation.
  1277                       src->savedIsStarred = src->isStarred;
  1278                       goto EndOfLoop;
  1280                    src->isStarred = src->savedIsStarred;
  1282                    // Ranges are valid only in starred tokens.
  1283                    if (!src->isStarred) {
  1284                      *status = U_INVALID_FORMAT_ERROR;
  1285                      syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
  1286                      DBG_FORMAT_ERROR
  1287                      return NULL;
  1289                    newStrength = src->parsedToken.strength;
  1290                    src->inRange = TRUE;
  1291                    break;
  1293                 case 0x0023 /*#*/: /* this is a comment, skip everything through the end of line */
  1294                     do {
  1295                         ch = *(++(src->current));
  1296                     } while (!isCharNewLine(ch));
  1298                     break;
  1299                 default:
  1300                     if (newStrength == UCOL_TOK_UNSET) {
  1301                       *status = U_INVALID_FORMAT_ERROR;
  1302                       syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
  1303                       DBG_FORMAT_ERROR
  1304                       return NULL;
  1307                     if (ucol_tok_isSpecialChar(ch) && (inQuote == FALSE)) {
  1308                         *status = U_INVALID_FORMAT_ERROR;
  1309                         syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
  1310                         DBG_FORMAT_ERROR
  1311                         return NULL;
  1314                     if(ch == 0x0000 && src->current+1 == src->end) {
  1315                         break;
  1318                     if (inChars) {
  1319                         if(src->parsedToken.charsLen == 0) {
  1320                             src->parsedToken.charsOffset = (uint32_t)(src->current - src->source);
  1322                         src->parsedToken.charsLen++;
  1323                     } else {
  1324                         if(newExtensionLen == 0) {
  1325                             extensionOffset = (uint32_t)(src->current - src->source);
  1327                         newExtensionLen++;
  1330                     break;
  1335         if(wasInQuote) {
  1336             if(ch != 0x27) {
  1337                 if(inQuote || !PatternProps::isWhiteSpace(ch)) {
  1338                     ucol_tok_addToExtraCurrent(src, &ch, 1, status);
  1343         src->current++;
  1346 EndOfLoop:
  1347     wasInQuote = FALSE;
  1348     if (newStrength == UCOL_TOK_UNSET) {
  1349         return NULL;
  1352     if (src->parsedToken.charsLen == 0 && top == FALSE) {
  1353         syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
  1354         *status = U_INVALID_FORMAT_ERROR;
  1355         DBG_FORMAT_ERROR
  1356         return NULL;
  1359     src->parsedToken.strength = newStrength;
  1360     src->parsedToken.extensionOffset = extensionOffset;
  1361     src->parsedToken.extensionLen = newExtensionLen;
  1362     src->parsedToken.flags = (UCOL_TOK_VARIABLE_TOP * (variableTop?1:0)) | (UCOL_TOK_TOP * (top?1:0)) | before;
  1364     return src->current;
  1367 /*
  1368  * Parses the next token, keeps the indices in src->parsedToken, and updates the counters.
  1369  * @see ucol_tok_parseNextTokenInternal() for the description of what operators are supported.
  1371  * In addition to what ucol_tok_parseNextTokenInternal() does, this function does the following:
  1372  *  1) ucol_tok_parseNextTokenInternal() returns a range as a single token.  This function separates
  1373  *     it to separate tokens and returns one by one.  In order to do that, the necessary states are
  1374  *     cached as member variables of the token parser.
  1375  *  2) When encountering a range, ucol_tok_parseNextTokenInternal() processes characters up to the
  1376  *     starting character as a single list token (which is separated into individual characters here)
  1377  *     and as another list token starting with the last character in the range.  Before expanding it
  1378  *     as a list of tokens, this function expands the range by filling the intermediate characters and
  1379  *     returns them one by one as separate tokens.
  1380  * Necessary checks are done for invalid combinations.
  1381  */
  1382 U_CAPI const UChar* U_EXPORT2
  1383 ucol_tok_parseNextToken(UColTokenParser *src,
  1384                         UBool startOfRules,
  1385                         UParseError *parseError,
  1386                         UErrorCode *status)
  1388   const UChar *nextToken;
  1390   if (src->inRange) {
  1391     // We are not done processing a range.  Continue it.
  1392     return ucol_tok_processNextCodePointInRange(src, status);
  1393   } else if (src->isStarred) {
  1394     // We are not done processing a starred token.  Continue it.
  1395     return ucol_tok_processNextTokenInStarredList(src);
  1398   // Get the next token.
  1399   nextToken = ucol_tok_parseNextTokenInternal(src, startOfRules, parseError, status);
  1401   if (nextToken == NULL) {
  1402     return NULL;
  1405   if (src->inRange) {
  1406     // A new range has started.
  1407     // Check whether it is a chain of ranges with more than one hyphen.
  1408     if (src->lastRangeCp > 0 && src->lastRangeCp == src->previousCp) {
  1409         *status = U_INVALID_FORMAT_ERROR;
  1410         syntaxError(src->source,src->parsedToken.charsOffset-1,
  1411                     src->parsedToken.charsOffset+src->parsedToken.charsLen, parseError);
  1412         DBG_FORMAT_ERROR
  1413         return NULL;
  1416     // The current token indicates the second code point of the range.
  1417     // Process just that, and then proceed with the star.
  1418     src->currentStarredCharIndex = src->parsedToken.charsOffset;
  1419     U16_NEXT(src->source, src->currentStarredCharIndex, 
  1420              (uint32_t)(src->end - src->source), src->lastRangeCp);
  1421     if (src->lastRangeCp <= src->previousCp) {
  1422         *status = U_INVALID_FORMAT_ERROR;
  1423         syntaxError(src->source,src->parsedToken.charsOffset-1,
  1424                     src->parsedToken.charsOffset+src->parsedToken.charsLen,parseError);
  1425         DBG_FORMAT_ERROR
  1426         return NULL;
  1429     // Set current range code point to process the range loop
  1430     src->currentRangeCp = src->previousCp + 1;
  1432     src->lastStarredCharIndex = src->parsedToken.charsOffset + src->parsedToken.charsLen - 1;
  1434     return ucol_tok_processNextCodePointInRange(src, status);
  1435  } else if (src->isStarred) {
  1436     // We define two indices m_currentStarredCharIndex_ and m_lastStarredCharIndex_ so that
  1437     // [m_currentStarredCharIndex_ .. m_lastStarredCharIndex_], both inclusive, need to be
  1438     // separated into several tokens and returned.
  1439     src->currentStarredCharIndex = src->parsedToken.charsOffset;
  1440     src->lastStarredCharIndex =  src->parsedToken.charsOffset + src->parsedToken.charsLen - 1;
  1442     return ucol_tok_processNextTokenInStarredList(src);
  1443   } else {
  1444     // Set previous codepoint
  1445     U16_GET(src->source, 0, src->parsedToken.charsOffset, (uint32_t)(src->end - src->source), src->previousCp);
  1447   return nextToken;
  1451 /*
  1452 Processing Description
  1453 1 Build a ListList. Each list has a header, which contains two lists (positive
  1454 and negative), a reset token, a baseCE, nextCE, and previousCE. The lists and
  1455 reset may be null.
  1456 2 As you process, you keep a LAST pointer that points to the last token you
  1457 handled.
  1459 */
  1461 static UColToken *ucol_tok_initAReset(UColTokenParser *src, const UChar *expand, uint32_t *expandNext,
  1462                                       UParseError *parseError, UErrorCode *status)
  1464     if(src->resultLen == src->listCapacity) {
  1465         // Unfortunately, this won't work, as we store addresses of lhs in token
  1466         src->listCapacity *= 2;
  1467         src->lh = (UColTokListHeader *)uprv_realloc(src->lh, src->listCapacity*sizeof(UColTokListHeader));
  1468         if(src->lh == NULL) {
  1469             *status = U_MEMORY_ALLOCATION_ERROR;
  1470             return NULL;
  1473     /* do the reset thing */
  1474     UColToken *sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken));
  1475     /* test for NULL */
  1476     if (sourceToken == NULL) {
  1477         *status = U_MEMORY_ALLOCATION_ERROR;
  1478         return NULL;
  1480     sourceToken->rulesToParseHdl = &(src->source);
  1481     sourceToken->source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;
  1482     sourceToken->expansion = src->parsedToken.extensionLen << 24 | src->parsedToken.extensionOffset;
  1484     sourceToken->debugSource = *(src->source + src->parsedToken.charsOffset);
  1485     sourceToken->debugExpansion = *(src->source + src->parsedToken.extensionOffset);
  1487     // keep the flags around so that we know about before
  1488     sourceToken->flags = src->parsedToken.flags;
  1490     if(src->parsedToken.prefixOffset != 0) {
  1491         // this is a syntax error
  1492         *status = U_INVALID_FORMAT_ERROR;
  1493         syntaxError(src->source,src->parsedToken.charsOffset-1,src->parsedToken.charsOffset+src->parsedToken.charsLen,parseError);
  1494         DBG_FORMAT_ERROR
  1495         uprv_free(sourceToken);
  1496         return 0;
  1497     } else {
  1498         sourceToken->prefix = 0;
  1501     sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should also handle reverse */
  1502     sourceToken->strength = UCOL_TOK_RESET;
  1503     sourceToken->next = NULL;
  1504     sourceToken->previous = NULL;
  1505     sourceToken->noOfCEs = 0;
  1506     sourceToken->noOfExpCEs = 0;
  1507     sourceToken->listHeader = &src->lh[src->resultLen];
  1509     src->lh[src->resultLen].first = NULL;
  1510     src->lh[src->resultLen].last = NULL;
  1511     src->lh[src->resultLen].first = NULL;
  1512     src->lh[src->resultLen].last = NULL;
  1514     src->lh[src->resultLen].reset = sourceToken;
  1516     /*
  1517     3 Consider each item: relation, source, and expansion: e.g. ...< x / y ...
  1518     First convert all expansions into normal form. Examples:
  1519     If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c *
  1520     d * ... into &x * c/y * d * ...
  1521     Note: reset values can never have expansions, although they can cause the
  1522     very next item to have one. They may be contractions, if they are found
  1523     earlier in the list.
  1524     */
  1525     *expandNext = 0;
  1526     if(expand != NULL) {
  1527         /* check to see if there is an expansion */
  1528         if(src->parsedToken.charsLen > 1) {
  1529             uint32_t resetCharsOffset;
  1530             resetCharsOffset = (uint32_t)(expand - src->source);
  1531             sourceToken->source = ((resetCharsOffset - src->parsedToken.charsOffset ) << 24) | src->parsedToken.charsOffset;
  1532             *expandNext = ((src->parsedToken.charsLen + src->parsedToken.charsOffset - resetCharsOffset)<<24) | (resetCharsOffset);
  1536     src->resultLen++;
  1538     uhash_put(src->tailored, sourceToken, sourceToken, status);
  1540     return sourceToken;
  1543 static
  1544 inline UColToken *getVirginBefore(UColTokenParser *src, UColToken *sourceToken, uint8_t strength, UParseError *parseError, UErrorCode *status) {
  1545     if(U_FAILURE(*status)) {
  1546         return NULL;
  1548     /* this is a virgin before - we need to fish the anchor from the UCA */
  1549     collIterate s;
  1550     uint32_t baseCE = UCOL_NOT_FOUND, baseContCE = UCOL_NOT_FOUND;
  1551     uint32_t CE, SecondCE;
  1552     // uint32_t invPos;
  1553     if(sourceToken != NULL) {
  1554         uprv_init_collIterate(src->UCA, src->source+((sourceToken->source)&0xFFFFFF), 1, &s, status);
  1555     } else {
  1556         uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset /**charsOffset*/, 1, &s, status);
  1558     if(U_FAILURE(*status)) {
  1559         return NULL;
  1562     baseCE = ucol_getNextCE(src->UCA, &s, status) & 0xFFFFFF3F;
  1563     baseContCE = ucol_getNextCE(src->UCA, &s, status);
  1564     if(baseContCE == UCOL_NO_MORE_CES) {
  1565         baseContCE = 0;
  1569     UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
  1570     uint32_t ch = 0;
  1571     uint32_t expandNext = 0;
  1572     UColToken key;
  1574     if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
  1575         uint32_t primary = (baseCE & UCOL_PRIMARYMASK) | ((baseContCE & UCOL_PRIMARYMASK) >> 16);
  1576         uint32_t raw = uprv_uca_getRawFromImplicit(primary);
  1577         ch = uprv_uca_getCodePointFromRaw(raw-1);
  1578         uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw-1);
  1579         CE = (primaryCE & UCOL_PRIMARYMASK) | 0x0505;
  1580         SecondCE = ((primaryCE << 16) & UCOL_PRIMARYMASK) | UCOL_CONTINUATION_MARKER;
  1582         src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
  1583         *src->extraCurrent++ = 0xFFFE;
  1584         *src->extraCurrent++ = (UChar)ch;
  1585         src->parsedToken.charsLen++;
  1587         key.source = (src->parsedToken.charsLen/**newCharsLen*/ << 24) | src->parsedToken.charsOffset/**charsOffset*/;
  1588         key.rulesToParseHdl = &(src->source);
  1590         //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key);
  1591         sourceToken = (UColToken *)uhash_get(src->tailored, &key);
  1593         if(sourceToken == NULL) {
  1594             src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F;
  1595             if(isContinuation(SecondCE)) {
  1596                 src->lh[src->resultLen].baseContCE = SecondCE;
  1597             } else {
  1598                 src->lh[src->resultLen].baseContCE = 0;
  1600             src->lh[src->resultLen].nextCE = 0;
  1601             src->lh[src->resultLen].nextContCE = 0;
  1602             src->lh[src->resultLen].previousCE = 0;
  1603             src->lh[src->resultLen].previousContCE = 0;
  1605             src->lh[src->resultLen].indirect = FALSE;
  1607             sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
  1610     } else {
  1611         /* invPos = */ ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &SecondCE, strength);
  1613         // we got the previous CE. Now we need to see if the difference between
  1614         // the two CEs is really of the requested strength.
  1615         // if it's a bigger difference (we asked for secondary and got primary), we
  1616         // need to modify the CE.
  1617         if(ucol_getCEStrengthDifference(baseCE, baseContCE, CE, SecondCE) < strength) {
  1618             // adjust the strength
  1619             // now we are in the situation where our baseCE should actually be modified in
  1620             // order to get the CE in the right position.
  1621             if(strength == UCOL_SECONDARY) {
  1622                 CE = baseCE - 0x0200;
  1623             } else { // strength == UCOL_TERTIARY
  1624                 CE = baseCE - 0x02;
  1626             if(baseContCE) {
  1627                 if(strength == UCOL_SECONDARY) {
  1628                     SecondCE = baseContCE - 0x0200;
  1629                 } else { // strength == UCOL_TERTIARY
  1630                     SecondCE = baseContCE - 0x02;
  1635 #if 0
  1636         // the code below relies on getting a code point from the inverse table, in order to be
  1637         // able to merge the situations like &x < 9 &[before 1]a < d. This won't work:
  1638         // 1. There are many code points that have the same CE
  1639         // 2. The CE to codepoint table (things pointed to by CETable[3*invPos+2] are broken.
  1640         // Also, in case when there is no equivalent strength before an element, we have to actually
  1641         // construct one. For example, &[before 2]a << x won't result in x << a, because the element
  1642         // before a is a primary difference.
  1644         //uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
  1647         ch = CETable[3*invPos+2];
  1649         if((ch &  UCOL_INV_SIZEMASK) != 0) {
  1650             uint16_t *conts = (uint16_t *)((uint8_t *)src->invUCA+src->invUCA->conts);
  1651             uint32_t offset = (ch & UCOL_INV_OFFSETMASK);
  1652             ch = conts[offset];
  1655         *src->extraCurrent++ = (UChar)ch;
  1656         src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source - 1);
  1657         src->parsedToken.charsLen = 1;
  1659         // We got an UCA before. However, this might have been tailored.
  1660         // example:
  1661         // &\u30ca = \u306a
  1662         // &[before 3]\u306a<<<\u306a|\u309d
  1665         // uint32_t key = (*newCharsLen << 24) | *charsOffset;
  1666         key.source = (src->parsedToken.charsLen/**newCharsLen*/ << 24) | src->parsedToken.charsOffset/**charsOffset*/;
  1667         key.rulesToParseHdl = &(src->source);
  1669         //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key);
  1670         sourceToken = (UColToken *)uhash_get(src->tailored, &key);
  1671 #endif
  1673         // here is how it should be. The situation such as &[before 1]a < x, should be
  1674         // resolved exactly as if we wrote &a > x.
  1675         // therefore, I don't really care if the UCA value before a has been changed.
  1676         // However, I do care if the strength between my element and the previous element
  1677         // is bigger then I wanted. So, if CE < baseCE and I wanted &[before 2], then i'll
  1678         // have to construct the base CE.
  1682         // if we found a tailored thing, we have to use the UCA value and construct
  1683         // a new reset token with constructed name
  1684         //if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) {
  1685         // character to which we want to anchor is already tailored.
  1686         // We need to construct a new token which will be the anchor
  1687         // point
  1688         //*(src->extraCurrent-1) = 0xFFFE;
  1689         //*src->extraCurrent++ = (UChar)ch;
  1690         // grab before
  1691         src->parsedToken.charsOffset -= 10;
  1692         src->parsedToken.charsLen += 10;
  1693         src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F;
  1694         if(isContinuation(SecondCE)) {
  1695             src->lh[src->resultLen].baseContCE = SecondCE;
  1696         } else {
  1697             src->lh[src->resultLen].baseContCE = 0;
  1699         src->lh[src->resultLen].nextCE = 0;
  1700         src->lh[src->resultLen].nextContCE = 0;
  1701         src->lh[src->resultLen].previousCE = 0;
  1702         src->lh[src->resultLen].previousContCE = 0;
  1704         src->lh[src->resultLen].indirect = FALSE;
  1706         sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
  1707         //}
  1710     return sourceToken;
  1714 uint32_t ucol_tok_assembleTokenList(UColTokenParser *src, UParseError *parseError, UErrorCode *status) {
  1715     UColToken *lastToken = NULL;
  1716     const UChar *parseEnd = NULL;
  1717     uint32_t expandNext = 0;
  1718     UBool variableTop = FALSE;
  1719     UBool top = FALSE;
  1720     uint16_t specs = 0;
  1721     UColTokListHeader *ListList = NULL;
  1723     src->parsedToken.strength = UCOL_TOK_UNSET;
  1725     ListList = src->lh;
  1727     if(U_FAILURE(*status)) {
  1728         return 0;
  1730 #ifdef DEBUG_FOR_CODE_POINTS
  1731     char filename[35];
  1732     sprintf(filename, "/tmp/debug_for_cp_%09d.txt", getpid());
  1733     dfcp_fp = fopen(filename, "a");
  1734     fprintf(stdout, "Output is in the file %s.\n", filename);
  1735 #endif
  1737 #ifdef DEBUG_FOR_COLL_RULES
  1738     std::string s3;
  1739     UnicodeString(src->source).toUTF8String(s3);
  1740     std::cout << "src->source = " << s3 << std::endl;
  1741 #endif
  1743     while(src->current < src->end || src->isStarred) {
  1744         src->parsedToken.prefixOffset = 0;
  1746         parseEnd = ucol_tok_parseNextToken(src,
  1747             (UBool)(lastToken == NULL),
  1748             parseError,
  1749             status);
  1751         specs = src->parsedToken.flags;
  1754         variableTop = ((specs & UCOL_TOK_VARIABLE_TOP) != 0);
  1755         top = ((specs & UCOL_TOK_TOP) != 0);
  1757         if(U_SUCCESS(*status) && parseEnd != NULL) {
  1758             UColToken *sourceToken = NULL;
  1759             //uint32_t key = 0;
  1760             uint32_t lastStrength = UCOL_TOK_UNSET;
  1762             if(lastToken != NULL ) {
  1763                 lastStrength = lastToken->strength;
  1766 #ifdef DEBUG_FOR_CODE_POINTS
  1767             UChar32 cp;
  1768             U16_GET(src->source, 0, src->parsedToken.charsOffset, (uint32_t)(src->extraEnd - src->source), cp);
  1769             fprintf(dfcp_fp, "Code point = %x, Strength = %x\n", cp, src->parsedToken.strength);
  1770 #endif
  1771             //key = newCharsLen << 24 | charsOffset;
  1772             UColToken key;
  1773             key.source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;
  1774             key.rulesToParseHdl = &(src->source);
  1776             /*  4 Lookup each source in the CharsToToken map, and find a sourceToken */
  1777             sourceToken = (UColToken *)uhash_get(src->tailored, &key);
  1779             if(src->parsedToken.strength != UCOL_TOK_RESET) {
  1780                 if(lastToken == NULL) { /* this means that rules haven't started properly */
  1781                     *status = U_INVALID_FORMAT_ERROR;
  1782                     syntaxError(src->source,0,(int32_t)(src->end-src->source),parseError);
  1783                     DBG_FORMAT_ERROR
  1784                     return 0;
  1786                 /*  6 Otherwise (when relation != reset) */
  1787                 if(sourceToken == NULL) {
  1788                     /* If sourceToken is null, create new one, */
  1789                     sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken));
  1790                     /* test for NULL */
  1791                     if (sourceToken == NULL) {
  1792                         *status = U_MEMORY_ALLOCATION_ERROR;
  1793                         return 0;
  1795                     sourceToken->rulesToParseHdl = &(src->source);
  1796                     sourceToken->source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;
  1798                     sourceToken->debugSource = *(src->source + src->parsedToken.charsOffset);
  1800                     sourceToken->prefix = src->parsedToken.prefixLen << 24 | src->parsedToken.prefixOffset;
  1801                     sourceToken->debugPrefix = *(src->source + src->parsedToken.prefixOffset);
  1803                     sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should also handle reverse */
  1804                     sourceToken->next = NULL;
  1805                     sourceToken->previous = NULL;
  1806                     sourceToken->noOfCEs = 0;
  1807                     sourceToken->noOfExpCEs = 0;
  1808                     // keep the flags around so that we know about before
  1809                     sourceToken->flags = src->parsedToken.flags;
  1810                     uhash_put(src->tailored, sourceToken, sourceToken, status);
  1811                     if(U_FAILURE(*status)) {
  1812                         return 0;
  1814                 } else {
  1815                     /* we could have fished out a reset here */
  1816                     if(sourceToken->strength != UCOL_TOK_RESET && lastToken != sourceToken) {
  1817                         /* otherwise remove sourceToken from where it was. */
  1818                         if(sourceToken->next != NULL) {
  1819                             if(sourceToken->next->strength > sourceToken->strength) {
  1820                                 sourceToken->next->strength = sourceToken->strength;
  1822                             sourceToken->next->previous = sourceToken->previous;
  1823                         } else {
  1824                             sourceToken->listHeader->last = sourceToken->previous;
  1827                         if(sourceToken->previous != NULL) {
  1828                             sourceToken->previous->next = sourceToken->next;
  1829                         } else {
  1830                             sourceToken->listHeader->first = sourceToken->next;
  1832                         sourceToken->next = NULL;
  1833                         sourceToken->previous = NULL;
  1837                 sourceToken->strength = src->parsedToken.strength;
  1838                 sourceToken->listHeader = lastToken->listHeader;
  1840                 /*
  1841                 1.  Find the strongest strength in each list, and set strongestP and strongestN
  1842                 accordingly in the headers.
  1843                 */
  1844                 if(lastStrength == UCOL_TOK_RESET
  1845                     || sourceToken->listHeader->first == 0) {
  1846                         /* If LAST is a reset
  1847                         insert sourceToken in the list. */
  1848                         if(sourceToken->listHeader->first == 0) {
  1849                             sourceToken->listHeader->first = sourceToken;
  1850                             sourceToken->listHeader->last = sourceToken;
  1851                         } else { /* we need to find a place for us */
  1852                             /* and we'll get in front of the same strength */
  1853                             if(sourceToken->listHeader->first->strength <= sourceToken->strength) {
  1854                                 sourceToken->next = sourceToken->listHeader->first;
  1855                                 sourceToken->next->previous = sourceToken;
  1856                                 sourceToken->listHeader->first = sourceToken;
  1857                                 sourceToken->previous = NULL;
  1858                             } else {
  1859                                 lastToken = sourceToken->listHeader->first;
  1860                                 while(lastToken->next != NULL && lastToken->next->strength > sourceToken->strength) {
  1861                                     lastToken = lastToken->next;
  1863                                 if(lastToken->next != NULL) {
  1864                                     lastToken->next->previous = sourceToken;
  1865                                 } else {
  1866                                     sourceToken->listHeader->last = sourceToken;
  1868                                 sourceToken->previous = lastToken;
  1869                                 sourceToken->next = lastToken->next;
  1870                                 lastToken->next = sourceToken;
  1873                     } else {
  1874                         /* Otherwise (when LAST is not a reset)
  1875                         if polarity (LAST) == polarity(relation), insert sourceToken after LAST,
  1876                         otherwise insert before.
  1877                         when inserting after or before, search to the next position with the same
  1878                         strength in that direction. (This is called postpone insertion).         */
  1879                         if(sourceToken != lastToken) {
  1880                             if(lastToken->polarity == sourceToken->polarity) {
  1881                                 while(lastToken->next != NULL && lastToken->next->strength > sourceToken->strength) {
  1882                                     lastToken = lastToken->next;
  1884                                 sourceToken->previous = lastToken;
  1885                                 if(lastToken->next != NULL) {
  1886                                     lastToken->next->previous = sourceToken;
  1887                                 } else {
  1888                                     sourceToken->listHeader->last = sourceToken;
  1891                                 sourceToken->next = lastToken->next;
  1892                                 lastToken->next = sourceToken;
  1893                             } else {
  1894                                 while(lastToken->previous != NULL && lastToken->previous->strength > sourceToken->strength) {
  1895                                     lastToken = lastToken->previous;
  1897                                 sourceToken->next = lastToken;
  1898                                 if(lastToken->previous != NULL) {
  1899                                     lastToken->previous->next = sourceToken;
  1900                                 } else {
  1901                                     sourceToken->listHeader->first = sourceToken;
  1903                                 sourceToken->previous = lastToken->previous;
  1904                                 lastToken->previous = sourceToken;
  1906                         } else { /* repeated one thing twice in rules, stay with the stronger strength */
  1907                             if(lastStrength < sourceToken->strength) {
  1908                                 sourceToken->strength = lastStrength;
  1913                     /* if the token was a variable top, we're gonna put it in */
  1914                     if(variableTop == TRUE && src->varTop == NULL) {
  1915                         variableTop = FALSE;
  1916                         src->varTop = sourceToken;
  1919                     // Treat the expansions.
  1920                     // There are two types of expansions: explicit (x / y) and reset based propagating expansions
  1921                     // (&abc * d * e <=> &ab * d / c * e / c)
  1922                     // if both of them are in effect for a token, they are combined.
  1924                     sourceToken->expansion = src->parsedToken.extensionLen << 24 | src->parsedToken.extensionOffset;
  1926                     if(expandNext != 0) {
  1927                         if(sourceToken->strength == UCOL_PRIMARY) { /* primary strength kills off the implicit expansion */
  1928                             expandNext = 0;
  1929                         } else if(sourceToken->expansion == 0) { /* if there is no expansion, implicit is just added to the token */
  1930                             sourceToken->expansion = expandNext;
  1931                         } else { /* there is both explicit and implicit expansion. We need to make a combination */
  1932                             uprv_memcpy(src->extraCurrent, src->source + (expandNext & 0xFFFFFF), (expandNext >> 24)*sizeof(UChar));
  1933                             uprv_memcpy(src->extraCurrent+(expandNext >> 24), src->source + src->parsedToken.extensionOffset, src->parsedToken.extensionLen*sizeof(UChar));
  1934                             sourceToken->expansion = (uint32_t)(((expandNext >> 24) + src->parsedToken.extensionLen)<<24 | (uint32_t)(src->extraCurrent - src->source));
  1935                             src->extraCurrent += (expandNext >> 24) + src->parsedToken.extensionLen;
  1939                     // This is just for debugging purposes
  1940                     if(sourceToken->expansion != 0) {
  1941                         sourceToken->debugExpansion = *(src->source + src->parsedToken.extensionOffset);
  1942                     } else {
  1943                         sourceToken->debugExpansion = 0;
  1945                     // if the previous token was a reset before, the strength of this
  1946                     // token must match the strength of before. Otherwise we have an
  1947                     // undefined situation.
  1948                     // In other words, we currently have a cludge which we use to
  1949                     // represent &a >> x. This is written as &[before 2]a << x.
  1950                     if((lastToken->flags & UCOL_TOK_BEFORE) != 0) {
  1951                         uint8_t beforeStrength = (lastToken->flags & UCOL_TOK_BEFORE) - 1;
  1952                         if(beforeStrength != sourceToken->strength) {
  1953                             *status = U_INVALID_FORMAT_ERROR;
  1954                             syntaxError(src->source,0,(int32_t)(src->end-src->source),parseError);
  1955                             DBG_FORMAT_ERROR
  1956                             return 0;
  1959             } else {
  1960                 if(lastToken != NULL && lastStrength == UCOL_TOK_RESET) {
  1961                     /* if the previous token was also a reset, */
  1962                     /*this means that we have two consecutive resets */
  1963                     /* and we want to remove the previous one if empty*/
  1964                     if(src->resultLen > 0 && ListList[src->resultLen-1].first == NULL) {
  1965                         src->resultLen--;
  1969                 if(sourceToken == NULL) { /* this is a reset, but it might still be somewhere in the tailoring, in shorter form */
  1970                     uint32_t searchCharsLen = src->parsedToken.charsLen;
  1971                     while(searchCharsLen > 1 && sourceToken == NULL) {
  1972                         searchCharsLen--;
  1973                         //key = searchCharsLen << 24 | charsOffset;
  1974                         UColToken key;
  1975                         key.source = searchCharsLen << 24 | src->parsedToken.charsOffset;
  1976                         key.rulesToParseHdl = &(src->source);
  1977                         sourceToken = (UColToken *)uhash_get(src->tailored, &key);
  1979                     if(sourceToken != NULL) {
  1980                         expandNext = (src->parsedToken.charsLen - searchCharsLen) << 24 | (src->parsedToken.charsOffset + searchCharsLen);
  1984                 if((specs & UCOL_TOK_BEFORE) != 0) { /* we're doing before */
  1985                     if(top == FALSE) { /* there is no indirection */
  1986                         uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1;
  1987                         if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) {
  1988                             /* this is a before that is already ordered in the UCA - so we need to get the previous with good strength */
  1989                             while(sourceToken->strength > strength && sourceToken->previous != NULL) {
  1990                                 sourceToken = sourceToken->previous;
  1992                             /* here, either we hit the strength or NULL */
  1993                             if(sourceToken->strength == strength) {
  1994                                 if(sourceToken->previous != NULL) {
  1995                                     sourceToken = sourceToken->previous;
  1996                                 } else { /* start of list */
  1997                                     sourceToken = sourceToken->listHeader->reset;
  1999                             } else { /* we hit NULL */
  2000                                 /* we should be doing the else part */
  2001                                 sourceToken = sourceToken->listHeader->reset;
  2002                                 sourceToken = getVirginBefore(src, sourceToken, strength, parseError, status);
  2004                         } else {
  2005                             sourceToken = getVirginBefore(src, sourceToken, strength, parseError, status);
  2007                     } else { /* this is both before and indirection */
  2008                         top = FALSE;
  2009                         ListList[src->resultLen].previousCE = 0;
  2010                         ListList[src->resultLen].previousContCE = 0;
  2011                         ListList[src->resultLen].indirect = TRUE;
  2012                         /* we need to do slightly more work. we need to get the baseCE using the */
  2013                         /* inverse UCA & getPrevious. The next bound is not set, and will be decided */
  2014                         /* in ucol_bld */
  2015                         uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1;
  2016                         uint32_t baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE;
  2017                         uint32_t baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE;//&0xFFFFFF3F;
  2018                         uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND;
  2020                         UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
  2021                         if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && 
  2022                            (baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
  2023                             uint32_t primary = (baseCE & UCOL_PRIMARYMASK) | ((baseContCE & UCOL_PRIMARYMASK) >> 16);
  2024                             uint32_t raw = uprv_uca_getRawFromImplicit(primary);
  2025                             uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw-1);
  2026                             CE = (primaryCE & UCOL_PRIMARYMASK) | 0x0505;
  2027                             SecondCE = ((primaryCE << 16) & UCOL_PRIMARYMASK) | UCOL_CONTINUATION_MARKER;
  2028                         } else {
  2029                             /*int32_t invPos = ucol_inv_getPrevCE(baseCE, baseContCE, &CE, &SecondCE, strength);*/
  2030                             ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &SecondCE, strength);
  2033                         ListList[src->resultLen].baseCE = CE;
  2034                         ListList[src->resultLen].baseContCE = SecondCE;
  2035                         ListList[src->resultLen].nextCE = 0;
  2036                         ListList[src->resultLen].nextContCE = 0;
  2038                         sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
  2043                 /*  5 If the relation is a reset:
  2044                 If sourceToken is null
  2045                 Create new list, create new sourceToken, make the baseCE from source, put
  2046                 the sourceToken in ListHeader of the new list */
  2047                 if(sourceToken == NULL) {
  2048                     /*
  2049                     3 Consider each item: relation, source, and expansion: e.g. ...< x / y ...
  2050                     First convert all expansions into normal form. Examples:
  2051                     If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c *
  2052                     d * ... into &x * c/y * d * ...
  2053                     Note: reset values can never have expansions, although they can cause the
  2054                     very next item to have one. They may be contractions, if they are found
  2055                     earlier in the list.
  2056                     */
  2057                     if(top == FALSE) {
  2058                         collIterate s;
  2059                         uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND;
  2061                         uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset, src->parsedToken.charsLen, &s, status);
  2063                         CE = ucol_getNextCE(src->UCA, &s, status);
  2064                         const UChar *expand = s.pos;
  2065                         SecondCE = ucol_getNextCE(src->UCA, &s, status);
  2067                         ListList[src->resultLen].baseCE = CE & 0xFFFFFF3F;
  2068                         if(isContinuation(SecondCE)) {
  2069                             ListList[src->resultLen].baseContCE = SecondCE;
  2070                         } else {
  2071                             ListList[src->resultLen].baseContCE = 0;
  2073                         ListList[src->resultLen].nextCE = 0;
  2074                         ListList[src->resultLen].nextContCE = 0;
  2075                         ListList[src->resultLen].previousCE = 0;
  2076                         ListList[src->resultLen].previousContCE = 0;
  2077                         ListList[src->resultLen].indirect = FALSE;
  2078                         sourceToken = ucol_tok_initAReset(src, expand, &expandNext, parseError, status);
  2079                     } else { /* top == TRUE */
  2080                         /* just use the supplied values */
  2081                         top = FALSE;
  2082                         ListList[src->resultLen].previousCE = 0;
  2083                         ListList[src->resultLen].previousContCE = 0;
  2084                         ListList[src->resultLen].indirect = TRUE;
  2085                         ListList[src->resultLen].baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE;
  2086                         ListList[src->resultLen].baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE;
  2087                         ListList[src->resultLen].nextCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitCE;
  2088                         ListList[src->resultLen].nextContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitContCE;
  2090                         sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
  2093                 } else { /* reset to something already in rules */
  2094                     top = FALSE;
  2097             /*  7 After all this, set LAST to point to sourceToken, and goto step 3. */
  2098             lastToken = sourceToken;
  2099         } else {
  2100             if(U_FAILURE(*status)) {
  2101                 return 0;
  2105 #ifdef DEBUG_FOR_CODE_POINTS
  2106     fclose(dfcp_fp);
  2107 #endif
  2110     if(src->resultLen > 0 && ListList[src->resultLen-1].first == NULL) {
  2111         src->resultLen--;
  2113     return src->resultLen;
  2116 const UChar* ucol_tok_getRulesFromBundle(
  2117     void* /*context*/,
  2118     const char* locale,
  2119     const char* type,
  2120     int32_t* pLength,
  2121     UErrorCode* status)
  2123     const UChar* rules = NULL;
  2124     UResourceBundle* bundle;
  2125     UResourceBundle* collations;
  2126     UResourceBundle* collation;
  2128     *pLength = 0;
  2130     bundle = ures_open(U_ICUDATA_COLL, locale, status);
  2131     if(U_SUCCESS(*status)){
  2132         collations = ures_getByKey(bundle, "collations", NULL, status);
  2133         if(U_SUCCESS(*status)){
  2134             collation = ures_getByKey(collations, type, NULL, status);
  2135             if(U_SUCCESS(*status)){
  2136                 rules = ures_getStringByKey(collation, "Sequence", pLength, status);
  2137                 if(U_FAILURE(*status)){
  2138                     *pLength = 0;
  2139                     rules = NULL;
  2141                 ures_close(collation);
  2143             ures_close(collations);
  2147     ures_close(bundle);
  2149     return rules;
  2152 void ucol_tok_initTokenList(
  2153     UColTokenParser *src,
  2154     const UChar *rules,
  2155     uint32_t rulesLength,
  2156     const UCollator *UCA,
  2157     GetCollationRulesFunction importFunc,
  2158     void* context, 
  2159     UErrorCode *status) {
  2160     U_NAMESPACE_USE
  2162     uint32_t nSize = 0;
  2163     uint32_t estimatedSize = (2*rulesLength+UCOL_TOK_EXTRA_RULE_SPACE_SIZE);
  2165     bool needToDeallocRules = false;
  2167     if(U_FAILURE(*status)) {
  2168         return;
  2171     // set everything to zero, so that we can clean up gracefully
  2172     uprv_memset(src, 0, sizeof(UColTokenParser));
  2174     // first we need to find options that don't like to be normalized,
  2175     // like copy and remove...
  2176     //const UChar *openBrace = rules;
  2177     int32_t optionNumber = -1;
  2178     const UChar *setStart = NULL;
  2179     uint32_t i = 0;
  2180     while(i < rulesLength) {
  2181         if(rules[i] == 0x005B) {    // '[': start of an option
  2182             /* Gets the following:
  2183                optionNumber: The index of the option.
  2184                setStart: The pointer at which the option arguments start.
  2185              */
  2186             optionNumber = ucol_uprv_tok_readOption(rules+i+1, rules+rulesLength, &setStart);
  2188             if(optionNumber == OPTION_OPTIMIZE) { /* copy - parts of UCA to tailoring */
  2189                 // [optimize]
  2190                 USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rules+rulesLength, status);
  2191                 if(U_SUCCESS(*status)) {
  2192                     if(src->copySet == NULL) {
  2193                         src->copySet = newSet;
  2194                     } else {
  2195                         uset_addAll(src->copySet, newSet);
  2196                         uset_close(newSet);
  2198                 } else {
  2199                     return;
  2201             } else if(optionNumber == OPTION_SUPPRESS_CONTRACTIONS) {
  2202                 USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rules+rulesLength, status);
  2203                 if(U_SUCCESS(*status)) {
  2204                     if(src->removeSet == NULL) {
  2205                         src->removeSet = newSet;
  2206                     } else {
  2207                         uset_addAll(src->removeSet, newSet);
  2208                         uset_close(newSet);
  2210                 } else {
  2211                     return;
  2213             } else if(optionNumber == OPTION_IMPORT){
  2214                 // [import <collation-name>]
  2216                 // Find the address of the closing ].
  2217                 UChar* import_end = u_strchr(setStart, 0x005D);
  2218                 int32_t optionEndOffset = (int32_t)(import_end + 1 - rules);
  2219                 // Ignore trailing whitespace.
  2220                 while(PatternProps::isWhiteSpace(*(import_end-1))) {
  2221                     --import_end;
  2224                 int32_t optionLength = (int32_t)(import_end - setStart);
  2225                 char option[50];
  2226                 if(optionLength >= (int32_t)sizeof(option)) {
  2227                     *status = U_ILLEGAL_ARGUMENT_ERROR;
  2228                     return;
  2230                 u_UCharsToChars(setStart, option, optionLength);
  2231                 option[optionLength] = 0;
  2233                 *status = U_ZERO_ERROR;
  2234                 char locale[50];
  2235                 int32_t templ;
  2236                 uloc_forLanguageTag(option, locale, (int32_t)sizeof(locale), &templ, status);
  2237                 if(U_FAILURE(*status)) {
  2238                     *status = U_ILLEGAL_ARGUMENT_ERROR;
  2239                     return;
  2242                 char type[50];
  2243                 if (uloc_getKeywordValue(locale, "collation", type, (int32_t)sizeof(type), status) <= 0 ||
  2244                     U_FAILURE(*status)
  2245                 ) {
  2246                     *status = U_ZERO_ERROR;
  2247                     uprv_strcpy(type, "standard");
  2250                 // TODO: Use public functions when available, see ticket #8134.
  2251                 char *keywords = (char *)locale_getKeywordsStart(locale);
  2252                 if(keywords != NULL) {
  2253                     *keywords = 0;
  2256                 int32_t importRulesLength = 0;
  2257                 const UChar* importRules = importFunc(context, locale, type, &importRulesLength, status);
  2259 #ifdef DEBUG_FOR_COLL_RULES
  2260                 std::string s;
  2261                 UnicodeString(importRules).toUTF8String(s);
  2262                 std::cout << "Import rules = " << s << std::endl;
  2263 #endif
  2265                 // Add the length of the imported rules to length of the original rules,
  2266                 // and subtract the length of the import option.
  2267                 uint32_t newRulesLength = rulesLength + importRulesLength - (optionEndOffset - i);
  2269                 UChar* newRules = (UChar*)uprv_malloc(newRulesLength*sizeof(UChar));
  2271 #ifdef DEBUG_FOR_COLL_RULES
  2272                 std::string s1;
  2273                 UnicodeString(rules).toUTF8String(s1);
  2274                 std::cout << "Original rules = " << s1 << std::endl;
  2275 #endif
  2278                 // Copy the section of the original rules leading up to the import
  2279                 uprv_memcpy(newRules, rules, i*sizeof(UChar));
  2280                 // Copy the imported rules
  2281                 uprv_memcpy(newRules+i, importRules, importRulesLength*sizeof(UChar));
  2282                 // Copy the rest of the original rules (minus the import option itself)
  2283                 uprv_memcpy(newRules+i+importRulesLength,
  2284                             rules+optionEndOffset,
  2285                             (rulesLength-optionEndOffset)*sizeof(UChar));
  2287 #ifdef DEBUG_FOR_COLL_RULES
  2288                 std::string s2;
  2289                 UnicodeString(newRules).toUTF8String(s2);
  2290                 std::cout << "Resulting rules = " << s2 << std::endl;
  2291 #endif
  2293                 if(needToDeallocRules){
  2294                     // if needToDeallocRules is set, then we allocated rules, so it's safe to cast and free
  2295                     uprv_free((void*)rules);
  2297                 needToDeallocRules = true;
  2298                 rules = newRules;
  2299                 rulesLength = newRulesLength;
  2301                 estimatedSize += importRulesLength*2;
  2303                 // First character of the new rules needs to be processed
  2304                 i--;
  2307         //openBrace++;
  2308         i++;
  2311     src->source = (UChar *)uprv_malloc(estimatedSize*sizeof(UChar));
  2312     /* test for NULL */
  2313     if (src->source == NULL) {
  2314         *status = U_MEMORY_ALLOCATION_ERROR;
  2315         return;
  2317     uprv_memset(src->source, 0, estimatedSize*sizeof(UChar));
  2318     nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, estimatedSize, status);
  2319     if(nSize > estimatedSize || *status == U_BUFFER_OVERFLOW_ERROR) {
  2320         *status = U_ZERO_ERROR;
  2321         src->source = (UChar *)uprv_realloc(src->source, (nSize+UCOL_TOK_EXTRA_RULE_SPACE_SIZE)*sizeof(UChar));
  2322         /* test for NULL */
  2323         if (src->source == NULL) {
  2324             *status = U_MEMORY_ALLOCATION_ERROR;
  2325             return;
  2327         nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, nSize+UCOL_TOK_EXTRA_RULE_SPACE_SIZE, status);
  2329     if(needToDeallocRules){
  2330         // if needToDeallocRules is set, then we allocated rules, so it's safe to cast and free
  2331         uprv_free((void*)rules);
  2335     src->current = src->source;
  2336     src->end = src->source+nSize;
  2337     src->sourceCurrent = src->source;
  2338     src->extraCurrent = src->end+1; // Preserve terminating zero in the rule string so that option scanning works correctly
  2339     src->extraEnd = src->source+estimatedSize; //src->end+UCOL_TOK_EXTRA_RULE_SPACE_SIZE;
  2340     src->varTop = NULL;
  2341     src->UCA = UCA;
  2342     src->invUCA = ucol_initInverseUCA(status);
  2343     src->parsedToken.charsLen = 0;
  2344     src->parsedToken.charsOffset = 0;
  2345     src->parsedToken.extensionLen = 0;
  2346     src->parsedToken.extensionOffset = 0;
  2347     src->parsedToken.prefixLen = 0;
  2348     src->parsedToken.prefixOffset = 0;
  2349     src->parsedToken.flags = 0;
  2350     src->parsedToken.strength = UCOL_TOK_UNSET;
  2351     src->buildCCTabFlag = FALSE;
  2352     src->isStarred = FALSE;
  2353     src->inRange = FALSE;
  2354     src->lastRangeCp = 0;
  2355     src->previousCp = 0;
  2357     if(U_FAILURE(*status)) {
  2358         return;
  2360     src->tailored = uhash_open(uhash_hashTokens, uhash_compareTokens, NULL, status);
  2361     if(U_FAILURE(*status)) {
  2362         return;
  2364     uhash_setValueDeleter(src->tailored, uprv_free);
  2366     src->opts = (UColOptionSet *)uprv_malloc(sizeof(UColOptionSet));
  2367     /* test for NULL */
  2368     if (src->opts == NULL) {
  2369         *status = U_MEMORY_ALLOCATION_ERROR;
  2370         return;
  2373     uprv_memcpy(src->opts, UCA->options, sizeof(UColOptionSet));
  2375     src->lh = 0;
  2376     src->listCapacity = 1024;
  2377     src->lh = (UColTokListHeader *)uprv_malloc(src->listCapacity*sizeof(UColTokListHeader));
  2378     //Test for NULL
  2379     if (src->lh == NULL) {
  2380         *status = U_MEMORY_ALLOCATION_ERROR;
  2381         return;
  2383     uprv_memset(src->lh, 0, src->listCapacity*sizeof(UColTokListHeader));
  2384     src->resultLen = 0;
  2386     UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
  2388     // UCOL_RESET_TOP_VALUE
  2389     setIndirectBoundaries(0, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IMPLICIT);
  2390     // UCOL_FIRST_PRIMARY_IGNORABLE
  2391     setIndirectBoundaries(1, consts->UCA_FIRST_PRIMARY_IGNORABLE, 0);
  2392     // UCOL_LAST_PRIMARY_IGNORABLE
  2393     setIndirectBoundaries(2, consts->UCA_LAST_PRIMARY_IGNORABLE, 0);
  2394     // UCOL_FIRST_SECONDARY_IGNORABLE
  2395     setIndirectBoundaries(3, consts->UCA_FIRST_SECONDARY_IGNORABLE, 0);
  2396     // UCOL_LAST_SECONDARY_IGNORABLE
  2397     setIndirectBoundaries(4, consts->UCA_LAST_SECONDARY_IGNORABLE, 0);
  2398     // UCOL_FIRST_TERTIARY_IGNORABLE
  2399     setIndirectBoundaries(5, consts->UCA_FIRST_TERTIARY_IGNORABLE, 0);
  2400     // UCOL_LAST_TERTIARY_IGNORABLE
  2401     setIndirectBoundaries(6, consts->UCA_LAST_TERTIARY_IGNORABLE, 0);
  2402     // UCOL_FIRST_VARIABLE
  2403     setIndirectBoundaries(7, consts->UCA_FIRST_VARIABLE, 0);
  2404     // UCOL_LAST_VARIABLE
  2405     setIndirectBoundaries(8, consts->UCA_LAST_VARIABLE, 0);
  2406     // UCOL_FIRST_NON_VARIABLE
  2407     setIndirectBoundaries(9, consts->UCA_FIRST_NON_VARIABLE, 0);
  2408     // UCOL_LAST_NON_VARIABLE
  2409     setIndirectBoundaries(10, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IMPLICIT);
  2410     // UCOL_FIRST_IMPLICIT
  2411     setIndirectBoundaries(11, consts->UCA_FIRST_IMPLICIT, 0);
  2412     // UCOL_LAST_IMPLICIT
  2413     setIndirectBoundaries(12, consts->UCA_LAST_IMPLICIT, consts->UCA_FIRST_TRAILING);
  2414     // UCOL_FIRST_TRAILING
  2415     setIndirectBoundaries(13, consts->UCA_FIRST_TRAILING, 0);
  2416     // UCOL_LAST_TRAILING
  2417     setIndirectBoundaries(14, consts->UCA_LAST_TRAILING, 0);
  2418     ucolIndirectBoundaries[14].limitCE = (consts->UCA_PRIMARY_SPECIAL_MIN<<24);
  2422 void ucol_tok_closeTokenList(UColTokenParser *src) {
  2423     if(src->copySet != NULL) {
  2424         uset_close(src->copySet);
  2426     if(src->removeSet != NULL) {
  2427         uset_close(src->removeSet);
  2429     if(src->tailored != NULL) {
  2430         uhash_close(src->tailored);
  2432     if(src->lh != NULL) {
  2433         uprv_free(src->lh);
  2435     if(src->source != NULL) {
  2436         uprv_free(src->source);
  2438     if(src->opts != NULL) {
  2439         uprv_free(src->opts);
  2441     if (src->reorderCodes != NULL) {
  2442         uprv_free(src->reorderCodes);
  2446 #endif /* #if !UCONFIG_NO_COLLATION */

mercurial