1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/i18n/ucol_tok.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,2446 @@ 1.4 +/* 1.5 +******************************************************************************* 1.6 +* 1.7 +* Copyright (C) 2001-2012, International Business Machines 1.8 +* Corporation and others. All Rights Reserved. 1.9 +* 1.10 +******************************************************************************* 1.11 +* file name: ucol_tok.cpp 1.12 +* encoding: US-ASCII 1.13 +* tab size: 8 (not used) 1.14 +* indentation:4 1.15 +* 1.16 +* created 02/22/2001 1.17 +* created by: Vladimir Weinstein 1.18 +* 1.19 +* This module reads a tailoring rule string and produces a list of 1.20 +* tokens that will be turned into collation elements 1.21 +* 1.22 +*/ 1.23 + 1.24 +#include "unicode/utypes.h" 1.25 + 1.26 +#if !UCONFIG_NO_COLLATION 1.27 + 1.28 +#include "unicode/uscript.h" 1.29 +#include "unicode/ustring.h" 1.30 +#include "unicode/uchar.h" 1.31 +#include "unicode/uniset.h" 1.32 + 1.33 +#include "cmemory.h" 1.34 +#include "cstring.h" 1.35 +#include "patternprops.h" 1.36 +#include "ucol_bld.h" 1.37 +#include "ucol_tok.h" 1.38 +#include "ulocimp.h" 1.39 +#include "uresimp.h" 1.40 + 1.41 +// Define this only for debugging. 1.42 +// #define DEBUG_FOR_COLL_RULES 1 1.43 + 1.44 +#ifdef DEBUG_FOR_COLL_RULES 1.45 +#include <iostream> 1.46 +#endif 1.47 + 1.48 +U_NAMESPACE_USE 1.49 + 1.50 +U_CDECL_BEGIN 1.51 +static int32_t U_CALLCONV 1.52 +uhash_hashTokens(const UHashTok k) 1.53 +{ 1.54 + int32_t hash = 0; 1.55 + //uint32_t key = (uint32_t)k.integer; 1.56 + UColToken *key = (UColToken *)k.pointer; 1.57 + if (key != 0) { 1.58 + int32_t len = (key->source & 0xFF000000)>>24; 1.59 + int32_t inc = ((len - 32) / 32) + 1; 1.60 + 1.61 + const UChar *p = (key->source & 0x00FFFFFF) + *(key->rulesToParseHdl); 1.62 + const UChar *limit = p + len; 1.63 + 1.64 + while (p<limit) { 1.65 + hash = (hash * 37) + *p; 1.66 + p += inc; 1.67 + } 1.68 + } 1.69 + return hash; 1.70 +} 1.71 + 1.72 +static UBool U_CALLCONV 1.73 +uhash_compareTokens(const UHashTok key1, const UHashTok key2) 1.74 +{ 1.75 + //uint32_t p1 = (uint32_t) key1.integer; 1.76 + //uint32_t p2 = (uint32_t) key2.integer; 1.77 + UColToken *p1 = (UColToken *)key1.pointer; 1.78 + UColToken *p2 = (UColToken *)key2.pointer; 1.79 + const UChar *s1 = (p1->source & 0x00FFFFFF) + *(p1->rulesToParseHdl); 1.80 + const UChar *s2 = (p2->source & 0x00FFFFFF) + *(p2->rulesToParseHdl); 1.81 + uint32_t s1L = ((p1->source & 0xFF000000) >> 24); 1.82 + uint32_t s2L = ((p2->source & 0xFF000000) >> 24); 1.83 + const UChar *end = s1+s1L-1; 1.84 + 1.85 + if (p1 == p2) { 1.86 + return TRUE; 1.87 + } 1.88 + if (p1->source == 0 || p2->source == 0) { 1.89 + return FALSE; 1.90 + } 1.91 + if(s1L != s2L) { 1.92 + return FALSE; 1.93 + } 1.94 + if(p1->source == p2->source) { 1.95 + return TRUE; 1.96 + } 1.97 + while((s1 < end) && *s1 == *s2) { 1.98 + ++s1; 1.99 + ++s2; 1.100 + } 1.101 + if(*s1 == *s2) { 1.102 + return TRUE; 1.103 + } else { 1.104 + return FALSE; 1.105 + } 1.106 +} 1.107 +U_CDECL_END 1.108 + 1.109 +/* 1.110 + * Debug messages used to pinpoint where a format error occurred. 1.111 + * A better way is to include context-sensitive information in syntaxError() function. 1.112 + * 1.113 + * To turn this debugging on, either uncomment the following line, or define use -DDEBUG_FOR_FORMAT_ERROR 1.114 + * in the compile line. 1.115 + */ 1.116 +/* #define DEBUG_FOR_FORMAT_ERROR 1 */ 1.117 + 1.118 +#ifdef DEBUG_FOR_FORMAT_ERROR 1.119 +#define DBG_FORMAT_ERROR { printf("U_INVALID_FORMAT_ERROR at line %d", __LINE__);} 1.120 +#else 1.121 +#define DBG_FORMAT_ERROR 1.122 +#endif 1.123 + 1.124 + 1.125 +/* 1.126 + * Controls debug messages so that the output can be compared before and after a 1.127 + * big change. Prints the information of every code point that comes out of the 1.128 + * collation parser and its strength into a file. When a big change in format 1.129 + * happens, the files before and after the change should be identical. 1.130 + * 1.131 + * To turn this debugging on, either uncomment the following line, or define use -DDEBUG_FOR_CODE_POINTS 1.132 + * in the compile line. 1.133 + */ 1.134 +// #define DEBUG_FOR_CODE_POINTS 1 1.135 + 1.136 +#ifdef DEBUG_FOR_CODE_POINTS 1.137 + FILE* dfcp_fp = NULL; 1.138 +#endif 1.139 + 1.140 + 1.141 +typedef struct { 1.142 + uint32_t startCE; 1.143 + uint32_t startContCE; 1.144 + uint32_t limitCE; 1.145 + uint32_t limitContCE; 1.146 +} indirectBoundaries; 1.147 + 1.148 +/* these values are used for finding CE values for indirect positioning. */ 1.149 +/* Indirect positioning is a mechanism for allowing resets on symbolic */ 1.150 +/* values. It only works for resets and you cannot tailor indirect names */ 1.151 +/* An indirect name can define either an anchor point or a range. An */ 1.152 +/* anchor point behaves in exactly the same way as a code point in reset */ 1.153 +/* would, except that it cannot be tailored. A range (we currently only */ 1.154 +/* know for the [top] range will explicitly set the upper bound for */ 1.155 +/* generated CEs, thus allowing for better control over how many CEs can */ 1.156 +/* be squeezed between in the range without performance penalty. */ 1.157 +/* In that respect, we use [top] for tailoring of locales that use CJK */ 1.158 +/* characters. Other indirect values are currently a pure convenience, */ 1.159 +/* they can be used to assure that the CEs will be always positioned in */ 1.160 +/* the same place relative to a point with known properties (e.g. first */ 1.161 +/* primary ignorable). */ 1.162 +static indirectBoundaries ucolIndirectBoundaries[15]; 1.163 +/* 1.164 +static indirectBoundaries ucolIndirectBoundaries[11] = { 1.165 +{ UCOL_RESET_TOP_VALUE, 0, 1.166 +UCOL_NEXT_TOP_VALUE, 0 }, 1.167 +{ UCOL_FIRST_PRIMARY_IGNORABLE, 0, 1.168 +0, 0 }, 1.169 +{ UCOL_LAST_PRIMARY_IGNORABLE, UCOL_LAST_PRIMARY_IGNORABLE_CONT, 1.170 +0, 0 }, 1.171 +{ UCOL_FIRST_SECONDARY_IGNORABLE, 0, 1.172 +0, 0 }, 1.173 +{ UCOL_LAST_SECONDARY_IGNORABLE, 0, 1.174 +0, 0 }, 1.175 +{ UCOL_FIRST_TERTIARY_IGNORABLE, 0, 1.176 +0, 0 }, 1.177 +{ UCOL_LAST_TERTIARY_IGNORABLE, 0, 1.178 +0, 0 }, 1.179 +{ UCOL_FIRST_VARIABLE, 0, 1.180 +0, 0 }, 1.181 +{ UCOL_LAST_VARIABLE, 0, 1.182 +0, 0 }, 1.183 +{ UCOL_FIRST_NON_VARIABLE, 0, 1.184 +0, 0 }, 1.185 +{ UCOL_LAST_NON_VARIABLE, 0, 1.186 +0, 0 }, 1.187 +}; 1.188 +*/ 1.189 + 1.190 +static void setIndirectBoundaries(uint32_t indexR, uint32_t *start, uint32_t *end) { 1.191 + 1.192 + // Set values for the top - TODO: once we have values for all the indirects, we are going 1.193 + // to initalize here. 1.194 + ucolIndirectBoundaries[indexR].startCE = start[0]; 1.195 + ucolIndirectBoundaries[indexR].startContCE = start[1]; 1.196 + if(end) { 1.197 + ucolIndirectBoundaries[indexR].limitCE = end[0]; 1.198 + ucolIndirectBoundaries[indexR].limitContCE = end[1]; 1.199 + } else { 1.200 + ucolIndirectBoundaries[indexR].limitCE = 0; 1.201 + ucolIndirectBoundaries[indexR].limitContCE = 0; 1.202 + } 1.203 +} 1.204 + 1.205 + 1.206 +static inline 1.207 +void syntaxError(const UChar* rules, 1.208 + int32_t pos, 1.209 + int32_t rulesLen, 1.210 + UParseError* parseError) 1.211 +{ 1.212 + parseError->offset = pos; 1.213 + parseError->line = 0 ; /* we are not using line numbers */ 1.214 + 1.215 + // for pre-context 1.216 + int32_t start = (pos < U_PARSE_CONTEXT_LEN)? 0 : (pos - (U_PARSE_CONTEXT_LEN-1)); 1.217 + int32_t stop = pos; 1.218 + 1.219 + u_memcpy(parseError->preContext,rules+start,stop-start); 1.220 + //null terminate the buffer 1.221 + parseError->preContext[stop-start] = 0; 1.222 + 1.223 + //for post-context 1.224 + start = pos+1; 1.225 + stop = ((pos+U_PARSE_CONTEXT_LEN)<= rulesLen )? (pos+(U_PARSE_CONTEXT_LEN-1)) : 1.226 + rulesLen; 1.227 + 1.228 + if(start < stop) { 1.229 + u_memcpy(parseError->postContext,rules+start,stop-start); 1.230 + //null terminate the buffer 1.231 + parseError->postContext[stop-start]= 0; 1.232 + } else { 1.233 + parseError->postContext[0] = 0; 1.234 + } 1.235 +} 1.236 + 1.237 +static 1.238 +void ucol_uprv_tok_setOptionInImage(UColOptionSet *opts, UColAttribute attrib, UColAttributeValue value) { 1.239 + switch(attrib) { 1.240 + case UCOL_HIRAGANA_QUATERNARY_MODE: 1.241 + opts->hiraganaQ = value; 1.242 + break; 1.243 + case UCOL_FRENCH_COLLATION: 1.244 + opts->frenchCollation = value; 1.245 + break; 1.246 + case UCOL_ALTERNATE_HANDLING: 1.247 + opts->alternateHandling = value; 1.248 + break; 1.249 + case UCOL_CASE_FIRST: 1.250 + opts->caseFirst = value; 1.251 + break; 1.252 + case UCOL_CASE_LEVEL: 1.253 + opts->caseLevel = value; 1.254 + break; 1.255 + case UCOL_NORMALIZATION_MODE: 1.256 + opts->normalizationMode = value; 1.257 + break; 1.258 + case UCOL_STRENGTH: 1.259 + opts->strength = value; 1.260 + break; 1.261 + case UCOL_NUMERIC_COLLATION: 1.262 + opts->numericCollation = value; 1.263 + break; 1.264 + case UCOL_ATTRIBUTE_COUNT: 1.265 + default: 1.266 + break; 1.267 + } 1.268 +} 1.269 + 1.270 +#define UTOK_OPTION_COUNT 22 1.271 + 1.272 +static UBool didInit = FALSE; 1.273 +/* we can be strict, or we can be lenient */ 1.274 +/* I'd surely be lenient with the option arguments */ 1.275 +/* maybe even with options */ 1.276 +U_STRING_DECL(suboption_00, "non-ignorable", 13); 1.277 +U_STRING_DECL(suboption_01, "shifted", 7); 1.278 + 1.279 +U_STRING_DECL(suboption_02, "lower", 5); 1.280 +U_STRING_DECL(suboption_03, "upper", 5); 1.281 +U_STRING_DECL(suboption_04, "off", 3); 1.282 +U_STRING_DECL(suboption_05, "on", 2); 1.283 +U_STRING_DECL(suboption_06, "1", 1); 1.284 +U_STRING_DECL(suboption_07, "2", 1); 1.285 +U_STRING_DECL(suboption_08, "3", 1); 1.286 +U_STRING_DECL(suboption_09, "4", 1); 1.287 +U_STRING_DECL(suboption_10, "I", 1); 1.288 + 1.289 +U_STRING_DECL(suboption_11, "primary", 7); 1.290 +U_STRING_DECL(suboption_12, "secondary", 9); 1.291 +U_STRING_DECL(suboption_13, "tertiary", 8); 1.292 +U_STRING_DECL(suboption_14, "variable", 8); 1.293 +U_STRING_DECL(suboption_15, "regular", 7); 1.294 +U_STRING_DECL(suboption_16, "implicit", 8); 1.295 +U_STRING_DECL(suboption_17, "trailing", 8); 1.296 + 1.297 + 1.298 +U_STRING_DECL(option_00, "undefined", 9); 1.299 +U_STRING_DECL(option_01, "rearrange", 9); 1.300 +U_STRING_DECL(option_02, "alternate", 9); 1.301 +U_STRING_DECL(option_03, "backwards", 9); 1.302 +U_STRING_DECL(option_04, "variable top", 12); 1.303 +U_STRING_DECL(option_05, "top", 3); 1.304 +U_STRING_DECL(option_06, "normalization", 13); 1.305 +U_STRING_DECL(option_07, "caseLevel", 9); 1.306 +U_STRING_DECL(option_08, "caseFirst", 9); 1.307 +U_STRING_DECL(option_09, "scriptOrder", 11); 1.308 +U_STRING_DECL(option_10, "charsetname", 11); 1.309 +U_STRING_DECL(option_11, "charset", 7); 1.310 +U_STRING_DECL(option_12, "before", 6); 1.311 +U_STRING_DECL(option_13, "hiraganaQ", 9); 1.312 +U_STRING_DECL(option_14, "strength", 8); 1.313 +U_STRING_DECL(option_15, "first", 5); 1.314 +U_STRING_DECL(option_16, "last", 4); 1.315 +U_STRING_DECL(option_17, "optimize", 8); 1.316 +U_STRING_DECL(option_18, "suppressContractions", 20); 1.317 +U_STRING_DECL(option_19, "numericOrdering", 15); 1.318 +U_STRING_DECL(option_20, "import", 6); 1.319 +U_STRING_DECL(option_21, "reorder", 7); 1.320 + 1.321 +/* 1.322 +[last variable] last variable value 1.323 +[last primary ignorable] largest CE for primary ignorable 1.324 +[last secondary ignorable] largest CE for secondary ignorable 1.325 +[last tertiary ignorable] largest CE for tertiary ignorable 1.326 +[top] guaranteed to be above all implicit CEs, for now and in the future (in 1.8) 1.327 +*/ 1.328 + 1.329 + 1.330 +static const ucolTokSuboption alternateSub[2] = { 1.331 + {suboption_00, 13, UCOL_NON_IGNORABLE}, 1.332 + {suboption_01, 7, UCOL_SHIFTED} 1.333 +}; 1.334 + 1.335 +static const ucolTokSuboption caseFirstSub[3] = { 1.336 + {suboption_02, 5, UCOL_LOWER_FIRST}, 1.337 + {suboption_03, 5, UCOL_UPPER_FIRST}, 1.338 + {suboption_04, 3, UCOL_OFF}, 1.339 +}; 1.340 + 1.341 +static const ucolTokSuboption onOffSub[2] = { 1.342 + {suboption_04, 3, UCOL_OFF}, 1.343 + {suboption_05, 2, UCOL_ON} 1.344 +}; 1.345 + 1.346 +static const ucolTokSuboption frenchSub[1] = { 1.347 + {suboption_07, 1, UCOL_ON} 1.348 +}; 1.349 + 1.350 +static const ucolTokSuboption beforeSub[3] = { 1.351 + {suboption_06, 1, UCOL_PRIMARY}, 1.352 + {suboption_07, 1, UCOL_SECONDARY}, 1.353 + {suboption_08, 1, UCOL_TERTIARY} 1.354 +}; 1.355 + 1.356 +static const ucolTokSuboption strengthSub[5] = { 1.357 + {suboption_06, 1, UCOL_PRIMARY}, 1.358 + {suboption_07, 1, UCOL_SECONDARY}, 1.359 + {suboption_08, 1, UCOL_TERTIARY}, 1.360 + {suboption_09, 1, UCOL_QUATERNARY}, 1.361 + {suboption_10, 1, UCOL_IDENTICAL}, 1.362 +}; 1.363 + 1.364 +static const ucolTokSuboption firstLastSub[7] = { 1.365 + {suboption_11, 7, UCOL_PRIMARY}, 1.366 + {suboption_12, 9, UCOL_PRIMARY}, 1.367 + {suboption_13, 8, UCOL_PRIMARY}, 1.368 + {suboption_14, 8, UCOL_PRIMARY}, 1.369 + {suboption_15, 7, UCOL_PRIMARY}, 1.370 + {suboption_16, 8, UCOL_PRIMARY}, 1.371 + {suboption_17, 8, UCOL_PRIMARY}, 1.372 +}; 1.373 + 1.374 +enum OptionNumber { 1.375 + OPTION_ALTERNATE_HANDLING = 0, 1.376 + OPTION_FRENCH_COLLATION, 1.377 + OPTION_CASE_LEVEL, 1.378 + OPTION_CASE_FIRST, 1.379 + OPTION_NORMALIZATION_MODE, 1.380 + OPTION_HIRAGANA_QUATERNARY, 1.381 + OPTION_STRENGTH, 1.382 + OPTION_NUMERIC_COLLATION, 1.383 + OPTION_NORMAL_OPTIONS_LIMIT = OPTION_NUMERIC_COLLATION, 1.384 + OPTION_VARIABLE_TOP, 1.385 + OPTION_REARRANGE, 1.386 + OPTION_BEFORE, 1.387 + OPTION_TOP, 1.388 + OPTION_FIRST, 1.389 + OPTION_LAST, 1.390 + OPTION_OPTIMIZE, 1.391 + OPTION_SUPPRESS_CONTRACTIONS, 1.392 + OPTION_UNDEFINED, 1.393 + OPTION_SCRIPT_ORDER, 1.394 + OPTION_CHARSET_NAME, 1.395 + OPTION_CHARSET, 1.396 + OPTION_IMPORT, 1.397 + OPTION_SCRIPTREORDER 1.398 +} ; 1.399 + 1.400 +static const ucolTokOption rulesOptions[UTOK_OPTION_COUNT] = { 1.401 + /*00*/ {option_02, 9, alternateSub, 2, UCOL_ALTERNATE_HANDLING}, /*"alternate" */ 1.402 + /*01*/ {option_03, 9, frenchSub, 1, UCOL_FRENCH_COLLATION}, /*"backwards" */ 1.403 + /*02*/ {option_07, 9, onOffSub, 2, UCOL_CASE_LEVEL}, /*"caseLevel" */ 1.404 + /*03*/ {option_08, 9, caseFirstSub, 3, UCOL_CASE_FIRST}, /*"caseFirst" */ 1.405 + /*04*/ {option_06, 13, onOffSub, 2, UCOL_NORMALIZATION_MODE}, /*"normalization" */ 1.406 + /*05*/ {option_13, 9, onOffSub, 2, UCOL_HIRAGANA_QUATERNARY_MODE}, /*"hiraganaQ" */ 1.407 + /*06*/ {option_14, 8, strengthSub, 5, UCOL_STRENGTH}, /*"strength" */ 1.408 + /*07*/ {option_19, 15, onOffSub, 2, UCOL_NUMERIC_COLLATION}, /*"numericOrdering"*/ 1.409 + /*08*/ {option_04, 12, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"variable top" */ 1.410 + /*09*/ {option_01, 9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"rearrange" */ 1.411 + /*10*/ {option_12, 6, beforeSub, 3, UCOL_ATTRIBUTE_COUNT}, /*"before" */ 1.412 + /*11*/ {option_05, 3, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"top" */ 1.413 + /*12*/ {option_15, 5, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"first" */ 1.414 + /*13*/ {option_16, 4, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"last" */ 1.415 + /*14*/ {option_17, 8, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"optimize" */ 1.416 + /*15*/ {option_18, 20, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"suppressContractions" */ 1.417 + /*16*/ {option_00, 9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"undefined" */ 1.418 + /*17*/ {option_09, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"scriptOrder" */ 1.419 + /*18*/ {option_10, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"charsetname" */ 1.420 + /*19*/ {option_11, 7, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"charset" */ 1.421 + /*20*/ {option_20, 6, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"import" */ 1.422 + /*21*/ {option_21, 7, NULL, 0, UCOL_ATTRIBUTE_COUNT} /*"reorder" */ 1.423 +}; 1.424 + 1.425 +static 1.426 +int32_t u_strncmpNoCase(const UChar *s1, 1.427 + const UChar *s2, 1.428 + int32_t n) 1.429 +{ 1.430 + if(n > 0) { 1.431 + int32_t rc; 1.432 + for(;;) { 1.433 + rc = (int32_t)u_tolower(*s1) - (int32_t)u_tolower(*s2); 1.434 + if(rc != 0 || *s1 == 0 || --n == 0) { 1.435 + return rc; 1.436 + } 1.437 + ++s1; 1.438 + ++s2; 1.439 + } 1.440 + } 1.441 + return 0; 1.442 +} 1.443 + 1.444 +static 1.445 +void ucol_uprv_tok_initData() { 1.446 + if(!didInit) { 1.447 + U_STRING_INIT(suboption_00, "non-ignorable", 13); 1.448 + U_STRING_INIT(suboption_01, "shifted", 7); 1.449 + 1.450 + U_STRING_INIT(suboption_02, "lower", 5); 1.451 + U_STRING_INIT(suboption_03, "upper", 5); 1.452 + U_STRING_INIT(suboption_04, "off", 3); 1.453 + U_STRING_INIT(suboption_05, "on", 2); 1.454 + 1.455 + U_STRING_INIT(suboption_06, "1", 1); 1.456 + U_STRING_INIT(suboption_07, "2", 1); 1.457 + U_STRING_INIT(suboption_08, "3", 1); 1.458 + U_STRING_INIT(suboption_09, "4", 1); 1.459 + U_STRING_INIT(suboption_10, "I", 1); 1.460 + 1.461 + U_STRING_INIT(suboption_11, "primary", 7); 1.462 + U_STRING_INIT(suboption_12, "secondary", 9); 1.463 + U_STRING_INIT(suboption_13, "tertiary", 8); 1.464 + U_STRING_INIT(suboption_14, "variable", 8); 1.465 + U_STRING_INIT(suboption_15, "regular", 7); 1.466 + U_STRING_INIT(suboption_16, "implicit", 8); 1.467 + U_STRING_INIT(suboption_17, "trailing", 8); 1.468 + 1.469 + 1.470 + U_STRING_INIT(option_00, "undefined", 9); 1.471 + U_STRING_INIT(option_01, "rearrange", 9); 1.472 + U_STRING_INIT(option_02, "alternate", 9); 1.473 + U_STRING_INIT(option_03, "backwards", 9); 1.474 + U_STRING_INIT(option_04, "variable top", 12); 1.475 + U_STRING_INIT(option_05, "top", 3); 1.476 + U_STRING_INIT(option_06, "normalization", 13); 1.477 + U_STRING_INIT(option_07, "caseLevel", 9); 1.478 + U_STRING_INIT(option_08, "caseFirst", 9); 1.479 + U_STRING_INIT(option_09, "scriptOrder", 11); 1.480 + U_STRING_INIT(option_10, "charsetname", 11); 1.481 + U_STRING_INIT(option_11, "charset", 7); 1.482 + U_STRING_INIT(option_12, "before", 6); 1.483 + U_STRING_INIT(option_13, "hiraganaQ", 9); 1.484 + U_STRING_INIT(option_14, "strength", 8); 1.485 + U_STRING_INIT(option_15, "first", 5); 1.486 + U_STRING_INIT(option_16, "last", 4); 1.487 + U_STRING_INIT(option_17, "optimize", 8); 1.488 + U_STRING_INIT(option_18, "suppressContractions", 20); 1.489 + U_STRING_INIT(option_19, "numericOrdering", 15); 1.490 + U_STRING_INIT(option_20, "import ", 6); 1.491 + U_STRING_INIT(option_21, "reorder", 7); 1.492 + didInit = TRUE; 1.493 + } 1.494 +} 1.495 + 1.496 + 1.497 +// This function reads basic options to set in the runtime collator 1.498 +// used by data driven tests. Should not support build time options 1.499 +U_CAPI const UChar * U_EXPORT2 1.500 +ucol_tok_getNextArgument(const UChar *start, const UChar *end, 1.501 + UColAttribute *attrib, UColAttributeValue *value, 1.502 + UErrorCode *status) 1.503 +{ 1.504 + uint32_t i = 0; 1.505 + int32_t j=0; 1.506 + UBool foundOption = FALSE; 1.507 + const UChar *optionArg = NULL; 1.508 + 1.509 + ucol_uprv_tok_initData(); 1.510 + 1.511 + while(start < end && PatternProps::isWhiteSpace(*start)) { /* eat whitespace */ 1.512 + start++; 1.513 + } 1.514 + if(start >= end) { 1.515 + return NULL; 1.516 + } 1.517 + /* skip opening '[' */ 1.518 + if(*start == 0x005b) { 1.519 + start++; 1.520 + } else { 1.521 + *status = U_ILLEGAL_ARGUMENT_ERROR; // no opening '[' 1.522 + return NULL; 1.523 + } 1.524 + 1.525 + while(i < UTOK_OPTION_COUNT) { 1.526 + if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].optionLen) == 0) { 1.527 + foundOption = TRUE; 1.528 + if(end - start > rulesOptions[i].optionLen) { 1.529 + optionArg = start+rulesOptions[i].optionLen+1; /* start of the options, skip space */ 1.530 + while(PatternProps::isWhiteSpace(*optionArg)) { /* eat whitespace */ 1.531 + optionArg++; 1.532 + } 1.533 + } 1.534 + break; 1.535 + } 1.536 + i++; 1.537 + } 1.538 + 1.539 + if(!foundOption) { 1.540 + *status = U_ILLEGAL_ARGUMENT_ERROR; 1.541 + return NULL; 1.542 + } 1.543 + 1.544 + if(optionArg) { 1.545 + for(j = 0; j<rulesOptions[i].subSize; j++) { 1.546 + if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) { 1.547 + //ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr, rulesOptions[i].subopts[j].attrVal); 1.548 + *attrib = rulesOptions[i].attr; 1.549 + *value = rulesOptions[i].subopts[j].attrVal; 1.550 + optionArg += rulesOptions[i].subopts[j].subLen; 1.551 + while(PatternProps::isWhiteSpace(*optionArg)) { /* eat whitespace */ 1.552 + optionArg++; 1.553 + } 1.554 + if(*optionArg == 0x005d) { 1.555 + optionArg++; 1.556 + return optionArg; 1.557 + } else { 1.558 + *status = U_ILLEGAL_ARGUMENT_ERROR; 1.559 + return NULL; 1.560 + } 1.561 + } 1.562 + } 1.563 + } 1.564 + *status = U_ILLEGAL_ARGUMENT_ERROR; 1.565 + return NULL; 1.566 +} 1.567 + 1.568 +static 1.569 +USet *ucol_uprv_tok_readAndSetUnicodeSet(const UChar *start, const UChar *end, UErrorCode *status) { 1.570 + while(*start != 0x005b) { /* advance while we find the first '[' */ 1.571 + start++; 1.572 + } 1.573 + // now we need to get a balanced set of '[]'. The problem is that a set can have 1.574 + // many, and *end point to the first closing '[' 1.575 + int32_t noOpenBraces = 1; 1.576 + int32_t current = 1; // skip the opening brace 1.577 + while(start+current < end && noOpenBraces != 0) { 1.578 + if(start[current] == 0x005b) { 1.579 + noOpenBraces++; 1.580 + } else if(start[current] == 0x005D) { // closing brace 1.581 + noOpenBraces--; 1.582 + } 1.583 + current++; 1.584 + } 1.585 + 1.586 + if(noOpenBraces != 0 || u_strchr(start+current, 0x005d /*']'*/) == NULL) { 1.587 + *status = U_ILLEGAL_ARGUMENT_ERROR; 1.588 + return NULL; 1.589 + } 1.590 + return uset_openPattern(start, current, status); 1.591 +} 1.592 + 1.593 +/** 1.594 + * Reads an option and matches the option name with the predefined options. (Case-insensitive.) 1.595 + * @param start Pointer to the start UChar. 1.596 + * @param end Pointer to the last valid pointer beyond which the option will not extend. 1.597 + * @param optionArg Address of the pointer at which the options start (after the option name) 1.598 + * @return The index of the option, or -1 if the option is not valid. 1.599 + */ 1.600 +static 1.601 +int32_t ucol_uprv_tok_readOption(const UChar *start, const UChar *end, const UChar **optionArg) { 1.602 + int32_t i = 0; 1.603 + ucol_uprv_tok_initData(); 1.604 + 1.605 + while(PatternProps::isWhiteSpace(*start)) { /* eat whitespace */ 1.606 + start++; 1.607 + } 1.608 + while(i < UTOK_OPTION_COUNT) { 1.609 + if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].optionLen) == 0) { 1.610 + if(end - start > rulesOptions[i].optionLen) { 1.611 + *optionArg = start+rulesOptions[i].optionLen; /* End of option name; start of the options */ 1.612 + while(PatternProps::isWhiteSpace(**optionArg)) { /* eat whitespace */ 1.613 + (*optionArg)++; 1.614 + } 1.615 + } 1.616 + break; 1.617 + } 1.618 + i++; 1.619 + } 1.620 + if(i == UTOK_OPTION_COUNT) { 1.621 + i = -1; // didn't find an option 1.622 + } 1.623 + return i; 1.624 +} 1.625 + 1.626 + 1.627 +static 1.628 +void ucol_tok_parseScriptReorder(UColTokenParser *src, UErrorCode *status) { 1.629 + int32_t codeCount = 0; 1.630 + int32_t codeIndex = 0; 1.631 + char conversion[64]; 1.632 + int32_t tokenLength = 0; 1.633 + const UChar* space; 1.634 + 1.635 + const UChar* current = src->current; 1.636 + const UChar* end = u_memchr(src->current, 0x005d, src->end - src->current); 1.637 + 1.638 + // eat leading whitespace 1.639 + while(current < end && u_isWhitespace(*current)) { 1.640 + current++; 1.641 + } 1.642 + 1.643 + while(current < end) { 1.644 + space = u_memchr(current, 0x0020, end - current); 1.645 + space = space == 0 ? end : space; 1.646 + tokenLength = space - current; 1.647 + if (tokenLength < 4) { 1.648 + *status = U_INVALID_FORMAT_ERROR; 1.649 + return; 1.650 + } 1.651 + codeCount++; 1.652 + current += tokenLength; 1.653 + while(current < end && u_isWhitespace(*current)) { /* eat whitespace */ 1.654 + ++current; 1.655 + } 1.656 + } 1.657 + 1.658 + if (codeCount == 0) { 1.659 + *status = U_INVALID_FORMAT_ERROR; 1.660 + } 1.661 + 1.662 + src->reorderCodesLength = codeCount; 1.663 + src->reorderCodes = (int32_t*)uprv_malloc(codeCount * sizeof(int32_t)); 1.664 + current = src->current; 1.665 + 1.666 + // eat leading whitespace 1.667 + while(current < end && u_isWhitespace(*current)) { 1.668 + current++; 1.669 + } 1.670 + 1.671 + while(current < end) { 1.672 + space = u_memchr(current, 0x0020, end - current); 1.673 + space = space == 0 ? end : space; 1.674 + tokenLength = space - current; 1.675 + if (tokenLength < 4) { 1.676 + *status = U_ILLEGAL_ARGUMENT_ERROR; 1.677 + return; 1.678 + } else { 1.679 + u_UCharsToChars(current, conversion, tokenLength); 1.680 + conversion[tokenLength] = '\0'; 1.681 + src->reorderCodes[codeIndex] = ucol_findReorderingEntry(conversion); 1.682 + if (src->reorderCodes[codeIndex] == USCRIPT_INVALID_CODE) { 1.683 + src->reorderCodes[codeIndex] = u_getPropertyValueEnum(UCHAR_SCRIPT, conversion); 1.684 + } 1.685 + if (src->reorderCodes[codeIndex] == USCRIPT_INVALID_CODE) { 1.686 + *status = U_ILLEGAL_ARGUMENT_ERROR; 1.687 + } 1.688 + } 1.689 + codeIndex++; 1.690 + current += tokenLength; 1.691 + while(current < end && u_isWhitespace(*current)) { /* eat whitespace */ 1.692 + ++current; 1.693 + } 1.694 + } 1.695 +} 1.696 + 1.697 +// reads and conforms to various options in rules 1.698 +// end is the position of the first closing ']' 1.699 +// However, some of the options take an UnicodeSet definition 1.700 +// which needs to duplicate the closing ']' 1.701 +// for example: '[copy [\uAC00-\uD7FF]]' 1.702 +// These options will move end to the second ']' and the 1.703 +// caller will set the current to it. 1.704 +static 1.705 +uint8_t ucol_uprv_tok_readAndSetOption(UColTokenParser *src, UErrorCode *status) { 1.706 + const UChar* start = src->current; 1.707 + int32_t i = 0; 1.708 + int32_t j=0; 1.709 + const UChar *optionArg = NULL; 1.710 + 1.711 + uint8_t result = 0; 1.712 + 1.713 + start++; /*skip opening '['*/ 1.714 + i = ucol_uprv_tok_readOption(start, src->end, &optionArg); 1.715 + if(optionArg) { 1.716 + src->current = optionArg; 1.717 + } 1.718 + 1.719 + if(i < 0) { 1.720 + *status = U_ILLEGAL_ARGUMENT_ERROR; 1.721 + } else { 1.722 + int32_t noOpenBraces = 1; 1.723 + switch(i) { 1.724 + case OPTION_ALTERNATE_HANDLING: 1.725 + case OPTION_FRENCH_COLLATION: 1.726 + case OPTION_CASE_LEVEL: 1.727 + case OPTION_CASE_FIRST: 1.728 + case OPTION_NORMALIZATION_MODE: 1.729 + case OPTION_HIRAGANA_QUATERNARY: 1.730 + case OPTION_STRENGTH: 1.731 + case OPTION_NUMERIC_COLLATION: 1.732 + if(optionArg) { 1.733 + for(j = 0; j<rulesOptions[i].subSize; j++) { 1.734 + if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) { 1.735 + ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr, rulesOptions[i].subopts[j].attrVal); 1.736 + result = UCOL_TOK_SUCCESS; 1.737 + } 1.738 + } 1.739 + } 1.740 + if(result == 0) { 1.741 + *status = U_ILLEGAL_ARGUMENT_ERROR; 1.742 + } 1.743 + break; 1.744 + case OPTION_VARIABLE_TOP: 1.745 + result = UCOL_TOK_SUCCESS | UCOL_TOK_VARIABLE_TOP; 1.746 + break; 1.747 + case OPTION_REARRANGE: 1.748 + result = UCOL_TOK_SUCCESS; 1.749 + break; 1.750 + case OPTION_BEFORE: 1.751 + if(optionArg) { 1.752 + for(j = 0; j<rulesOptions[i].subSize; j++) { 1.753 + if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) { 1.754 + result = UCOL_TOK_SUCCESS | (rulesOptions[i].subopts[j].attrVal + 1); 1.755 + } 1.756 + } 1.757 + } 1.758 + if(result == 0) { 1.759 + *status = U_ILLEGAL_ARGUMENT_ERROR; 1.760 + } 1.761 + break; 1.762 + case OPTION_TOP: /* we are going to have an array with structures of limit CEs */ 1.763 + /* index to this array will be src->parsedToken.indirectIndex*/ 1.764 + src->parsedToken.indirectIndex = 0; 1.765 + result = UCOL_TOK_SUCCESS | UCOL_TOK_TOP; 1.766 + break; 1.767 + case OPTION_FIRST: 1.768 + case OPTION_LAST: /* first, last */ 1.769 + for(j = 0; j<rulesOptions[i].subSize; j++) { 1.770 + if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) { 1.771 + // the calculation below assumes that OPTION_FIRST and OPTION_LAST are at i and i+1 and that the first 1.772 + // element of indirect boundaries is reserved for top. 1.773 + src->parsedToken.indirectIndex = (uint16_t)(i-OPTION_FIRST+1+j*2); 1.774 + result = UCOL_TOK_SUCCESS | UCOL_TOK_TOP;; 1.775 + } 1.776 + } 1.777 + if(result == 0) { 1.778 + *status = U_ILLEGAL_ARGUMENT_ERROR; 1.779 + } 1.780 + break; 1.781 + case OPTION_OPTIMIZE: 1.782 + case OPTION_SUPPRESS_CONTRACTIONS: // copy and remove are handled before normalization 1.783 + // we need to move end here 1.784 + src->current++; // skip opening brace 1.785 + while(src->current < src->end && noOpenBraces != 0) { 1.786 + if(*src->current == 0x005b) { 1.787 + noOpenBraces++; 1.788 + } else if(*src->current == 0x005D) { // closing brace 1.789 + noOpenBraces--; 1.790 + } 1.791 + src->current++; 1.792 + } 1.793 + result = UCOL_TOK_SUCCESS; 1.794 + break; 1.795 + case OPTION_SCRIPTREORDER: 1.796 + ucol_tok_parseScriptReorder(src, status); 1.797 + break; 1.798 + default: 1.799 + *status = U_UNSUPPORTED_ERROR; 1.800 + break; 1.801 + } 1.802 + } 1.803 + src->current = u_memchr(src->current, 0x005d, (int32_t)(src->end-src->current)); 1.804 + return result; 1.805 +} 1.806 + 1.807 + 1.808 +inline void ucol_tok_addToExtraCurrent(UColTokenParser *src, const UChar *stuff, int32_t len, UErrorCode *status) { 1.809 + if (stuff == NULL || len <= 0) { 1.810 + return; 1.811 + } 1.812 + UnicodeString tempStuff(FALSE, stuff, len); 1.813 + if(src->extraCurrent+len >= src->extraEnd) { 1.814 + /* reallocate */ 1.815 + if (stuff >= src->source && stuff <= src->end) { 1.816 + // Copy the "stuff" contents into tempStuff's own buffer. 1.817 + // UnicodeString is copy-on-write. 1.818 + if (len > 0) { 1.819 + tempStuff.setCharAt(0, tempStuff[0]); 1.820 + } else { 1.821 + tempStuff.remove(); 1.822 + } 1.823 + } 1.824 + UChar *newSrc = (UChar *)uprv_realloc(src->source, (src->extraEnd-src->source)*2*sizeof(UChar)); 1.825 + if(newSrc != NULL) { 1.826 + src->current = newSrc + (src->current - src->source); 1.827 + src->extraCurrent = newSrc + (src->extraCurrent - src->source); 1.828 + src->end = newSrc + (src->end - src->source); 1.829 + src->extraEnd = newSrc + (src->extraEnd-src->source)*2; 1.830 + src->sourceCurrent = newSrc + (src->sourceCurrent-src->source); 1.831 + src->source = newSrc; 1.832 + } else { 1.833 + *status = U_MEMORY_ALLOCATION_ERROR; 1.834 + return; 1.835 + } 1.836 + } 1.837 + if(len == 1) { 1.838 + *src->extraCurrent++ = tempStuff[0]; 1.839 + } else { 1.840 + u_memcpy(src->extraCurrent, tempStuff.getBuffer(), len); 1.841 + src->extraCurrent += len; 1.842 + } 1.843 +} 1.844 + 1.845 +inline UBool ucol_tok_doSetTop(UColTokenParser *src, UErrorCode *status) { 1.846 + /* 1.847 + top = TRUE; 1.848 + */ 1.849 + UChar buff[5]; 1.850 + src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source); 1.851 + buff[0] = 0xFFFE; 1.852 + buff[1] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE >> 16); 1.853 + buff[2] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE & 0xFFFF); 1.854 + if(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE == 0) { 1.855 + src->parsedToken.charsLen = 3; 1.856 + ucol_tok_addToExtraCurrent(src, buff, 3, status); 1.857 + } else { 1.858 + buff[3] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE >> 16); 1.859 + buff[4] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE & 0xFFFF); 1.860 + src->parsedToken.charsLen = 5; 1.861 + ucol_tok_addToExtraCurrent(src, buff, 5, status); 1.862 + } 1.863 + return TRUE; 1.864 +} 1.865 + 1.866 +static UBool isCharNewLine(UChar c){ 1.867 + switch(c){ 1.868 + case 0x000A: /* LF */ 1.869 + case 0x000D: /* CR */ 1.870 + case 0x000C: /* FF */ 1.871 + case 0x0085: /* NEL */ 1.872 + case 0x2028: /* LS */ 1.873 + case 0x2029: /* PS */ 1.874 + return TRUE; 1.875 + default: 1.876 + return FALSE; 1.877 + } 1.878 +} 1.879 + 1.880 +/* 1.881 + * This function is called several times when a range is processed. Each time, the next code point 1.882 + * is processed. 1.883 + * The following variables must be set before calling this function: 1.884 + * src->currentRangeCp: The current code point to process. 1.885 + * src->lastRangeCp: The last code point in the range. 1.886 + * Pre-requisite: src->currentRangeCp <= src->lastRangeCp. 1.887 + */ 1.888 +static const UChar* 1.889 +ucol_tok_processNextCodePointInRange(UColTokenParser *src, 1.890 + UErrorCode *status) 1.891 +{ 1.892 + // Append current code point to source 1.893 + UChar buff[U16_MAX_LENGTH]; 1.894 + uint32_t i = 0; 1.895 + 1.896 + uint32_t nChars = U16_LENGTH(src->currentRangeCp); 1.897 + src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source); 1.898 + src->parsedToken.charsLen = nChars; 1.899 + 1.900 + U16_APPEND_UNSAFE(buff, i, src->currentRangeCp); 1.901 + ucol_tok_addToExtraCurrent(src, buff, nChars, status); 1.902 + 1.903 + ++src->currentRangeCp; 1.904 + if (src->currentRangeCp > src->lastRangeCp) { 1.905 + src->inRange = FALSE; 1.906 + 1.907 + if (src->currentStarredCharIndex > src->lastStarredCharIndex) { 1.908 + src->isStarred = FALSE; 1.909 + } 1.910 + } else { 1.911 + src->previousCp = src->currentRangeCp; 1.912 + } 1.913 + return src->current; 1.914 +} 1.915 + 1.916 +/* 1.917 + * This function is called several times when a starred list is processed. Each time, the next code point 1.918 + * in the list is processed. 1.919 + * The following variables must be set before calling this function: 1.920 + * src->currentStarredCharIndex: Index (in src->source) of the first char of the current code point. 1.921 + * src->lastStarredCharIndex: Index to the last character in the list. 1.922 + * Pre-requisite: src->currentStarredCharIndex <= src->lastStarredCharIndex. 1.923 + */ 1.924 +static const UChar* 1.925 +ucol_tok_processNextTokenInStarredList(UColTokenParser *src) 1.926 +{ 1.927 + // Extract the characters corresponding to the next code point. 1.928 + UChar32 cp; 1.929 + src->parsedToken.charsOffset = src->currentStarredCharIndex; 1.930 + int32_t prev = src->currentStarredCharIndex; 1.931 + U16_NEXT(src->source, src->currentStarredCharIndex, (uint32_t)(src->end - src->source), cp); 1.932 + src->parsedToken.charsLen = src->currentStarredCharIndex - prev; 1.933 + 1.934 + // When we are done parsing the starred string, turn the flag off so that 1.935 + // the normal processing is restored. 1.936 + if (src->currentStarredCharIndex > src->lastStarredCharIndex) { 1.937 + src->isStarred = FALSE; 1.938 + } 1.939 + src->previousCp = cp; 1.940 + return src->current; 1.941 +} 1.942 + 1.943 +/* 1.944 + * Partially parses the next token, keeps the indices in src->parsedToken, and updates the counters. 1.945 + * 1.946 + * This routine parses and separates almost all tokens. The following are the syntax characters recognized. 1.947 + * # : Comment character 1.948 + * & : Reset operator 1.949 + * = : Equality 1.950 + * < : Primary collation 1.951 + * << : Secondary collation 1.952 + * <<< : Tertiary collation 1.953 + * ; : Secondary collation 1.954 + * , : Tertiary collation 1.955 + * / : Expansions 1.956 + * | : Prefix 1.957 + * - : Range 1.958 + 1.959 + * ! : Java Thai modifier, ignored 1.960 + * @ : French only 1.961 + 1.962 + * [] : Options 1.963 + * '' : Quotes 1.964 + * 1.965 + * Along with operators =, <, <<, <<<, the operator * is supported to indicate a list. For example, &a<*bcdexyz 1.966 + * is equivalent to &a<b<c<d<e<x<y<z. In lists, ranges also can be given, so &a*b-ex-z is equivalent to the above. 1.967 + * This function do not separate the tokens in a list. Instead, &a<*b-ex-z is parsed as three tokens - "&a", 1.968 + * "<*b", "-ex", "-z". The strength (< in this case), whether in a list, whether in a range and the previous 1.969 + * character returned as cached so that the calling program can do further splitting. 1.970 + */ 1.971 +static const UChar* 1.972 +ucol_tok_parseNextTokenInternal(UColTokenParser *src, 1.973 + UBool startOfRules, 1.974 + UParseError *parseError, 1.975 + UErrorCode *status) 1.976 +{ 1.977 + UBool variableTop = FALSE; 1.978 + UBool top = FALSE; 1.979 + UBool inChars = TRUE; 1.980 + UBool inQuote = FALSE; 1.981 + UBool wasInQuote = FALSE; 1.982 + uint8_t before = 0; 1.983 + UBool isEscaped = FALSE; 1.984 + 1.985 + // TODO: replace these variables with src->parsedToken counterparts 1.986 + // no need to use them anymore since we have src->parsedToken. 1.987 + // Ideally, token parser would be a nice class... Once, when I have 1.988 + // more time (around 2020 probably). 1.989 + uint32_t newExtensionLen = 0; 1.990 + uint32_t extensionOffset = 0; 1.991 + uint32_t newStrength = UCOL_TOK_UNSET; 1.992 + UChar buff[10]; 1.993 + 1.994 + src->parsedToken.charsOffset = 0; src->parsedToken.charsLen = 0; 1.995 + src->parsedToken.prefixOffset = 0; src->parsedToken.prefixLen = 0; 1.996 + src->parsedToken.indirectIndex = 0; 1.997 + 1.998 + while (src->current < src->end) { 1.999 + UChar ch = *(src->current); 1.1000 + 1.1001 + if (inQuote) { 1.1002 + if (ch == 0x0027/*'\''*/) { 1.1003 + inQuote = FALSE; 1.1004 + } else { 1.1005 + if ((src->parsedToken.charsLen == 0) || inChars) { 1.1006 + if(src->parsedToken.charsLen == 0) { 1.1007 + src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source); 1.1008 + } 1.1009 + src->parsedToken.charsLen++; 1.1010 + } else { 1.1011 + if(newExtensionLen == 0) { 1.1012 + extensionOffset = (uint32_t)(src->extraCurrent - src->source); 1.1013 + } 1.1014 + newExtensionLen++; 1.1015 + } 1.1016 + } 1.1017 + }else if(isEscaped){ 1.1018 + isEscaped =FALSE; 1.1019 + if (newStrength == UCOL_TOK_UNSET) { 1.1020 + *status = U_INVALID_FORMAT_ERROR; 1.1021 + syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); 1.1022 + DBG_FORMAT_ERROR 1.1023 + return NULL; 1.1024 + // enabling rules to start with non-tokens a < b 1.1025 + // newStrength = UCOL_TOK_RESET; 1.1026 + } 1.1027 + if(ch != 0x0000 && src->current != src->end) { 1.1028 + if (inChars) { 1.1029 + if(src->parsedToken.charsLen == 0) { 1.1030 + src->parsedToken.charsOffset = (uint32_t)(src->current - src->source); 1.1031 + } 1.1032 + src->parsedToken.charsLen++; 1.1033 + } else { 1.1034 + if(newExtensionLen == 0) { 1.1035 + extensionOffset = (uint32_t)(src->current - src->source); 1.1036 + } 1.1037 + newExtensionLen++; 1.1038 + } 1.1039 + } 1.1040 + }else { 1.1041 + if(!PatternProps::isWhiteSpace(ch)) { 1.1042 + /* Sets the strength for this entry */ 1.1043 + switch (ch) { 1.1044 + case 0x003D/*'='*/ : 1.1045 + if (newStrength != UCOL_TOK_UNSET) { 1.1046 + goto EndOfLoop; 1.1047 + } 1.1048 + 1.1049 + /* if we start with strength, we'll reset to top */ 1.1050 + if(startOfRules == TRUE) { 1.1051 + src->parsedToken.indirectIndex = 5; 1.1052 + top = ucol_tok_doSetTop(src, status); 1.1053 + newStrength = UCOL_TOK_RESET; 1.1054 + goto EndOfLoop; 1.1055 + } 1.1056 + newStrength = UCOL_IDENTICAL; 1.1057 + if(*(src->current+1) == 0x002A) {/*'*'*/ 1.1058 + src->current++; 1.1059 + src->isStarred = TRUE; 1.1060 + } 1.1061 + break; 1.1062 + 1.1063 + case 0x002C/*','*/: 1.1064 + if (newStrength != UCOL_TOK_UNSET) { 1.1065 + goto EndOfLoop; 1.1066 + } 1.1067 + 1.1068 + /* if we start with strength, we'll reset to top */ 1.1069 + if(startOfRules == TRUE) { 1.1070 + src->parsedToken.indirectIndex = 5; 1.1071 + top = ucol_tok_doSetTop(src, status); 1.1072 + newStrength = UCOL_TOK_RESET; 1.1073 + goto EndOfLoop; 1.1074 + } 1.1075 + newStrength = UCOL_TERTIARY; 1.1076 + break; 1.1077 + 1.1078 + case 0x003B/*';'*/: 1.1079 + if (newStrength != UCOL_TOK_UNSET) { 1.1080 + goto EndOfLoop; 1.1081 + } 1.1082 + 1.1083 + /* if we start with strength, we'll reset to top */ 1.1084 + if(startOfRules == TRUE) { 1.1085 + src->parsedToken.indirectIndex = 5; 1.1086 + top = ucol_tok_doSetTop(src, status); 1.1087 + newStrength = UCOL_TOK_RESET; 1.1088 + goto EndOfLoop; 1.1089 + } 1.1090 + newStrength = UCOL_SECONDARY; 1.1091 + break; 1.1092 + 1.1093 + case 0x003C/*'<'*/: 1.1094 + if (newStrength != UCOL_TOK_UNSET) { 1.1095 + goto EndOfLoop; 1.1096 + } 1.1097 + 1.1098 + /* if we start with strength, we'll reset to top */ 1.1099 + if(startOfRules == TRUE) { 1.1100 + src->parsedToken.indirectIndex = 5; 1.1101 + top = ucol_tok_doSetTop(src, status); 1.1102 + newStrength = UCOL_TOK_RESET; 1.1103 + goto EndOfLoop; 1.1104 + } 1.1105 + /* before this, do a scan to verify whether this is */ 1.1106 + /* another strength */ 1.1107 + if(*(src->current+1) == 0x003C) { 1.1108 + src->current++; 1.1109 + if(*(src->current+1) == 0x003C) { 1.1110 + src->current++; /* three in a row! */ 1.1111 + newStrength = UCOL_TERTIARY; 1.1112 + } else { /* two in a row */ 1.1113 + newStrength = UCOL_SECONDARY; 1.1114 + } 1.1115 + } else { /* just one */ 1.1116 + newStrength = UCOL_PRIMARY; 1.1117 + } 1.1118 + if(*(src->current+1) == 0x002A) {/*'*'*/ 1.1119 + src->current++; 1.1120 + src->isStarred = TRUE; 1.1121 + } 1.1122 + break; 1.1123 + 1.1124 + case 0x0026/*'&'*/: 1.1125 + if (newStrength != UCOL_TOK_UNSET) { 1.1126 + /**/ 1.1127 + goto EndOfLoop; 1.1128 + } 1.1129 + 1.1130 + newStrength = UCOL_TOK_RESET; /* PatternEntry::RESET = 0 */ 1.1131 + break; 1.1132 + 1.1133 + case 0x005b/*'['*/: 1.1134 + /* options - read an option, analyze it */ 1.1135 + if(u_strchr(src->current, 0x005d /*']'*/) != NULL) { 1.1136 + uint8_t result = ucol_uprv_tok_readAndSetOption(src, status); 1.1137 + if(U_SUCCESS(*status)) { 1.1138 + if(result & UCOL_TOK_TOP) { 1.1139 + if(newStrength == UCOL_TOK_RESET) { 1.1140 + top = ucol_tok_doSetTop(src, status); 1.1141 + if(before) { // This is a combination of before and indirection like '&[before 2][first regular]<b' 1.1142 + src->parsedToken.charsLen+=2; 1.1143 + buff[0] = 0x002d; 1.1144 + buff[1] = before; 1.1145 + ucol_tok_addToExtraCurrent(src, buff, 2, status); 1.1146 + } 1.1147 + 1.1148 + src->current++; 1.1149 + goto EndOfLoop; 1.1150 + } else { 1.1151 + *status = U_INVALID_FORMAT_ERROR; 1.1152 + syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); 1.1153 + DBG_FORMAT_ERROR 1.1154 + } 1.1155 + } else if(result & UCOL_TOK_VARIABLE_TOP) { 1.1156 + if(newStrength != UCOL_TOK_RESET && newStrength != UCOL_TOK_UNSET) { 1.1157 + variableTop = TRUE; 1.1158 + src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source); 1.1159 + src->parsedToken.charsLen = 1; 1.1160 + buff[0] = 0xFFFF; 1.1161 + ucol_tok_addToExtraCurrent(src, buff, 1, status); 1.1162 + src->current++; 1.1163 + goto EndOfLoop; 1.1164 + } else { 1.1165 + *status = U_INVALID_FORMAT_ERROR; 1.1166 + syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); 1.1167 + DBG_FORMAT_ERROR 1.1168 + } 1.1169 + } else if (result & UCOL_TOK_BEFORE){ 1.1170 + if(newStrength == UCOL_TOK_RESET) { 1.1171 + before = result & UCOL_TOK_BEFORE; 1.1172 + } else { 1.1173 + *status = U_INVALID_FORMAT_ERROR; 1.1174 + syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); 1.1175 + DBG_FORMAT_ERROR 1.1176 + } 1.1177 + } 1.1178 + } else { 1.1179 + *status = U_INVALID_FORMAT_ERROR; 1.1180 + syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); 1.1181 + DBG_FORMAT_ERROR 1.1182 + return NULL; 1.1183 + } 1.1184 + } 1.1185 + break; 1.1186 + case 0x0021/*! skip java thai modifier reordering*/: 1.1187 + break; 1.1188 + case 0x002F/*'/'*/: 1.1189 + wasInQuote = FALSE; /* if we were copying source characters, we want to stop now */ 1.1190 + inChars = FALSE; /* we're now processing expansion */ 1.1191 + break; 1.1192 + case 0x005C /* back slash for escaped chars */: 1.1193 + isEscaped = TRUE; 1.1194 + break; 1.1195 + /* found a quote, we're gonna start copying */ 1.1196 + case 0x0027/*'\''*/: 1.1197 + if (newStrength == UCOL_TOK_UNSET) { /* quote is illegal until we have a strength */ 1.1198 + *status = U_INVALID_FORMAT_ERROR; 1.1199 + syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); 1.1200 + DBG_FORMAT_ERROR 1.1201 + return NULL; 1.1202 + // enabling rules to start with a non-token character a < b 1.1203 + // newStrength = UCOL_TOK_RESET; 1.1204 + } 1.1205 + 1.1206 + inQuote = TRUE; 1.1207 + 1.1208 + if(inChars) { /* we're doing characters */ 1.1209 + if(wasInQuote == FALSE) { 1.1210 + src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source); 1.1211 + } 1.1212 + if (src->parsedToken.charsLen != 0) { 1.1213 + ucol_tok_addToExtraCurrent(src, src->current - src->parsedToken.charsLen, src->parsedToken.charsLen, status); 1.1214 + } 1.1215 + src->parsedToken.charsLen++; 1.1216 + } else { /* we're doing an expansion */ 1.1217 + if(wasInQuote == FALSE) { 1.1218 + extensionOffset = (uint32_t)(src->extraCurrent - src->source); 1.1219 + } 1.1220 + if (newExtensionLen != 0) { 1.1221 + ucol_tok_addToExtraCurrent(src, src->current - newExtensionLen, newExtensionLen, status); 1.1222 + } 1.1223 + newExtensionLen++; 1.1224 + } 1.1225 + 1.1226 + wasInQuote = TRUE; 1.1227 + 1.1228 + ch = *(++(src->current)); 1.1229 + if(ch == 0x0027) { /* copy the double quote */ 1.1230 + ucol_tok_addToExtraCurrent(src, &ch, 1, status); 1.1231 + inQuote = FALSE; 1.1232 + } 1.1233 + break; 1.1234 + 1.1235 + /* '@' is french only if the strength is not currently set */ 1.1236 + /* if it is, it's just a regular character in collation rules */ 1.1237 + case 0x0040/*'@'*/: 1.1238 + if (newStrength == UCOL_TOK_UNSET) { 1.1239 + src->opts->frenchCollation = UCOL_ON; 1.1240 + break; 1.1241 + } 1.1242 + 1.1243 + case 0x007C /*|*/: /* this means we have actually been reading prefix part */ 1.1244 + // we want to store read characters to the prefix part and continue reading 1.1245 + // the characters (proper way would be to restart reading the chars, but in 1.1246 + // that case we would have to complicate the token hasher, which I do not 1.1247 + // intend to play with. Instead, we will do prefixes when prefixes are due 1.1248 + // (before adding the elements). 1.1249 + src->parsedToken.prefixOffset = src->parsedToken.charsOffset; 1.1250 + src->parsedToken.prefixLen = src->parsedToken.charsLen; 1.1251 + 1.1252 + if(inChars) { /* we're doing characters */ 1.1253 + if(wasInQuote == FALSE) { 1.1254 + src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source); 1.1255 + } 1.1256 + if (src->parsedToken.charsLen != 0) { 1.1257 + ucol_tok_addToExtraCurrent(src, src->current - src->parsedToken.charsLen, src->parsedToken.charsLen, status); 1.1258 + } 1.1259 + src->parsedToken.charsLen++; 1.1260 + } 1.1261 + 1.1262 + wasInQuote = TRUE; 1.1263 + 1.1264 + do { 1.1265 + ch = *(++(src->current)); 1.1266 + // skip whitespace between '|' and the character 1.1267 + } while (PatternProps::isWhiteSpace(ch)); 1.1268 + break; 1.1269 + 1.1270 + //charsOffset = 0; 1.1271 + //newCharsLen = 0; 1.1272 + //break; // We want to store the whole prefix/character sequence. If we break 1.1273 + // the '|' is going to get lost. 1.1274 + 1.1275 + case 0x002D /*-*/: /* A range. */ 1.1276 + if (newStrength != UCOL_TOK_UNSET) { 1.1277 + // While processing the pending token, the isStarred field 1.1278 + // is reset, so it needs to be saved for the next 1.1279 + // invocation. 1.1280 + src->savedIsStarred = src->isStarred; 1.1281 + goto EndOfLoop; 1.1282 + } 1.1283 + src->isStarred = src->savedIsStarred; 1.1284 + 1.1285 + // Ranges are valid only in starred tokens. 1.1286 + if (!src->isStarred) { 1.1287 + *status = U_INVALID_FORMAT_ERROR; 1.1288 + syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); 1.1289 + DBG_FORMAT_ERROR 1.1290 + return NULL; 1.1291 + } 1.1292 + newStrength = src->parsedToken.strength; 1.1293 + src->inRange = TRUE; 1.1294 + break; 1.1295 + 1.1296 + case 0x0023 /*#*/: /* this is a comment, skip everything through the end of line */ 1.1297 + do { 1.1298 + ch = *(++(src->current)); 1.1299 + } while (!isCharNewLine(ch)); 1.1300 + 1.1301 + break; 1.1302 + default: 1.1303 + if (newStrength == UCOL_TOK_UNSET) { 1.1304 + *status = U_INVALID_FORMAT_ERROR; 1.1305 + syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); 1.1306 + DBG_FORMAT_ERROR 1.1307 + return NULL; 1.1308 + } 1.1309 + 1.1310 + if (ucol_tok_isSpecialChar(ch) && (inQuote == FALSE)) { 1.1311 + *status = U_INVALID_FORMAT_ERROR; 1.1312 + syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); 1.1313 + DBG_FORMAT_ERROR 1.1314 + return NULL; 1.1315 + } 1.1316 + 1.1317 + if(ch == 0x0000 && src->current+1 == src->end) { 1.1318 + break; 1.1319 + } 1.1320 + 1.1321 + if (inChars) { 1.1322 + if(src->parsedToken.charsLen == 0) { 1.1323 + src->parsedToken.charsOffset = (uint32_t)(src->current - src->source); 1.1324 + } 1.1325 + src->parsedToken.charsLen++; 1.1326 + } else { 1.1327 + if(newExtensionLen == 0) { 1.1328 + extensionOffset = (uint32_t)(src->current - src->source); 1.1329 + } 1.1330 + newExtensionLen++; 1.1331 + } 1.1332 + 1.1333 + break; 1.1334 + } 1.1335 + } 1.1336 + } 1.1337 + 1.1338 + if(wasInQuote) { 1.1339 + if(ch != 0x27) { 1.1340 + if(inQuote || !PatternProps::isWhiteSpace(ch)) { 1.1341 + ucol_tok_addToExtraCurrent(src, &ch, 1, status); 1.1342 + } 1.1343 + } 1.1344 + } 1.1345 + 1.1346 + src->current++; 1.1347 + } 1.1348 + 1.1349 +EndOfLoop: 1.1350 + wasInQuote = FALSE; 1.1351 + if (newStrength == UCOL_TOK_UNSET) { 1.1352 + return NULL; 1.1353 + } 1.1354 + 1.1355 + if (src->parsedToken.charsLen == 0 && top == FALSE) { 1.1356 + syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); 1.1357 + *status = U_INVALID_FORMAT_ERROR; 1.1358 + DBG_FORMAT_ERROR 1.1359 + return NULL; 1.1360 + } 1.1361 + 1.1362 + src->parsedToken.strength = newStrength; 1.1363 + src->parsedToken.extensionOffset = extensionOffset; 1.1364 + src->parsedToken.extensionLen = newExtensionLen; 1.1365 + src->parsedToken.flags = (UCOL_TOK_VARIABLE_TOP * (variableTop?1:0)) | (UCOL_TOK_TOP * (top?1:0)) | before; 1.1366 + 1.1367 + return src->current; 1.1368 +} 1.1369 + 1.1370 +/* 1.1371 + * Parses the next token, keeps the indices in src->parsedToken, and updates the counters. 1.1372 + * @see ucol_tok_parseNextTokenInternal() for the description of what operators are supported. 1.1373 + * 1.1374 + * In addition to what ucol_tok_parseNextTokenInternal() does, this function does the following: 1.1375 + * 1) ucol_tok_parseNextTokenInternal() returns a range as a single token. This function separates 1.1376 + * it to separate tokens and returns one by one. In order to do that, the necessary states are 1.1377 + * cached as member variables of the token parser. 1.1378 + * 2) When encountering a range, ucol_tok_parseNextTokenInternal() processes characters up to the 1.1379 + * starting character as a single list token (which is separated into individual characters here) 1.1380 + * and as another list token starting with the last character in the range. Before expanding it 1.1381 + * as a list of tokens, this function expands the range by filling the intermediate characters and 1.1382 + * returns them one by one as separate tokens. 1.1383 + * Necessary checks are done for invalid combinations. 1.1384 + */ 1.1385 +U_CAPI const UChar* U_EXPORT2 1.1386 +ucol_tok_parseNextToken(UColTokenParser *src, 1.1387 + UBool startOfRules, 1.1388 + UParseError *parseError, 1.1389 + UErrorCode *status) 1.1390 +{ 1.1391 + const UChar *nextToken; 1.1392 + 1.1393 + if (src->inRange) { 1.1394 + // We are not done processing a range. Continue it. 1.1395 + return ucol_tok_processNextCodePointInRange(src, status); 1.1396 + } else if (src->isStarred) { 1.1397 + // We are not done processing a starred token. Continue it. 1.1398 + return ucol_tok_processNextTokenInStarredList(src); 1.1399 + } 1.1400 + 1.1401 + // Get the next token. 1.1402 + nextToken = ucol_tok_parseNextTokenInternal(src, startOfRules, parseError, status); 1.1403 + 1.1404 + if (nextToken == NULL) { 1.1405 + return NULL; 1.1406 + } 1.1407 + 1.1408 + if (src->inRange) { 1.1409 + // A new range has started. 1.1410 + // Check whether it is a chain of ranges with more than one hyphen. 1.1411 + if (src->lastRangeCp > 0 && src->lastRangeCp == src->previousCp) { 1.1412 + *status = U_INVALID_FORMAT_ERROR; 1.1413 + syntaxError(src->source,src->parsedToken.charsOffset-1, 1.1414 + src->parsedToken.charsOffset+src->parsedToken.charsLen, parseError); 1.1415 + DBG_FORMAT_ERROR 1.1416 + return NULL; 1.1417 + } 1.1418 + 1.1419 + // The current token indicates the second code point of the range. 1.1420 + // Process just that, and then proceed with the star. 1.1421 + src->currentStarredCharIndex = src->parsedToken.charsOffset; 1.1422 + U16_NEXT(src->source, src->currentStarredCharIndex, 1.1423 + (uint32_t)(src->end - src->source), src->lastRangeCp); 1.1424 + if (src->lastRangeCp <= src->previousCp) { 1.1425 + *status = U_INVALID_FORMAT_ERROR; 1.1426 + syntaxError(src->source,src->parsedToken.charsOffset-1, 1.1427 + src->parsedToken.charsOffset+src->parsedToken.charsLen,parseError); 1.1428 + DBG_FORMAT_ERROR 1.1429 + return NULL; 1.1430 + } 1.1431 + 1.1432 + // Set current range code point to process the range loop 1.1433 + src->currentRangeCp = src->previousCp + 1; 1.1434 + 1.1435 + src->lastStarredCharIndex = src->parsedToken.charsOffset + src->parsedToken.charsLen - 1; 1.1436 + 1.1437 + return ucol_tok_processNextCodePointInRange(src, status); 1.1438 + } else if (src->isStarred) { 1.1439 + // We define two indices m_currentStarredCharIndex_ and m_lastStarredCharIndex_ so that 1.1440 + // [m_currentStarredCharIndex_ .. m_lastStarredCharIndex_], both inclusive, need to be 1.1441 + // separated into several tokens and returned. 1.1442 + src->currentStarredCharIndex = src->parsedToken.charsOffset; 1.1443 + src->lastStarredCharIndex = src->parsedToken.charsOffset + src->parsedToken.charsLen - 1; 1.1444 + 1.1445 + return ucol_tok_processNextTokenInStarredList(src); 1.1446 + } else { 1.1447 + // Set previous codepoint 1.1448 + U16_GET(src->source, 0, src->parsedToken.charsOffset, (uint32_t)(src->end - src->source), src->previousCp); 1.1449 + } 1.1450 + return nextToken; 1.1451 +} 1.1452 + 1.1453 + 1.1454 +/* 1.1455 +Processing Description 1.1456 +1 Build a ListList. Each list has a header, which contains two lists (positive 1.1457 +and negative), a reset token, a baseCE, nextCE, and previousCE. The lists and 1.1458 +reset may be null. 1.1459 +2 As you process, you keep a LAST pointer that points to the last token you 1.1460 +handled. 1.1461 + 1.1462 +*/ 1.1463 + 1.1464 +static UColToken *ucol_tok_initAReset(UColTokenParser *src, const UChar *expand, uint32_t *expandNext, 1.1465 + UParseError *parseError, UErrorCode *status) 1.1466 +{ 1.1467 + if(src->resultLen == src->listCapacity) { 1.1468 + // Unfortunately, this won't work, as we store addresses of lhs in token 1.1469 + src->listCapacity *= 2; 1.1470 + src->lh = (UColTokListHeader *)uprv_realloc(src->lh, src->listCapacity*sizeof(UColTokListHeader)); 1.1471 + if(src->lh == NULL) { 1.1472 + *status = U_MEMORY_ALLOCATION_ERROR; 1.1473 + return NULL; 1.1474 + } 1.1475 + } 1.1476 + /* do the reset thing */ 1.1477 + UColToken *sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken)); 1.1478 + /* test for NULL */ 1.1479 + if (sourceToken == NULL) { 1.1480 + *status = U_MEMORY_ALLOCATION_ERROR; 1.1481 + return NULL; 1.1482 + } 1.1483 + sourceToken->rulesToParseHdl = &(src->source); 1.1484 + sourceToken->source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset; 1.1485 + sourceToken->expansion = src->parsedToken.extensionLen << 24 | src->parsedToken.extensionOffset; 1.1486 + 1.1487 + sourceToken->debugSource = *(src->source + src->parsedToken.charsOffset); 1.1488 + sourceToken->debugExpansion = *(src->source + src->parsedToken.extensionOffset); 1.1489 + 1.1490 + // keep the flags around so that we know about before 1.1491 + sourceToken->flags = src->parsedToken.flags; 1.1492 + 1.1493 + if(src->parsedToken.prefixOffset != 0) { 1.1494 + // this is a syntax error 1.1495 + *status = U_INVALID_FORMAT_ERROR; 1.1496 + syntaxError(src->source,src->parsedToken.charsOffset-1,src->parsedToken.charsOffset+src->parsedToken.charsLen,parseError); 1.1497 + DBG_FORMAT_ERROR 1.1498 + uprv_free(sourceToken); 1.1499 + return 0; 1.1500 + } else { 1.1501 + sourceToken->prefix = 0; 1.1502 + } 1.1503 + 1.1504 + sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should also handle reverse */ 1.1505 + sourceToken->strength = UCOL_TOK_RESET; 1.1506 + sourceToken->next = NULL; 1.1507 + sourceToken->previous = NULL; 1.1508 + sourceToken->noOfCEs = 0; 1.1509 + sourceToken->noOfExpCEs = 0; 1.1510 + sourceToken->listHeader = &src->lh[src->resultLen]; 1.1511 + 1.1512 + src->lh[src->resultLen].first = NULL; 1.1513 + src->lh[src->resultLen].last = NULL; 1.1514 + src->lh[src->resultLen].first = NULL; 1.1515 + src->lh[src->resultLen].last = NULL; 1.1516 + 1.1517 + src->lh[src->resultLen].reset = sourceToken; 1.1518 + 1.1519 + /* 1.1520 + 3 Consider each item: relation, source, and expansion: e.g. ...< x / y ... 1.1521 + First convert all expansions into normal form. Examples: 1.1522 + If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c * 1.1523 + d * ... into &x * c/y * d * ... 1.1524 + Note: reset values can never have expansions, although they can cause the 1.1525 + very next item to have one. They may be contractions, if they are found 1.1526 + earlier in the list. 1.1527 + */ 1.1528 + *expandNext = 0; 1.1529 + if(expand != NULL) { 1.1530 + /* check to see if there is an expansion */ 1.1531 + if(src->parsedToken.charsLen > 1) { 1.1532 + uint32_t resetCharsOffset; 1.1533 + resetCharsOffset = (uint32_t)(expand - src->source); 1.1534 + sourceToken->source = ((resetCharsOffset - src->parsedToken.charsOffset ) << 24) | src->parsedToken.charsOffset; 1.1535 + *expandNext = ((src->parsedToken.charsLen + src->parsedToken.charsOffset - resetCharsOffset)<<24) | (resetCharsOffset); 1.1536 + } 1.1537 + } 1.1538 + 1.1539 + src->resultLen++; 1.1540 + 1.1541 + uhash_put(src->tailored, sourceToken, sourceToken, status); 1.1542 + 1.1543 + return sourceToken; 1.1544 +} 1.1545 + 1.1546 +static 1.1547 +inline UColToken *getVirginBefore(UColTokenParser *src, UColToken *sourceToken, uint8_t strength, UParseError *parseError, UErrorCode *status) { 1.1548 + if(U_FAILURE(*status)) { 1.1549 + return NULL; 1.1550 + } 1.1551 + /* this is a virgin before - we need to fish the anchor from the UCA */ 1.1552 + collIterate s; 1.1553 + uint32_t baseCE = UCOL_NOT_FOUND, baseContCE = UCOL_NOT_FOUND; 1.1554 + uint32_t CE, SecondCE; 1.1555 + // uint32_t invPos; 1.1556 + if(sourceToken != NULL) { 1.1557 + uprv_init_collIterate(src->UCA, src->source+((sourceToken->source)&0xFFFFFF), 1, &s, status); 1.1558 + } else { 1.1559 + uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset /**charsOffset*/, 1, &s, status); 1.1560 + } 1.1561 + if(U_FAILURE(*status)) { 1.1562 + return NULL; 1.1563 + } 1.1564 + 1.1565 + baseCE = ucol_getNextCE(src->UCA, &s, status) & 0xFFFFFF3F; 1.1566 + baseContCE = ucol_getNextCE(src->UCA, &s, status); 1.1567 + if(baseContCE == UCOL_NO_MORE_CES) { 1.1568 + baseContCE = 0; 1.1569 + } 1.1570 + 1.1571 + 1.1572 + UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts); 1.1573 + uint32_t ch = 0; 1.1574 + uint32_t expandNext = 0; 1.1575 + UColToken key; 1.1576 + 1.1577 + if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */ 1.1578 + uint32_t primary = (baseCE & UCOL_PRIMARYMASK) | ((baseContCE & UCOL_PRIMARYMASK) >> 16); 1.1579 + uint32_t raw = uprv_uca_getRawFromImplicit(primary); 1.1580 + ch = uprv_uca_getCodePointFromRaw(raw-1); 1.1581 + uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw-1); 1.1582 + CE = (primaryCE & UCOL_PRIMARYMASK) | 0x0505; 1.1583 + SecondCE = ((primaryCE << 16) & UCOL_PRIMARYMASK) | UCOL_CONTINUATION_MARKER; 1.1584 + 1.1585 + src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source); 1.1586 + *src->extraCurrent++ = 0xFFFE; 1.1587 + *src->extraCurrent++ = (UChar)ch; 1.1588 + src->parsedToken.charsLen++; 1.1589 + 1.1590 + key.source = (src->parsedToken.charsLen/**newCharsLen*/ << 24) | src->parsedToken.charsOffset/**charsOffset*/; 1.1591 + key.rulesToParseHdl = &(src->source); 1.1592 + 1.1593 + //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key); 1.1594 + sourceToken = (UColToken *)uhash_get(src->tailored, &key); 1.1595 + 1.1596 + if(sourceToken == NULL) { 1.1597 + src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F; 1.1598 + if(isContinuation(SecondCE)) { 1.1599 + src->lh[src->resultLen].baseContCE = SecondCE; 1.1600 + } else { 1.1601 + src->lh[src->resultLen].baseContCE = 0; 1.1602 + } 1.1603 + src->lh[src->resultLen].nextCE = 0; 1.1604 + src->lh[src->resultLen].nextContCE = 0; 1.1605 + src->lh[src->resultLen].previousCE = 0; 1.1606 + src->lh[src->resultLen].previousContCE = 0; 1.1607 + 1.1608 + src->lh[src->resultLen].indirect = FALSE; 1.1609 + 1.1610 + sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status); 1.1611 + } 1.1612 + 1.1613 + } else { 1.1614 + /* invPos = */ ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &SecondCE, strength); 1.1615 + 1.1616 + // we got the previous CE. Now we need to see if the difference between 1.1617 + // the two CEs is really of the requested strength. 1.1618 + // if it's a bigger difference (we asked for secondary and got primary), we 1.1619 + // need to modify the CE. 1.1620 + if(ucol_getCEStrengthDifference(baseCE, baseContCE, CE, SecondCE) < strength) { 1.1621 + // adjust the strength 1.1622 + // now we are in the situation where our baseCE should actually be modified in 1.1623 + // order to get the CE in the right position. 1.1624 + if(strength == UCOL_SECONDARY) { 1.1625 + CE = baseCE - 0x0200; 1.1626 + } else { // strength == UCOL_TERTIARY 1.1627 + CE = baseCE - 0x02; 1.1628 + } 1.1629 + if(baseContCE) { 1.1630 + if(strength == UCOL_SECONDARY) { 1.1631 + SecondCE = baseContCE - 0x0200; 1.1632 + } else { // strength == UCOL_TERTIARY 1.1633 + SecondCE = baseContCE - 0x02; 1.1634 + } 1.1635 + } 1.1636 + } 1.1637 + 1.1638 +#if 0 1.1639 + // the code below relies on getting a code point from the inverse table, in order to be 1.1640 + // able to merge the situations like &x < 9 &[before 1]a < d. This won't work: 1.1641 + // 1. There are many code points that have the same CE 1.1642 + // 2. The CE to codepoint table (things pointed to by CETable[3*invPos+2] are broken. 1.1643 + // Also, in case when there is no equivalent strength before an element, we have to actually 1.1644 + // construct one. For example, &[before 2]a << x won't result in x << a, because the element 1.1645 + // before a is a primary difference. 1.1646 + 1.1647 + //uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); 1.1648 + 1.1649 + 1.1650 + ch = CETable[3*invPos+2]; 1.1651 + 1.1652 + if((ch & UCOL_INV_SIZEMASK) != 0) { 1.1653 + uint16_t *conts = (uint16_t *)((uint8_t *)src->invUCA+src->invUCA->conts); 1.1654 + uint32_t offset = (ch & UCOL_INV_OFFSETMASK); 1.1655 + ch = conts[offset]; 1.1656 + } 1.1657 + 1.1658 + *src->extraCurrent++ = (UChar)ch; 1.1659 + src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source - 1); 1.1660 + src->parsedToken.charsLen = 1; 1.1661 + 1.1662 + // We got an UCA before. However, this might have been tailored. 1.1663 + // example: 1.1664 + // &\u30ca = \u306a 1.1665 + // &[before 3]\u306a<<<\u306a|\u309d 1.1666 + 1.1667 + 1.1668 + // uint32_t key = (*newCharsLen << 24) | *charsOffset; 1.1669 + key.source = (src->parsedToken.charsLen/**newCharsLen*/ << 24) | src->parsedToken.charsOffset/**charsOffset*/; 1.1670 + key.rulesToParseHdl = &(src->source); 1.1671 + 1.1672 + //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key); 1.1673 + sourceToken = (UColToken *)uhash_get(src->tailored, &key); 1.1674 +#endif 1.1675 + 1.1676 + // here is how it should be. The situation such as &[before 1]a < x, should be 1.1677 + // resolved exactly as if we wrote &a > x. 1.1678 + // therefore, I don't really care if the UCA value before a has been changed. 1.1679 + // However, I do care if the strength between my element and the previous element 1.1680 + // is bigger then I wanted. So, if CE < baseCE and I wanted &[before 2], then i'll 1.1681 + // have to construct the base CE. 1.1682 + 1.1683 + 1.1684 + 1.1685 + // if we found a tailored thing, we have to use the UCA value and construct 1.1686 + // a new reset token with constructed name 1.1687 + //if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) { 1.1688 + // character to which we want to anchor is already tailored. 1.1689 + // We need to construct a new token which will be the anchor 1.1690 + // point 1.1691 + //*(src->extraCurrent-1) = 0xFFFE; 1.1692 + //*src->extraCurrent++ = (UChar)ch; 1.1693 + // grab before 1.1694 + src->parsedToken.charsOffset -= 10; 1.1695 + src->parsedToken.charsLen += 10; 1.1696 + src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F; 1.1697 + if(isContinuation(SecondCE)) { 1.1698 + src->lh[src->resultLen].baseContCE = SecondCE; 1.1699 + } else { 1.1700 + src->lh[src->resultLen].baseContCE = 0; 1.1701 + } 1.1702 + src->lh[src->resultLen].nextCE = 0; 1.1703 + src->lh[src->resultLen].nextContCE = 0; 1.1704 + src->lh[src->resultLen].previousCE = 0; 1.1705 + src->lh[src->resultLen].previousContCE = 0; 1.1706 + 1.1707 + src->lh[src->resultLen].indirect = FALSE; 1.1708 + 1.1709 + sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status); 1.1710 + //} 1.1711 + } 1.1712 + 1.1713 + return sourceToken; 1.1714 + 1.1715 +} 1.1716 + 1.1717 +uint32_t ucol_tok_assembleTokenList(UColTokenParser *src, UParseError *parseError, UErrorCode *status) { 1.1718 + UColToken *lastToken = NULL; 1.1719 + const UChar *parseEnd = NULL; 1.1720 + uint32_t expandNext = 0; 1.1721 + UBool variableTop = FALSE; 1.1722 + UBool top = FALSE; 1.1723 + uint16_t specs = 0; 1.1724 + UColTokListHeader *ListList = NULL; 1.1725 + 1.1726 + src->parsedToken.strength = UCOL_TOK_UNSET; 1.1727 + 1.1728 + ListList = src->lh; 1.1729 + 1.1730 + if(U_FAILURE(*status)) { 1.1731 + return 0; 1.1732 + } 1.1733 +#ifdef DEBUG_FOR_CODE_POINTS 1.1734 + char filename[35]; 1.1735 + sprintf(filename, "/tmp/debug_for_cp_%09d.txt", getpid()); 1.1736 + dfcp_fp = fopen(filename, "a"); 1.1737 + fprintf(stdout, "Output is in the file %s.\n", filename); 1.1738 +#endif 1.1739 + 1.1740 +#ifdef DEBUG_FOR_COLL_RULES 1.1741 + std::string s3; 1.1742 + UnicodeString(src->source).toUTF8String(s3); 1.1743 + std::cout << "src->source = " << s3 << std::endl; 1.1744 +#endif 1.1745 + 1.1746 + while(src->current < src->end || src->isStarred) { 1.1747 + src->parsedToken.prefixOffset = 0; 1.1748 + 1.1749 + parseEnd = ucol_tok_parseNextToken(src, 1.1750 + (UBool)(lastToken == NULL), 1.1751 + parseError, 1.1752 + status); 1.1753 + 1.1754 + specs = src->parsedToken.flags; 1.1755 + 1.1756 + 1.1757 + variableTop = ((specs & UCOL_TOK_VARIABLE_TOP) != 0); 1.1758 + top = ((specs & UCOL_TOK_TOP) != 0); 1.1759 + 1.1760 + if(U_SUCCESS(*status) && parseEnd != NULL) { 1.1761 + UColToken *sourceToken = NULL; 1.1762 + //uint32_t key = 0; 1.1763 + uint32_t lastStrength = UCOL_TOK_UNSET; 1.1764 + 1.1765 + if(lastToken != NULL ) { 1.1766 + lastStrength = lastToken->strength; 1.1767 + } 1.1768 + 1.1769 +#ifdef DEBUG_FOR_CODE_POINTS 1.1770 + UChar32 cp; 1.1771 + U16_GET(src->source, 0, src->parsedToken.charsOffset, (uint32_t)(src->extraEnd - src->source), cp); 1.1772 + fprintf(dfcp_fp, "Code point = %x, Strength = %x\n", cp, src->parsedToken.strength); 1.1773 +#endif 1.1774 + //key = newCharsLen << 24 | charsOffset; 1.1775 + UColToken key; 1.1776 + key.source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset; 1.1777 + key.rulesToParseHdl = &(src->source); 1.1778 + 1.1779 + /* 4 Lookup each source in the CharsToToken map, and find a sourceToken */ 1.1780 + sourceToken = (UColToken *)uhash_get(src->tailored, &key); 1.1781 + 1.1782 + if(src->parsedToken.strength != UCOL_TOK_RESET) { 1.1783 + if(lastToken == NULL) { /* this means that rules haven't started properly */ 1.1784 + *status = U_INVALID_FORMAT_ERROR; 1.1785 + syntaxError(src->source,0,(int32_t)(src->end-src->source),parseError); 1.1786 + DBG_FORMAT_ERROR 1.1787 + return 0; 1.1788 + } 1.1789 + /* 6 Otherwise (when relation != reset) */ 1.1790 + if(sourceToken == NULL) { 1.1791 + /* If sourceToken is null, create new one, */ 1.1792 + sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken)); 1.1793 + /* test for NULL */ 1.1794 + if (sourceToken == NULL) { 1.1795 + *status = U_MEMORY_ALLOCATION_ERROR; 1.1796 + return 0; 1.1797 + } 1.1798 + sourceToken->rulesToParseHdl = &(src->source); 1.1799 + sourceToken->source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset; 1.1800 + 1.1801 + sourceToken->debugSource = *(src->source + src->parsedToken.charsOffset); 1.1802 + 1.1803 + sourceToken->prefix = src->parsedToken.prefixLen << 24 | src->parsedToken.prefixOffset; 1.1804 + sourceToken->debugPrefix = *(src->source + src->parsedToken.prefixOffset); 1.1805 + 1.1806 + sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should also handle reverse */ 1.1807 + sourceToken->next = NULL; 1.1808 + sourceToken->previous = NULL; 1.1809 + sourceToken->noOfCEs = 0; 1.1810 + sourceToken->noOfExpCEs = 0; 1.1811 + // keep the flags around so that we know about before 1.1812 + sourceToken->flags = src->parsedToken.flags; 1.1813 + uhash_put(src->tailored, sourceToken, sourceToken, status); 1.1814 + if(U_FAILURE(*status)) { 1.1815 + return 0; 1.1816 + } 1.1817 + } else { 1.1818 + /* we could have fished out a reset here */ 1.1819 + if(sourceToken->strength != UCOL_TOK_RESET && lastToken != sourceToken) { 1.1820 + /* otherwise remove sourceToken from where it was. */ 1.1821 + if(sourceToken->next != NULL) { 1.1822 + if(sourceToken->next->strength > sourceToken->strength) { 1.1823 + sourceToken->next->strength = sourceToken->strength; 1.1824 + } 1.1825 + sourceToken->next->previous = sourceToken->previous; 1.1826 + } else { 1.1827 + sourceToken->listHeader->last = sourceToken->previous; 1.1828 + } 1.1829 + 1.1830 + if(sourceToken->previous != NULL) { 1.1831 + sourceToken->previous->next = sourceToken->next; 1.1832 + } else { 1.1833 + sourceToken->listHeader->first = sourceToken->next; 1.1834 + } 1.1835 + sourceToken->next = NULL; 1.1836 + sourceToken->previous = NULL; 1.1837 + } 1.1838 + } 1.1839 + 1.1840 + sourceToken->strength = src->parsedToken.strength; 1.1841 + sourceToken->listHeader = lastToken->listHeader; 1.1842 + 1.1843 + /* 1.1844 + 1. Find the strongest strength in each list, and set strongestP and strongestN 1.1845 + accordingly in the headers. 1.1846 + */ 1.1847 + if(lastStrength == UCOL_TOK_RESET 1.1848 + || sourceToken->listHeader->first == 0) { 1.1849 + /* If LAST is a reset 1.1850 + insert sourceToken in the list. */ 1.1851 + if(sourceToken->listHeader->first == 0) { 1.1852 + sourceToken->listHeader->first = sourceToken; 1.1853 + sourceToken->listHeader->last = sourceToken; 1.1854 + } else { /* we need to find a place for us */ 1.1855 + /* and we'll get in front of the same strength */ 1.1856 + if(sourceToken->listHeader->first->strength <= sourceToken->strength) { 1.1857 + sourceToken->next = sourceToken->listHeader->first; 1.1858 + sourceToken->next->previous = sourceToken; 1.1859 + sourceToken->listHeader->first = sourceToken; 1.1860 + sourceToken->previous = NULL; 1.1861 + } else { 1.1862 + lastToken = sourceToken->listHeader->first; 1.1863 + while(lastToken->next != NULL && lastToken->next->strength > sourceToken->strength) { 1.1864 + lastToken = lastToken->next; 1.1865 + } 1.1866 + if(lastToken->next != NULL) { 1.1867 + lastToken->next->previous = sourceToken; 1.1868 + } else { 1.1869 + sourceToken->listHeader->last = sourceToken; 1.1870 + } 1.1871 + sourceToken->previous = lastToken; 1.1872 + sourceToken->next = lastToken->next; 1.1873 + lastToken->next = sourceToken; 1.1874 + } 1.1875 + } 1.1876 + } else { 1.1877 + /* Otherwise (when LAST is not a reset) 1.1878 + if polarity (LAST) == polarity(relation), insert sourceToken after LAST, 1.1879 + otherwise insert before. 1.1880 + when inserting after or before, search to the next position with the same 1.1881 + strength in that direction. (This is called postpone insertion). */ 1.1882 + if(sourceToken != lastToken) { 1.1883 + if(lastToken->polarity == sourceToken->polarity) { 1.1884 + while(lastToken->next != NULL && lastToken->next->strength > sourceToken->strength) { 1.1885 + lastToken = lastToken->next; 1.1886 + } 1.1887 + sourceToken->previous = lastToken; 1.1888 + if(lastToken->next != NULL) { 1.1889 + lastToken->next->previous = sourceToken; 1.1890 + } else { 1.1891 + sourceToken->listHeader->last = sourceToken; 1.1892 + } 1.1893 + 1.1894 + sourceToken->next = lastToken->next; 1.1895 + lastToken->next = sourceToken; 1.1896 + } else { 1.1897 + while(lastToken->previous != NULL && lastToken->previous->strength > sourceToken->strength) { 1.1898 + lastToken = lastToken->previous; 1.1899 + } 1.1900 + sourceToken->next = lastToken; 1.1901 + if(lastToken->previous != NULL) { 1.1902 + lastToken->previous->next = sourceToken; 1.1903 + } else { 1.1904 + sourceToken->listHeader->first = sourceToken; 1.1905 + } 1.1906 + sourceToken->previous = lastToken->previous; 1.1907 + lastToken->previous = sourceToken; 1.1908 + } 1.1909 + } else { /* repeated one thing twice in rules, stay with the stronger strength */ 1.1910 + if(lastStrength < sourceToken->strength) { 1.1911 + sourceToken->strength = lastStrength; 1.1912 + } 1.1913 + } 1.1914 + } 1.1915 + 1.1916 + /* if the token was a variable top, we're gonna put it in */ 1.1917 + if(variableTop == TRUE && src->varTop == NULL) { 1.1918 + variableTop = FALSE; 1.1919 + src->varTop = sourceToken; 1.1920 + } 1.1921 + 1.1922 + // Treat the expansions. 1.1923 + // There are two types of expansions: explicit (x / y) and reset based propagating expansions 1.1924 + // (&abc * d * e <=> &ab * d / c * e / c) 1.1925 + // if both of them are in effect for a token, they are combined. 1.1926 + 1.1927 + sourceToken->expansion = src->parsedToken.extensionLen << 24 | src->parsedToken.extensionOffset; 1.1928 + 1.1929 + if(expandNext != 0) { 1.1930 + if(sourceToken->strength == UCOL_PRIMARY) { /* primary strength kills off the implicit expansion */ 1.1931 + expandNext = 0; 1.1932 + } else if(sourceToken->expansion == 0) { /* if there is no expansion, implicit is just added to the token */ 1.1933 + sourceToken->expansion = expandNext; 1.1934 + } else { /* there is both explicit and implicit expansion. We need to make a combination */ 1.1935 + uprv_memcpy(src->extraCurrent, src->source + (expandNext & 0xFFFFFF), (expandNext >> 24)*sizeof(UChar)); 1.1936 + uprv_memcpy(src->extraCurrent+(expandNext >> 24), src->source + src->parsedToken.extensionOffset, src->parsedToken.extensionLen*sizeof(UChar)); 1.1937 + sourceToken->expansion = (uint32_t)(((expandNext >> 24) + src->parsedToken.extensionLen)<<24 | (uint32_t)(src->extraCurrent - src->source)); 1.1938 + src->extraCurrent += (expandNext >> 24) + src->parsedToken.extensionLen; 1.1939 + } 1.1940 + } 1.1941 + 1.1942 + // This is just for debugging purposes 1.1943 + if(sourceToken->expansion != 0) { 1.1944 + sourceToken->debugExpansion = *(src->source + src->parsedToken.extensionOffset); 1.1945 + } else { 1.1946 + sourceToken->debugExpansion = 0; 1.1947 + } 1.1948 + // if the previous token was a reset before, the strength of this 1.1949 + // token must match the strength of before. Otherwise we have an 1.1950 + // undefined situation. 1.1951 + // In other words, we currently have a cludge which we use to 1.1952 + // represent &a >> x. This is written as &[before 2]a << x. 1.1953 + if((lastToken->flags & UCOL_TOK_BEFORE) != 0) { 1.1954 + uint8_t beforeStrength = (lastToken->flags & UCOL_TOK_BEFORE) - 1; 1.1955 + if(beforeStrength != sourceToken->strength) { 1.1956 + *status = U_INVALID_FORMAT_ERROR; 1.1957 + syntaxError(src->source,0,(int32_t)(src->end-src->source),parseError); 1.1958 + DBG_FORMAT_ERROR 1.1959 + return 0; 1.1960 + } 1.1961 + } 1.1962 + } else { 1.1963 + if(lastToken != NULL && lastStrength == UCOL_TOK_RESET) { 1.1964 + /* if the previous token was also a reset, */ 1.1965 + /*this means that we have two consecutive resets */ 1.1966 + /* and we want to remove the previous one if empty*/ 1.1967 + if(src->resultLen > 0 && ListList[src->resultLen-1].first == NULL) { 1.1968 + src->resultLen--; 1.1969 + } 1.1970 + } 1.1971 + 1.1972 + if(sourceToken == NULL) { /* this is a reset, but it might still be somewhere in the tailoring, in shorter form */ 1.1973 + uint32_t searchCharsLen = src->parsedToken.charsLen; 1.1974 + while(searchCharsLen > 1 && sourceToken == NULL) { 1.1975 + searchCharsLen--; 1.1976 + //key = searchCharsLen << 24 | charsOffset; 1.1977 + UColToken key; 1.1978 + key.source = searchCharsLen << 24 | src->parsedToken.charsOffset; 1.1979 + key.rulesToParseHdl = &(src->source); 1.1980 + sourceToken = (UColToken *)uhash_get(src->tailored, &key); 1.1981 + } 1.1982 + if(sourceToken != NULL) { 1.1983 + expandNext = (src->parsedToken.charsLen - searchCharsLen) << 24 | (src->parsedToken.charsOffset + searchCharsLen); 1.1984 + } 1.1985 + } 1.1986 + 1.1987 + if((specs & UCOL_TOK_BEFORE) != 0) { /* we're doing before */ 1.1988 + if(top == FALSE) { /* there is no indirection */ 1.1989 + uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1; 1.1990 + if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) { 1.1991 + /* this is a before that is already ordered in the UCA - so we need to get the previous with good strength */ 1.1992 + while(sourceToken->strength > strength && sourceToken->previous != NULL) { 1.1993 + sourceToken = sourceToken->previous; 1.1994 + } 1.1995 + /* here, either we hit the strength or NULL */ 1.1996 + if(sourceToken->strength == strength) { 1.1997 + if(sourceToken->previous != NULL) { 1.1998 + sourceToken = sourceToken->previous; 1.1999 + } else { /* start of list */ 1.2000 + sourceToken = sourceToken->listHeader->reset; 1.2001 + } 1.2002 + } else { /* we hit NULL */ 1.2003 + /* we should be doing the else part */ 1.2004 + sourceToken = sourceToken->listHeader->reset; 1.2005 + sourceToken = getVirginBefore(src, sourceToken, strength, parseError, status); 1.2006 + } 1.2007 + } else { 1.2008 + sourceToken = getVirginBefore(src, sourceToken, strength, parseError, status); 1.2009 + } 1.2010 + } else { /* this is both before and indirection */ 1.2011 + top = FALSE; 1.2012 + ListList[src->resultLen].previousCE = 0; 1.2013 + ListList[src->resultLen].previousContCE = 0; 1.2014 + ListList[src->resultLen].indirect = TRUE; 1.2015 + /* we need to do slightly more work. we need to get the baseCE using the */ 1.2016 + /* inverse UCA & getPrevious. The next bound is not set, and will be decided */ 1.2017 + /* in ucol_bld */ 1.2018 + uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1; 1.2019 + uint32_t baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE; 1.2020 + uint32_t baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE;//&0xFFFFFF3F; 1.2021 + uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND; 1.2022 + 1.2023 + UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts); 1.2024 + if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && 1.2025 + (baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */ 1.2026 + uint32_t primary = (baseCE & UCOL_PRIMARYMASK) | ((baseContCE & UCOL_PRIMARYMASK) >> 16); 1.2027 + uint32_t raw = uprv_uca_getRawFromImplicit(primary); 1.2028 + uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw-1); 1.2029 + CE = (primaryCE & UCOL_PRIMARYMASK) | 0x0505; 1.2030 + SecondCE = ((primaryCE << 16) & UCOL_PRIMARYMASK) | UCOL_CONTINUATION_MARKER; 1.2031 + } else { 1.2032 + /*int32_t invPos = ucol_inv_getPrevCE(baseCE, baseContCE, &CE, &SecondCE, strength);*/ 1.2033 + ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &SecondCE, strength); 1.2034 + } 1.2035 + 1.2036 + ListList[src->resultLen].baseCE = CE; 1.2037 + ListList[src->resultLen].baseContCE = SecondCE; 1.2038 + ListList[src->resultLen].nextCE = 0; 1.2039 + ListList[src->resultLen].nextContCE = 0; 1.2040 + 1.2041 + sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status); 1.2042 + } 1.2043 + } 1.2044 + 1.2045 + 1.2046 + /* 5 If the relation is a reset: 1.2047 + If sourceToken is null 1.2048 + Create new list, create new sourceToken, make the baseCE from source, put 1.2049 + the sourceToken in ListHeader of the new list */ 1.2050 + if(sourceToken == NULL) { 1.2051 + /* 1.2052 + 3 Consider each item: relation, source, and expansion: e.g. ...< x / y ... 1.2053 + First convert all expansions into normal form. Examples: 1.2054 + If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c * 1.2055 + d * ... into &x * c/y * d * ... 1.2056 + Note: reset values can never have expansions, although they can cause the 1.2057 + very next item to have one. They may be contractions, if they are found 1.2058 + earlier in the list. 1.2059 + */ 1.2060 + if(top == FALSE) { 1.2061 + collIterate s; 1.2062 + uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND; 1.2063 + 1.2064 + uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset, src->parsedToken.charsLen, &s, status); 1.2065 + 1.2066 + CE = ucol_getNextCE(src->UCA, &s, status); 1.2067 + const UChar *expand = s.pos; 1.2068 + SecondCE = ucol_getNextCE(src->UCA, &s, status); 1.2069 + 1.2070 + ListList[src->resultLen].baseCE = CE & 0xFFFFFF3F; 1.2071 + if(isContinuation(SecondCE)) { 1.2072 + ListList[src->resultLen].baseContCE = SecondCE; 1.2073 + } else { 1.2074 + ListList[src->resultLen].baseContCE = 0; 1.2075 + } 1.2076 + ListList[src->resultLen].nextCE = 0; 1.2077 + ListList[src->resultLen].nextContCE = 0; 1.2078 + ListList[src->resultLen].previousCE = 0; 1.2079 + ListList[src->resultLen].previousContCE = 0; 1.2080 + ListList[src->resultLen].indirect = FALSE; 1.2081 + sourceToken = ucol_tok_initAReset(src, expand, &expandNext, parseError, status); 1.2082 + } else { /* top == TRUE */ 1.2083 + /* just use the supplied values */ 1.2084 + top = FALSE; 1.2085 + ListList[src->resultLen].previousCE = 0; 1.2086 + ListList[src->resultLen].previousContCE = 0; 1.2087 + ListList[src->resultLen].indirect = TRUE; 1.2088 + ListList[src->resultLen].baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE; 1.2089 + ListList[src->resultLen].baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE; 1.2090 + ListList[src->resultLen].nextCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitCE; 1.2091 + ListList[src->resultLen].nextContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitContCE; 1.2092 + 1.2093 + sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status); 1.2094 + 1.2095 + } 1.2096 + } else { /* reset to something already in rules */ 1.2097 + top = FALSE; 1.2098 + } 1.2099 + } 1.2100 + /* 7 After all this, set LAST to point to sourceToken, and goto step 3. */ 1.2101 + lastToken = sourceToken; 1.2102 + } else { 1.2103 + if(U_FAILURE(*status)) { 1.2104 + return 0; 1.2105 + } 1.2106 + } 1.2107 + } 1.2108 +#ifdef DEBUG_FOR_CODE_POINTS 1.2109 + fclose(dfcp_fp); 1.2110 +#endif 1.2111 + 1.2112 + 1.2113 + if(src->resultLen > 0 && ListList[src->resultLen-1].first == NULL) { 1.2114 + src->resultLen--; 1.2115 + } 1.2116 + return src->resultLen; 1.2117 +} 1.2118 + 1.2119 +const UChar* ucol_tok_getRulesFromBundle( 1.2120 + void* /*context*/, 1.2121 + const char* locale, 1.2122 + const char* type, 1.2123 + int32_t* pLength, 1.2124 + UErrorCode* status) 1.2125 +{ 1.2126 + const UChar* rules = NULL; 1.2127 + UResourceBundle* bundle; 1.2128 + UResourceBundle* collations; 1.2129 + UResourceBundle* collation; 1.2130 + 1.2131 + *pLength = 0; 1.2132 + 1.2133 + bundle = ures_open(U_ICUDATA_COLL, locale, status); 1.2134 + if(U_SUCCESS(*status)){ 1.2135 + collations = ures_getByKey(bundle, "collations", NULL, status); 1.2136 + if(U_SUCCESS(*status)){ 1.2137 + collation = ures_getByKey(collations, type, NULL, status); 1.2138 + if(U_SUCCESS(*status)){ 1.2139 + rules = ures_getStringByKey(collation, "Sequence", pLength, status); 1.2140 + if(U_FAILURE(*status)){ 1.2141 + *pLength = 0; 1.2142 + rules = NULL; 1.2143 + } 1.2144 + ures_close(collation); 1.2145 + } 1.2146 + ures_close(collations); 1.2147 + } 1.2148 + } 1.2149 + 1.2150 + ures_close(bundle); 1.2151 + 1.2152 + return rules; 1.2153 +} 1.2154 + 1.2155 +void ucol_tok_initTokenList( 1.2156 + UColTokenParser *src, 1.2157 + const UChar *rules, 1.2158 + uint32_t rulesLength, 1.2159 + const UCollator *UCA, 1.2160 + GetCollationRulesFunction importFunc, 1.2161 + void* context, 1.2162 + UErrorCode *status) { 1.2163 + U_NAMESPACE_USE 1.2164 + 1.2165 + uint32_t nSize = 0; 1.2166 + uint32_t estimatedSize = (2*rulesLength+UCOL_TOK_EXTRA_RULE_SPACE_SIZE); 1.2167 + 1.2168 + bool needToDeallocRules = false; 1.2169 + 1.2170 + if(U_FAILURE(*status)) { 1.2171 + return; 1.2172 + } 1.2173 + 1.2174 + // set everything to zero, so that we can clean up gracefully 1.2175 + uprv_memset(src, 0, sizeof(UColTokenParser)); 1.2176 + 1.2177 + // first we need to find options that don't like to be normalized, 1.2178 + // like copy and remove... 1.2179 + //const UChar *openBrace = rules; 1.2180 + int32_t optionNumber = -1; 1.2181 + const UChar *setStart = NULL; 1.2182 + uint32_t i = 0; 1.2183 + while(i < rulesLength) { 1.2184 + if(rules[i] == 0x005B) { // '[': start of an option 1.2185 + /* Gets the following: 1.2186 + optionNumber: The index of the option. 1.2187 + setStart: The pointer at which the option arguments start. 1.2188 + */ 1.2189 + optionNumber = ucol_uprv_tok_readOption(rules+i+1, rules+rulesLength, &setStart); 1.2190 + 1.2191 + if(optionNumber == OPTION_OPTIMIZE) { /* copy - parts of UCA to tailoring */ 1.2192 + // [optimize] 1.2193 + USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rules+rulesLength, status); 1.2194 + if(U_SUCCESS(*status)) { 1.2195 + if(src->copySet == NULL) { 1.2196 + src->copySet = newSet; 1.2197 + } else { 1.2198 + uset_addAll(src->copySet, newSet); 1.2199 + uset_close(newSet); 1.2200 + } 1.2201 + } else { 1.2202 + return; 1.2203 + } 1.2204 + } else if(optionNumber == OPTION_SUPPRESS_CONTRACTIONS) { 1.2205 + USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rules+rulesLength, status); 1.2206 + if(U_SUCCESS(*status)) { 1.2207 + if(src->removeSet == NULL) { 1.2208 + src->removeSet = newSet; 1.2209 + } else { 1.2210 + uset_addAll(src->removeSet, newSet); 1.2211 + uset_close(newSet); 1.2212 + } 1.2213 + } else { 1.2214 + return; 1.2215 + } 1.2216 + } else if(optionNumber == OPTION_IMPORT){ 1.2217 + // [import <collation-name>] 1.2218 + 1.2219 + // Find the address of the closing ]. 1.2220 + UChar* import_end = u_strchr(setStart, 0x005D); 1.2221 + int32_t optionEndOffset = (int32_t)(import_end + 1 - rules); 1.2222 + // Ignore trailing whitespace. 1.2223 + while(PatternProps::isWhiteSpace(*(import_end-1))) { 1.2224 + --import_end; 1.2225 + } 1.2226 + 1.2227 + int32_t optionLength = (int32_t)(import_end - setStart); 1.2228 + char option[50]; 1.2229 + if(optionLength >= (int32_t)sizeof(option)) { 1.2230 + *status = U_ILLEGAL_ARGUMENT_ERROR; 1.2231 + return; 1.2232 + } 1.2233 + u_UCharsToChars(setStart, option, optionLength); 1.2234 + option[optionLength] = 0; 1.2235 + 1.2236 + *status = U_ZERO_ERROR; 1.2237 + char locale[50]; 1.2238 + int32_t templ; 1.2239 + uloc_forLanguageTag(option, locale, (int32_t)sizeof(locale), &templ, status); 1.2240 + if(U_FAILURE(*status)) { 1.2241 + *status = U_ILLEGAL_ARGUMENT_ERROR; 1.2242 + return; 1.2243 + } 1.2244 + 1.2245 + char type[50]; 1.2246 + if (uloc_getKeywordValue(locale, "collation", type, (int32_t)sizeof(type), status) <= 0 || 1.2247 + U_FAILURE(*status) 1.2248 + ) { 1.2249 + *status = U_ZERO_ERROR; 1.2250 + uprv_strcpy(type, "standard"); 1.2251 + } 1.2252 + 1.2253 + // TODO: Use public functions when available, see ticket #8134. 1.2254 + char *keywords = (char *)locale_getKeywordsStart(locale); 1.2255 + if(keywords != NULL) { 1.2256 + *keywords = 0; 1.2257 + } 1.2258 + 1.2259 + int32_t importRulesLength = 0; 1.2260 + const UChar* importRules = importFunc(context, locale, type, &importRulesLength, status); 1.2261 + 1.2262 +#ifdef DEBUG_FOR_COLL_RULES 1.2263 + std::string s; 1.2264 + UnicodeString(importRules).toUTF8String(s); 1.2265 + std::cout << "Import rules = " << s << std::endl; 1.2266 +#endif 1.2267 + 1.2268 + // Add the length of the imported rules to length of the original rules, 1.2269 + // and subtract the length of the import option. 1.2270 + uint32_t newRulesLength = rulesLength + importRulesLength - (optionEndOffset - i); 1.2271 + 1.2272 + UChar* newRules = (UChar*)uprv_malloc(newRulesLength*sizeof(UChar)); 1.2273 + 1.2274 +#ifdef DEBUG_FOR_COLL_RULES 1.2275 + std::string s1; 1.2276 + UnicodeString(rules).toUTF8String(s1); 1.2277 + std::cout << "Original rules = " << s1 << std::endl; 1.2278 +#endif 1.2279 + 1.2280 + 1.2281 + // Copy the section of the original rules leading up to the import 1.2282 + uprv_memcpy(newRules, rules, i*sizeof(UChar)); 1.2283 + // Copy the imported rules 1.2284 + uprv_memcpy(newRules+i, importRules, importRulesLength*sizeof(UChar)); 1.2285 + // Copy the rest of the original rules (minus the import option itself) 1.2286 + uprv_memcpy(newRules+i+importRulesLength, 1.2287 + rules+optionEndOffset, 1.2288 + (rulesLength-optionEndOffset)*sizeof(UChar)); 1.2289 + 1.2290 +#ifdef DEBUG_FOR_COLL_RULES 1.2291 + std::string s2; 1.2292 + UnicodeString(newRules).toUTF8String(s2); 1.2293 + std::cout << "Resulting rules = " << s2 << std::endl; 1.2294 +#endif 1.2295 + 1.2296 + if(needToDeallocRules){ 1.2297 + // if needToDeallocRules is set, then we allocated rules, so it's safe to cast and free 1.2298 + uprv_free((void*)rules); 1.2299 + } 1.2300 + needToDeallocRules = true; 1.2301 + rules = newRules; 1.2302 + rulesLength = newRulesLength; 1.2303 + 1.2304 + estimatedSize += importRulesLength*2; 1.2305 + 1.2306 + // First character of the new rules needs to be processed 1.2307 + i--; 1.2308 + } 1.2309 + } 1.2310 + //openBrace++; 1.2311 + i++; 1.2312 + } 1.2313 + 1.2314 + src->source = (UChar *)uprv_malloc(estimatedSize*sizeof(UChar)); 1.2315 + /* test for NULL */ 1.2316 + if (src->source == NULL) { 1.2317 + *status = U_MEMORY_ALLOCATION_ERROR; 1.2318 + return; 1.2319 + } 1.2320 + uprv_memset(src->source, 0, estimatedSize*sizeof(UChar)); 1.2321 + nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, estimatedSize, status); 1.2322 + if(nSize > estimatedSize || *status == U_BUFFER_OVERFLOW_ERROR) { 1.2323 + *status = U_ZERO_ERROR; 1.2324 + src->source = (UChar *)uprv_realloc(src->source, (nSize+UCOL_TOK_EXTRA_RULE_SPACE_SIZE)*sizeof(UChar)); 1.2325 + /* test for NULL */ 1.2326 + if (src->source == NULL) { 1.2327 + *status = U_MEMORY_ALLOCATION_ERROR; 1.2328 + return; 1.2329 + } 1.2330 + nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, nSize+UCOL_TOK_EXTRA_RULE_SPACE_SIZE, status); 1.2331 + } 1.2332 + if(needToDeallocRules){ 1.2333 + // if needToDeallocRules is set, then we allocated rules, so it's safe to cast and free 1.2334 + uprv_free((void*)rules); 1.2335 + } 1.2336 + 1.2337 + 1.2338 + src->current = src->source; 1.2339 + src->end = src->source+nSize; 1.2340 + src->sourceCurrent = src->source; 1.2341 + src->extraCurrent = src->end+1; // Preserve terminating zero in the rule string so that option scanning works correctly 1.2342 + src->extraEnd = src->source+estimatedSize; //src->end+UCOL_TOK_EXTRA_RULE_SPACE_SIZE; 1.2343 + src->varTop = NULL; 1.2344 + src->UCA = UCA; 1.2345 + src->invUCA = ucol_initInverseUCA(status); 1.2346 + src->parsedToken.charsLen = 0; 1.2347 + src->parsedToken.charsOffset = 0; 1.2348 + src->parsedToken.extensionLen = 0; 1.2349 + src->parsedToken.extensionOffset = 0; 1.2350 + src->parsedToken.prefixLen = 0; 1.2351 + src->parsedToken.prefixOffset = 0; 1.2352 + src->parsedToken.flags = 0; 1.2353 + src->parsedToken.strength = UCOL_TOK_UNSET; 1.2354 + src->buildCCTabFlag = FALSE; 1.2355 + src->isStarred = FALSE; 1.2356 + src->inRange = FALSE; 1.2357 + src->lastRangeCp = 0; 1.2358 + src->previousCp = 0; 1.2359 + 1.2360 + if(U_FAILURE(*status)) { 1.2361 + return; 1.2362 + } 1.2363 + src->tailored = uhash_open(uhash_hashTokens, uhash_compareTokens, NULL, status); 1.2364 + if(U_FAILURE(*status)) { 1.2365 + return; 1.2366 + } 1.2367 + uhash_setValueDeleter(src->tailored, uprv_free); 1.2368 + 1.2369 + src->opts = (UColOptionSet *)uprv_malloc(sizeof(UColOptionSet)); 1.2370 + /* test for NULL */ 1.2371 + if (src->opts == NULL) { 1.2372 + *status = U_MEMORY_ALLOCATION_ERROR; 1.2373 + return; 1.2374 + } 1.2375 + 1.2376 + uprv_memcpy(src->opts, UCA->options, sizeof(UColOptionSet)); 1.2377 + 1.2378 + src->lh = 0; 1.2379 + src->listCapacity = 1024; 1.2380 + src->lh = (UColTokListHeader *)uprv_malloc(src->listCapacity*sizeof(UColTokListHeader)); 1.2381 + //Test for NULL 1.2382 + if (src->lh == NULL) { 1.2383 + *status = U_MEMORY_ALLOCATION_ERROR; 1.2384 + return; 1.2385 + } 1.2386 + uprv_memset(src->lh, 0, src->listCapacity*sizeof(UColTokListHeader)); 1.2387 + src->resultLen = 0; 1.2388 + 1.2389 + UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts); 1.2390 + 1.2391 + // UCOL_RESET_TOP_VALUE 1.2392 + setIndirectBoundaries(0, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IMPLICIT); 1.2393 + // UCOL_FIRST_PRIMARY_IGNORABLE 1.2394 + setIndirectBoundaries(1, consts->UCA_FIRST_PRIMARY_IGNORABLE, 0); 1.2395 + // UCOL_LAST_PRIMARY_IGNORABLE 1.2396 + setIndirectBoundaries(2, consts->UCA_LAST_PRIMARY_IGNORABLE, 0); 1.2397 + // UCOL_FIRST_SECONDARY_IGNORABLE 1.2398 + setIndirectBoundaries(3, consts->UCA_FIRST_SECONDARY_IGNORABLE, 0); 1.2399 + // UCOL_LAST_SECONDARY_IGNORABLE 1.2400 + setIndirectBoundaries(4, consts->UCA_LAST_SECONDARY_IGNORABLE, 0); 1.2401 + // UCOL_FIRST_TERTIARY_IGNORABLE 1.2402 + setIndirectBoundaries(5, consts->UCA_FIRST_TERTIARY_IGNORABLE, 0); 1.2403 + // UCOL_LAST_TERTIARY_IGNORABLE 1.2404 + setIndirectBoundaries(6, consts->UCA_LAST_TERTIARY_IGNORABLE, 0); 1.2405 + // UCOL_FIRST_VARIABLE 1.2406 + setIndirectBoundaries(7, consts->UCA_FIRST_VARIABLE, 0); 1.2407 + // UCOL_LAST_VARIABLE 1.2408 + setIndirectBoundaries(8, consts->UCA_LAST_VARIABLE, 0); 1.2409 + // UCOL_FIRST_NON_VARIABLE 1.2410 + setIndirectBoundaries(9, consts->UCA_FIRST_NON_VARIABLE, 0); 1.2411 + // UCOL_LAST_NON_VARIABLE 1.2412 + setIndirectBoundaries(10, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IMPLICIT); 1.2413 + // UCOL_FIRST_IMPLICIT 1.2414 + setIndirectBoundaries(11, consts->UCA_FIRST_IMPLICIT, 0); 1.2415 + // UCOL_LAST_IMPLICIT 1.2416 + setIndirectBoundaries(12, consts->UCA_LAST_IMPLICIT, consts->UCA_FIRST_TRAILING); 1.2417 + // UCOL_FIRST_TRAILING 1.2418 + setIndirectBoundaries(13, consts->UCA_FIRST_TRAILING, 0); 1.2419 + // UCOL_LAST_TRAILING 1.2420 + setIndirectBoundaries(14, consts->UCA_LAST_TRAILING, 0); 1.2421 + ucolIndirectBoundaries[14].limitCE = (consts->UCA_PRIMARY_SPECIAL_MIN<<24); 1.2422 +} 1.2423 + 1.2424 + 1.2425 +void ucol_tok_closeTokenList(UColTokenParser *src) { 1.2426 + if(src->copySet != NULL) { 1.2427 + uset_close(src->copySet); 1.2428 + } 1.2429 + if(src->removeSet != NULL) { 1.2430 + uset_close(src->removeSet); 1.2431 + } 1.2432 + if(src->tailored != NULL) { 1.2433 + uhash_close(src->tailored); 1.2434 + } 1.2435 + if(src->lh != NULL) { 1.2436 + uprv_free(src->lh); 1.2437 + } 1.2438 + if(src->source != NULL) { 1.2439 + uprv_free(src->source); 1.2440 + } 1.2441 + if(src->opts != NULL) { 1.2442 + uprv_free(src->opts); 1.2443 + } 1.2444 + if (src->reorderCodes != NULL) { 1.2445 + uprv_free(src->reorderCodes); 1.2446 + } 1.2447 +} 1.2448 + 1.2449 +#endif /* #if !UCONFIG_NO_COLLATION */