intl/icu/source/i18n/ucol_tok.cpp

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

michael@0 1 /*
michael@0 2 *******************************************************************************
michael@0 3 *
michael@0 4 * Copyright (C) 2001-2012, International Business Machines
michael@0 5 * Corporation and others. All Rights Reserved.
michael@0 6 *
michael@0 7 *******************************************************************************
michael@0 8 * file name: ucol_tok.cpp
michael@0 9 * encoding: US-ASCII
michael@0 10 * tab size: 8 (not used)
michael@0 11 * indentation:4
michael@0 12 *
michael@0 13 * created 02/22/2001
michael@0 14 * created by: Vladimir Weinstein
michael@0 15 *
michael@0 16 * This module reads a tailoring rule string and produces a list of
michael@0 17 * tokens that will be turned into collation elements
michael@0 18 *
michael@0 19 */
michael@0 20
michael@0 21 #include "unicode/utypes.h"
michael@0 22
michael@0 23 #if !UCONFIG_NO_COLLATION
michael@0 24
michael@0 25 #include "unicode/uscript.h"
michael@0 26 #include "unicode/ustring.h"
michael@0 27 #include "unicode/uchar.h"
michael@0 28 #include "unicode/uniset.h"
michael@0 29
michael@0 30 #include "cmemory.h"
michael@0 31 #include "cstring.h"
michael@0 32 #include "patternprops.h"
michael@0 33 #include "ucol_bld.h"
michael@0 34 #include "ucol_tok.h"
michael@0 35 #include "ulocimp.h"
michael@0 36 #include "uresimp.h"
michael@0 37
michael@0 38 // Define this only for debugging.
michael@0 39 // #define DEBUG_FOR_COLL_RULES 1
michael@0 40
michael@0 41 #ifdef DEBUG_FOR_COLL_RULES
michael@0 42 #include <iostream>
michael@0 43 #endif
michael@0 44
michael@0 45 U_NAMESPACE_USE
michael@0 46
michael@0 47 U_CDECL_BEGIN
michael@0 48 static int32_t U_CALLCONV
michael@0 49 uhash_hashTokens(const UHashTok k)
michael@0 50 {
michael@0 51 int32_t hash = 0;
michael@0 52 //uint32_t key = (uint32_t)k.integer;
michael@0 53 UColToken *key = (UColToken *)k.pointer;
michael@0 54 if (key != 0) {
michael@0 55 int32_t len = (key->source & 0xFF000000)>>24;
michael@0 56 int32_t inc = ((len - 32) / 32) + 1;
michael@0 57
michael@0 58 const UChar *p = (key->source & 0x00FFFFFF) + *(key->rulesToParseHdl);
michael@0 59 const UChar *limit = p + len;
michael@0 60
michael@0 61 while (p<limit) {
michael@0 62 hash = (hash * 37) + *p;
michael@0 63 p += inc;
michael@0 64 }
michael@0 65 }
michael@0 66 return hash;
michael@0 67 }
michael@0 68
michael@0 69 static UBool U_CALLCONV
michael@0 70 uhash_compareTokens(const UHashTok key1, const UHashTok key2)
michael@0 71 {
michael@0 72 //uint32_t p1 = (uint32_t) key1.integer;
michael@0 73 //uint32_t p2 = (uint32_t) key2.integer;
michael@0 74 UColToken *p1 = (UColToken *)key1.pointer;
michael@0 75 UColToken *p2 = (UColToken *)key2.pointer;
michael@0 76 const UChar *s1 = (p1->source & 0x00FFFFFF) + *(p1->rulesToParseHdl);
michael@0 77 const UChar *s2 = (p2->source & 0x00FFFFFF) + *(p2->rulesToParseHdl);
michael@0 78 uint32_t s1L = ((p1->source & 0xFF000000) >> 24);
michael@0 79 uint32_t s2L = ((p2->source & 0xFF000000) >> 24);
michael@0 80 const UChar *end = s1+s1L-1;
michael@0 81
michael@0 82 if (p1 == p2) {
michael@0 83 return TRUE;
michael@0 84 }
michael@0 85 if (p1->source == 0 || p2->source == 0) {
michael@0 86 return FALSE;
michael@0 87 }
michael@0 88 if(s1L != s2L) {
michael@0 89 return FALSE;
michael@0 90 }
michael@0 91 if(p1->source == p2->source) {
michael@0 92 return TRUE;
michael@0 93 }
michael@0 94 while((s1 < end) && *s1 == *s2) {
michael@0 95 ++s1;
michael@0 96 ++s2;
michael@0 97 }
michael@0 98 if(*s1 == *s2) {
michael@0 99 return TRUE;
michael@0 100 } else {
michael@0 101 return FALSE;
michael@0 102 }
michael@0 103 }
michael@0 104 U_CDECL_END
michael@0 105
/*
 * Debug messages used to pinpoint where a format error occurred.
 * (A better approach would be to include context-sensitive information
 * in the syntaxError() function.)
 *
 * To turn this debugging on, either uncomment the #define below or pass
 * -DDEBUG_FOR_FORMAT_ERROR on the compile line.
 */
/* #define DEBUG_FOR_FORMAT_ERROR 1 */

#ifndef DEBUG_FOR_FORMAT_ERROR
#define DBG_FORMAT_ERROR
#else
#define DBG_FORMAT_ERROR { printf("U_INVALID_FORMAT_ERROR at line %d", __LINE__);}
#endif
michael@0 120
michael@0 121
/*
 * Controls debug output that lets the parser's results be compared before
 * and after a big change. It prints every code point that comes out of the
 * collation parser, together with its strength, into a file. When a big
 * change in format happens, the files produced before and after the change
 * should be identical.
 *
 * To turn this debugging on, either uncomment the #define below or pass
 * -DDEBUG_FOR_CODE_POINTS on the compile line.
 */
// #define DEBUG_FOR_CODE_POINTS 1

#if defined(DEBUG_FOR_CODE_POINTS)
    FILE* dfcp_fp = NULL;
#endif
michael@0 136
michael@0 137
michael@0 138 typedef struct {
michael@0 139 uint32_t startCE;
michael@0 140 uint32_t startContCE;
michael@0 141 uint32_t limitCE;
michael@0 142 uint32_t limitContCE;
michael@0 143 } indirectBoundaries;
michael@0 144
michael@0 145 /* these values are used for finding CE values for indirect positioning. */
michael@0 146 /* Indirect positioning is a mechanism for allowing resets on symbolic */
michael@0 147 /* values. It only works for resets and you cannot tailor indirect names */
michael@0 148 /* An indirect name can define either an anchor point or a range. An */
michael@0 149 /* anchor point behaves in exactly the same way as a code point in reset */
michael@0 150 /* would, except that it cannot be tailored. A range (we currently only */
michael@0 151 /* know for the [top] range will explicitly set the upper bound for */
michael@0 152 /* generated CEs, thus allowing for better control over how many CEs can */
michael@0 153 /* be squeezed between in the range without performance penalty. */
michael@0 154 /* In that respect, we use [top] for tailoring of locales that use CJK */
michael@0 155 /* characters. Other indirect values are currently a pure convenience, */
michael@0 156 /* they can be used to assure that the CEs will be always positioned in */
michael@0 157 /* the same place relative to a point with known properties (e.g. first */
michael@0 158 /* primary ignorable). */
michael@0 159 static indirectBoundaries ucolIndirectBoundaries[15];
michael@0 160 /*
michael@0 161 static indirectBoundaries ucolIndirectBoundaries[11] = {
michael@0 162 { UCOL_RESET_TOP_VALUE, 0,
michael@0 163 UCOL_NEXT_TOP_VALUE, 0 },
michael@0 164 { UCOL_FIRST_PRIMARY_IGNORABLE, 0,
michael@0 165 0, 0 },
michael@0 166 { UCOL_LAST_PRIMARY_IGNORABLE, UCOL_LAST_PRIMARY_IGNORABLE_CONT,
michael@0 167 0, 0 },
michael@0 168 { UCOL_FIRST_SECONDARY_IGNORABLE, 0,
michael@0 169 0, 0 },
michael@0 170 { UCOL_LAST_SECONDARY_IGNORABLE, 0,
michael@0 171 0, 0 },
michael@0 172 { UCOL_FIRST_TERTIARY_IGNORABLE, 0,
michael@0 173 0, 0 },
michael@0 174 { UCOL_LAST_TERTIARY_IGNORABLE, 0,
michael@0 175 0, 0 },
michael@0 176 { UCOL_FIRST_VARIABLE, 0,
michael@0 177 0, 0 },
michael@0 178 { UCOL_LAST_VARIABLE, 0,
michael@0 179 0, 0 },
michael@0 180 { UCOL_FIRST_NON_VARIABLE, 0,
michael@0 181 0, 0 },
michael@0 182 { UCOL_LAST_NON_VARIABLE, 0,
michael@0 183 0, 0 },
michael@0 184 };
michael@0 185 */
michael@0 186
michael@0 187 static void setIndirectBoundaries(uint32_t indexR, uint32_t *start, uint32_t *end) {
michael@0 188
michael@0 189 // Set values for the top - TODO: once we have values for all the indirects, we are going
michael@0 190 // to initalize here.
michael@0 191 ucolIndirectBoundaries[indexR].startCE = start[0];
michael@0 192 ucolIndirectBoundaries[indexR].startContCE = start[1];
michael@0 193 if(end) {
michael@0 194 ucolIndirectBoundaries[indexR].limitCE = end[0];
michael@0 195 ucolIndirectBoundaries[indexR].limitContCE = end[1];
michael@0 196 } else {
michael@0 197 ucolIndirectBoundaries[indexR].limitCE = 0;
michael@0 198 ucolIndirectBoundaries[indexR].limitContCE = 0;
michael@0 199 }
michael@0 200 }
michael@0 201
michael@0 202
michael@0 203 static inline
michael@0 204 void syntaxError(const UChar* rules,
michael@0 205 int32_t pos,
michael@0 206 int32_t rulesLen,
michael@0 207 UParseError* parseError)
michael@0 208 {
michael@0 209 parseError->offset = pos;
michael@0 210 parseError->line = 0 ; /* we are not using line numbers */
michael@0 211
michael@0 212 // for pre-context
michael@0 213 int32_t start = (pos < U_PARSE_CONTEXT_LEN)? 0 : (pos - (U_PARSE_CONTEXT_LEN-1));
michael@0 214 int32_t stop = pos;
michael@0 215
michael@0 216 u_memcpy(parseError->preContext,rules+start,stop-start);
michael@0 217 //null terminate the buffer
michael@0 218 parseError->preContext[stop-start] = 0;
michael@0 219
michael@0 220 //for post-context
michael@0 221 start = pos+1;
michael@0 222 stop = ((pos+U_PARSE_CONTEXT_LEN)<= rulesLen )? (pos+(U_PARSE_CONTEXT_LEN-1)) :
michael@0 223 rulesLen;
michael@0 224
michael@0 225 if(start < stop) {
michael@0 226 u_memcpy(parseError->postContext,rules+start,stop-start);
michael@0 227 //null terminate the buffer
michael@0 228 parseError->postContext[stop-start]= 0;
michael@0 229 } else {
michael@0 230 parseError->postContext[0] = 0;
michael@0 231 }
michael@0 232 }
michael@0 233
michael@0 234 static
michael@0 235 void ucol_uprv_tok_setOptionInImage(UColOptionSet *opts, UColAttribute attrib, UColAttributeValue value) {
michael@0 236 switch(attrib) {
michael@0 237 case UCOL_HIRAGANA_QUATERNARY_MODE:
michael@0 238 opts->hiraganaQ = value;
michael@0 239 break;
michael@0 240 case UCOL_FRENCH_COLLATION:
michael@0 241 opts->frenchCollation = value;
michael@0 242 break;
michael@0 243 case UCOL_ALTERNATE_HANDLING:
michael@0 244 opts->alternateHandling = value;
michael@0 245 break;
michael@0 246 case UCOL_CASE_FIRST:
michael@0 247 opts->caseFirst = value;
michael@0 248 break;
michael@0 249 case UCOL_CASE_LEVEL:
michael@0 250 opts->caseLevel = value;
michael@0 251 break;
michael@0 252 case UCOL_NORMALIZATION_MODE:
michael@0 253 opts->normalizationMode = value;
michael@0 254 break;
michael@0 255 case UCOL_STRENGTH:
michael@0 256 opts->strength = value;
michael@0 257 break;
michael@0 258 case UCOL_NUMERIC_COLLATION:
michael@0 259 opts->numericCollation = value;
michael@0 260 break;
michael@0 261 case UCOL_ATTRIBUTE_COUNT:
michael@0 262 default:
michael@0 263 break;
michael@0 264 }
michael@0 265 }
michael@0 266
michael@0 267 #define UTOK_OPTION_COUNT 22
michael@0 268
michael@0 269 static UBool didInit = FALSE;
michael@0 270 /* we can be strict, or we can be lenient */
michael@0 271 /* I'd surely be lenient with the option arguments */
michael@0 272 /* maybe even with options */
michael@0 273 U_STRING_DECL(suboption_00, "non-ignorable", 13);
michael@0 274 U_STRING_DECL(suboption_01, "shifted", 7);
michael@0 275
michael@0 276 U_STRING_DECL(suboption_02, "lower", 5);
michael@0 277 U_STRING_DECL(suboption_03, "upper", 5);
michael@0 278 U_STRING_DECL(suboption_04, "off", 3);
michael@0 279 U_STRING_DECL(suboption_05, "on", 2);
michael@0 280 U_STRING_DECL(suboption_06, "1", 1);
michael@0 281 U_STRING_DECL(suboption_07, "2", 1);
michael@0 282 U_STRING_DECL(suboption_08, "3", 1);
michael@0 283 U_STRING_DECL(suboption_09, "4", 1);
michael@0 284 U_STRING_DECL(suboption_10, "I", 1);
michael@0 285
michael@0 286 U_STRING_DECL(suboption_11, "primary", 7);
michael@0 287 U_STRING_DECL(suboption_12, "secondary", 9);
michael@0 288 U_STRING_DECL(suboption_13, "tertiary", 8);
michael@0 289 U_STRING_DECL(suboption_14, "variable", 8);
michael@0 290 U_STRING_DECL(suboption_15, "regular", 7);
michael@0 291 U_STRING_DECL(suboption_16, "implicit", 8);
michael@0 292 U_STRING_DECL(suboption_17, "trailing", 8);
michael@0 293
michael@0 294
michael@0 295 U_STRING_DECL(option_00, "undefined", 9);
michael@0 296 U_STRING_DECL(option_01, "rearrange", 9);
michael@0 297 U_STRING_DECL(option_02, "alternate", 9);
michael@0 298 U_STRING_DECL(option_03, "backwards", 9);
michael@0 299 U_STRING_DECL(option_04, "variable top", 12);
michael@0 300 U_STRING_DECL(option_05, "top", 3);
michael@0 301 U_STRING_DECL(option_06, "normalization", 13);
michael@0 302 U_STRING_DECL(option_07, "caseLevel", 9);
michael@0 303 U_STRING_DECL(option_08, "caseFirst", 9);
michael@0 304 U_STRING_DECL(option_09, "scriptOrder", 11);
michael@0 305 U_STRING_DECL(option_10, "charsetname", 11);
michael@0 306 U_STRING_DECL(option_11, "charset", 7);
michael@0 307 U_STRING_DECL(option_12, "before", 6);
michael@0 308 U_STRING_DECL(option_13, "hiraganaQ", 9);
michael@0 309 U_STRING_DECL(option_14, "strength", 8);
michael@0 310 U_STRING_DECL(option_15, "first", 5);
michael@0 311 U_STRING_DECL(option_16, "last", 4);
michael@0 312 U_STRING_DECL(option_17, "optimize", 8);
michael@0 313 U_STRING_DECL(option_18, "suppressContractions", 20);
michael@0 314 U_STRING_DECL(option_19, "numericOrdering", 15);
michael@0 315 U_STRING_DECL(option_20, "import", 6);
michael@0 316 U_STRING_DECL(option_21, "reorder", 7);
michael@0 317
michael@0 318 /*
michael@0 319 [last variable] last variable value
michael@0 320 [last primary ignorable] largest CE for primary ignorable
michael@0 321 [last secondary ignorable] largest CE for secondary ignorable
michael@0 322 [last tertiary ignorable] largest CE for tertiary ignorable
michael@0 323 [top] guaranteed to be above all implicit CEs, for now and in the future (in 1.8)
michael@0 324 */
michael@0 325
michael@0 326
michael@0 327 static const ucolTokSuboption alternateSub[2] = {
michael@0 328 {suboption_00, 13, UCOL_NON_IGNORABLE},
michael@0 329 {suboption_01, 7, UCOL_SHIFTED}
michael@0 330 };
michael@0 331
michael@0 332 static const ucolTokSuboption caseFirstSub[3] = {
michael@0 333 {suboption_02, 5, UCOL_LOWER_FIRST},
michael@0 334 {suboption_03, 5, UCOL_UPPER_FIRST},
michael@0 335 {suboption_04, 3, UCOL_OFF},
michael@0 336 };
michael@0 337
michael@0 338 static const ucolTokSuboption onOffSub[2] = {
michael@0 339 {suboption_04, 3, UCOL_OFF},
michael@0 340 {suboption_05, 2, UCOL_ON}
michael@0 341 };
michael@0 342
michael@0 343 static const ucolTokSuboption frenchSub[1] = {
michael@0 344 {suboption_07, 1, UCOL_ON}
michael@0 345 };
michael@0 346
michael@0 347 static const ucolTokSuboption beforeSub[3] = {
michael@0 348 {suboption_06, 1, UCOL_PRIMARY},
michael@0 349 {suboption_07, 1, UCOL_SECONDARY},
michael@0 350 {suboption_08, 1, UCOL_TERTIARY}
michael@0 351 };
michael@0 352
michael@0 353 static const ucolTokSuboption strengthSub[5] = {
michael@0 354 {suboption_06, 1, UCOL_PRIMARY},
michael@0 355 {suboption_07, 1, UCOL_SECONDARY},
michael@0 356 {suboption_08, 1, UCOL_TERTIARY},
michael@0 357 {suboption_09, 1, UCOL_QUATERNARY},
michael@0 358 {suboption_10, 1, UCOL_IDENTICAL},
michael@0 359 };
michael@0 360
michael@0 361 static const ucolTokSuboption firstLastSub[7] = {
michael@0 362 {suboption_11, 7, UCOL_PRIMARY},
michael@0 363 {suboption_12, 9, UCOL_PRIMARY},
michael@0 364 {suboption_13, 8, UCOL_PRIMARY},
michael@0 365 {suboption_14, 8, UCOL_PRIMARY},
michael@0 366 {suboption_15, 7, UCOL_PRIMARY},
michael@0 367 {suboption_16, 8, UCOL_PRIMARY},
michael@0 368 {suboption_17, 8, UCOL_PRIMARY},
michael@0 369 };
michael@0 370
/*
 * Symbolic names for the positions in rulesOptions[]. The index returned by
 * ucol_uprv_tok_readOption() is switched on using these constants, so the
 * order here has to stay in step with the order of that table.
 */
enum OptionNumber {
    /* options that map directly onto a collation attribute */
    OPTION_ALTERNATE_HANDLING = 0,
    OPTION_FRENCH_COLLATION,
    OPTION_CASE_LEVEL,
    OPTION_CASE_FIRST,
    OPTION_NORMALIZATION_MODE,
    OPTION_HIRAGANA_QUATERNARY,
    OPTION_STRENGTH,
    OPTION_NUMERIC_COLLATION,
    /* alias for the last of the attribute-backed options above */
    OPTION_NORMAL_OPTIONS_LIMIT = OPTION_NUMERIC_COLLATION,

    /* remaining options */
    OPTION_VARIABLE_TOP,
    OPTION_REARRANGE,
    OPTION_BEFORE,
    OPTION_TOP,
    OPTION_FIRST,
    OPTION_LAST,
    OPTION_OPTIMIZE,
    OPTION_SUPPRESS_CONTRACTIONS,
    OPTION_UNDEFINED,
    OPTION_SCRIPT_ORDER,
    OPTION_CHARSET_NAME,
    OPTION_CHARSET,
    OPTION_IMPORT,
    OPTION_SCRIPTREORDER
};
michael@0 396
michael@0 397 static const ucolTokOption rulesOptions[UTOK_OPTION_COUNT] = {
michael@0 398 /*00*/ {option_02, 9, alternateSub, 2, UCOL_ALTERNATE_HANDLING}, /*"alternate" */
michael@0 399 /*01*/ {option_03, 9, frenchSub, 1, UCOL_FRENCH_COLLATION}, /*"backwards" */
michael@0 400 /*02*/ {option_07, 9, onOffSub, 2, UCOL_CASE_LEVEL}, /*"caseLevel" */
michael@0 401 /*03*/ {option_08, 9, caseFirstSub, 3, UCOL_CASE_FIRST}, /*"caseFirst" */
michael@0 402 /*04*/ {option_06, 13, onOffSub, 2, UCOL_NORMALIZATION_MODE}, /*"normalization" */
michael@0 403 /*05*/ {option_13, 9, onOffSub, 2, UCOL_HIRAGANA_QUATERNARY_MODE}, /*"hiraganaQ" */
michael@0 404 /*06*/ {option_14, 8, strengthSub, 5, UCOL_STRENGTH}, /*"strength" */
michael@0 405 /*07*/ {option_19, 15, onOffSub, 2, UCOL_NUMERIC_COLLATION}, /*"numericOrdering"*/
michael@0 406 /*08*/ {option_04, 12, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"variable top" */
michael@0 407 /*09*/ {option_01, 9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"rearrange" */
michael@0 408 /*10*/ {option_12, 6, beforeSub, 3, UCOL_ATTRIBUTE_COUNT}, /*"before" */
michael@0 409 /*11*/ {option_05, 3, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"top" */
michael@0 410 /*12*/ {option_15, 5, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"first" */
michael@0 411 /*13*/ {option_16, 4, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"last" */
michael@0 412 /*14*/ {option_17, 8, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"optimize" */
michael@0 413 /*15*/ {option_18, 20, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"suppressContractions" */
michael@0 414 /*16*/ {option_00, 9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"undefined" */
michael@0 415 /*17*/ {option_09, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"scriptOrder" */
michael@0 416 /*18*/ {option_10, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"charsetname" */
michael@0 417 /*19*/ {option_11, 7, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"charset" */
michael@0 418 /*20*/ {option_20, 6, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"import" */
michael@0 419 /*21*/ {option_21, 7, NULL, 0, UCOL_ATTRIBUTE_COUNT} /*"reorder" */
michael@0 420 };
michael@0 421
michael@0 422 static
michael@0 423 int32_t u_strncmpNoCase(const UChar *s1,
michael@0 424 const UChar *s2,
michael@0 425 int32_t n)
michael@0 426 {
michael@0 427 if(n > 0) {
michael@0 428 int32_t rc;
michael@0 429 for(;;) {
michael@0 430 rc = (int32_t)u_tolower(*s1) - (int32_t)u_tolower(*s2);
michael@0 431 if(rc != 0 || *s1 == 0 || --n == 0) {
michael@0 432 return rc;
michael@0 433 }
michael@0 434 ++s1;
michael@0 435 ++s2;
michael@0 436 }
michael@0 437 }
michael@0 438 return 0;
michael@0 439 }
michael@0 440
michael@0 441 static
michael@0 442 void ucol_uprv_tok_initData() {
michael@0 443 if(!didInit) {
michael@0 444 U_STRING_INIT(suboption_00, "non-ignorable", 13);
michael@0 445 U_STRING_INIT(suboption_01, "shifted", 7);
michael@0 446
michael@0 447 U_STRING_INIT(suboption_02, "lower", 5);
michael@0 448 U_STRING_INIT(suboption_03, "upper", 5);
michael@0 449 U_STRING_INIT(suboption_04, "off", 3);
michael@0 450 U_STRING_INIT(suboption_05, "on", 2);
michael@0 451
michael@0 452 U_STRING_INIT(suboption_06, "1", 1);
michael@0 453 U_STRING_INIT(suboption_07, "2", 1);
michael@0 454 U_STRING_INIT(suboption_08, "3", 1);
michael@0 455 U_STRING_INIT(suboption_09, "4", 1);
michael@0 456 U_STRING_INIT(suboption_10, "I", 1);
michael@0 457
michael@0 458 U_STRING_INIT(suboption_11, "primary", 7);
michael@0 459 U_STRING_INIT(suboption_12, "secondary", 9);
michael@0 460 U_STRING_INIT(suboption_13, "tertiary", 8);
michael@0 461 U_STRING_INIT(suboption_14, "variable", 8);
michael@0 462 U_STRING_INIT(suboption_15, "regular", 7);
michael@0 463 U_STRING_INIT(suboption_16, "implicit", 8);
michael@0 464 U_STRING_INIT(suboption_17, "trailing", 8);
michael@0 465
michael@0 466
michael@0 467 U_STRING_INIT(option_00, "undefined", 9);
michael@0 468 U_STRING_INIT(option_01, "rearrange", 9);
michael@0 469 U_STRING_INIT(option_02, "alternate", 9);
michael@0 470 U_STRING_INIT(option_03, "backwards", 9);
michael@0 471 U_STRING_INIT(option_04, "variable top", 12);
michael@0 472 U_STRING_INIT(option_05, "top", 3);
michael@0 473 U_STRING_INIT(option_06, "normalization", 13);
michael@0 474 U_STRING_INIT(option_07, "caseLevel", 9);
michael@0 475 U_STRING_INIT(option_08, "caseFirst", 9);
michael@0 476 U_STRING_INIT(option_09, "scriptOrder", 11);
michael@0 477 U_STRING_INIT(option_10, "charsetname", 11);
michael@0 478 U_STRING_INIT(option_11, "charset", 7);
michael@0 479 U_STRING_INIT(option_12, "before", 6);
michael@0 480 U_STRING_INIT(option_13, "hiraganaQ", 9);
michael@0 481 U_STRING_INIT(option_14, "strength", 8);
michael@0 482 U_STRING_INIT(option_15, "first", 5);
michael@0 483 U_STRING_INIT(option_16, "last", 4);
michael@0 484 U_STRING_INIT(option_17, "optimize", 8);
michael@0 485 U_STRING_INIT(option_18, "suppressContractions", 20);
michael@0 486 U_STRING_INIT(option_19, "numericOrdering", 15);
michael@0 487 U_STRING_INIT(option_20, "import ", 6);
michael@0 488 U_STRING_INIT(option_21, "reorder", 7);
michael@0 489 didInit = TRUE;
michael@0 490 }
michael@0 491 }
michael@0 492
michael@0 493
michael@0 494 // This function reads basic options to set in the runtime collator
michael@0 495 // used by data driven tests. Should not support build time options
michael@0 496 U_CAPI const UChar * U_EXPORT2
michael@0 497 ucol_tok_getNextArgument(const UChar *start, const UChar *end,
michael@0 498 UColAttribute *attrib, UColAttributeValue *value,
michael@0 499 UErrorCode *status)
michael@0 500 {
michael@0 501 uint32_t i = 0;
michael@0 502 int32_t j=0;
michael@0 503 UBool foundOption = FALSE;
michael@0 504 const UChar *optionArg = NULL;
michael@0 505
michael@0 506 ucol_uprv_tok_initData();
michael@0 507
michael@0 508 while(start < end && PatternProps::isWhiteSpace(*start)) { /* eat whitespace */
michael@0 509 start++;
michael@0 510 }
michael@0 511 if(start >= end) {
michael@0 512 return NULL;
michael@0 513 }
michael@0 514 /* skip opening '[' */
michael@0 515 if(*start == 0x005b) {
michael@0 516 start++;
michael@0 517 } else {
michael@0 518 *status = U_ILLEGAL_ARGUMENT_ERROR; // no opening '['
michael@0 519 return NULL;
michael@0 520 }
michael@0 521
michael@0 522 while(i < UTOK_OPTION_COUNT) {
michael@0 523 if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].optionLen) == 0) {
michael@0 524 foundOption = TRUE;
michael@0 525 if(end - start > rulesOptions[i].optionLen) {
michael@0 526 optionArg = start+rulesOptions[i].optionLen+1; /* start of the options, skip space */
michael@0 527 while(PatternProps::isWhiteSpace(*optionArg)) { /* eat whitespace */
michael@0 528 optionArg++;
michael@0 529 }
michael@0 530 }
michael@0 531 break;
michael@0 532 }
michael@0 533 i++;
michael@0 534 }
michael@0 535
michael@0 536 if(!foundOption) {
michael@0 537 *status = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 538 return NULL;
michael@0 539 }
michael@0 540
michael@0 541 if(optionArg) {
michael@0 542 for(j = 0; j<rulesOptions[i].subSize; j++) {
michael@0 543 if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
michael@0 544 //ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr, rulesOptions[i].subopts[j].attrVal);
michael@0 545 *attrib = rulesOptions[i].attr;
michael@0 546 *value = rulesOptions[i].subopts[j].attrVal;
michael@0 547 optionArg += rulesOptions[i].subopts[j].subLen;
michael@0 548 while(PatternProps::isWhiteSpace(*optionArg)) { /* eat whitespace */
michael@0 549 optionArg++;
michael@0 550 }
michael@0 551 if(*optionArg == 0x005d) {
michael@0 552 optionArg++;
michael@0 553 return optionArg;
michael@0 554 } else {
michael@0 555 *status = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 556 return NULL;
michael@0 557 }
michael@0 558 }
michael@0 559 }
michael@0 560 }
michael@0 561 *status = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 562 return NULL;
michael@0 563 }
michael@0 564
michael@0 565 static
michael@0 566 USet *ucol_uprv_tok_readAndSetUnicodeSet(const UChar *start, const UChar *end, UErrorCode *status) {
michael@0 567 while(*start != 0x005b) { /* advance while we find the first '[' */
michael@0 568 start++;
michael@0 569 }
michael@0 570 // now we need to get a balanced set of '[]'. The problem is that a set can have
michael@0 571 // many, and *end point to the first closing '['
michael@0 572 int32_t noOpenBraces = 1;
michael@0 573 int32_t current = 1; // skip the opening brace
michael@0 574 while(start+current < end && noOpenBraces != 0) {
michael@0 575 if(start[current] == 0x005b) {
michael@0 576 noOpenBraces++;
michael@0 577 } else if(start[current] == 0x005D) { // closing brace
michael@0 578 noOpenBraces--;
michael@0 579 }
michael@0 580 current++;
michael@0 581 }
michael@0 582
michael@0 583 if(noOpenBraces != 0 || u_strchr(start+current, 0x005d /*']'*/) == NULL) {
michael@0 584 *status = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 585 return NULL;
michael@0 586 }
michael@0 587 return uset_openPattern(start, current, status);
michael@0 588 }
michael@0 589
michael@0 590 /**
michael@0 591 * Reads an option and matches the option name with the predefined options. (Case-insensitive.)
michael@0 592 * @param start Pointer to the start UChar.
michael@0 593 * @param end Pointer to the last valid pointer beyond which the option will not extend.
michael@0 594 * @param optionArg Address of the pointer at which the options start (after the option name)
michael@0 595 * @return The index of the option, or -1 if the option is not valid.
michael@0 596 */
michael@0 597 static
michael@0 598 int32_t ucol_uprv_tok_readOption(const UChar *start, const UChar *end, const UChar **optionArg) {
michael@0 599 int32_t i = 0;
michael@0 600 ucol_uprv_tok_initData();
michael@0 601
michael@0 602 while(PatternProps::isWhiteSpace(*start)) { /* eat whitespace */
michael@0 603 start++;
michael@0 604 }
michael@0 605 while(i < UTOK_OPTION_COUNT) {
michael@0 606 if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].optionLen) == 0) {
michael@0 607 if(end - start > rulesOptions[i].optionLen) {
michael@0 608 *optionArg = start+rulesOptions[i].optionLen; /* End of option name; start of the options */
michael@0 609 while(PatternProps::isWhiteSpace(**optionArg)) { /* eat whitespace */
michael@0 610 (*optionArg)++;
michael@0 611 }
michael@0 612 }
michael@0 613 break;
michael@0 614 }
michael@0 615 i++;
michael@0 616 }
michael@0 617 if(i == UTOK_OPTION_COUNT) {
michael@0 618 i = -1; // didn't find an option
michael@0 619 }
michael@0 620 return i;
michael@0 621 }
michael@0 622
michael@0 623
michael@0 624 static
michael@0 625 void ucol_tok_parseScriptReorder(UColTokenParser *src, UErrorCode *status) {
michael@0 626 int32_t codeCount = 0;
michael@0 627 int32_t codeIndex = 0;
michael@0 628 char conversion[64];
michael@0 629 int32_t tokenLength = 0;
michael@0 630 const UChar* space;
michael@0 631
michael@0 632 const UChar* current = src->current;
michael@0 633 const UChar* end = u_memchr(src->current, 0x005d, src->end - src->current);
michael@0 634
michael@0 635 // eat leading whitespace
michael@0 636 while(current < end && u_isWhitespace(*current)) {
michael@0 637 current++;
michael@0 638 }
michael@0 639
michael@0 640 while(current < end) {
michael@0 641 space = u_memchr(current, 0x0020, end - current);
michael@0 642 space = space == 0 ? end : space;
michael@0 643 tokenLength = space - current;
michael@0 644 if (tokenLength < 4) {
michael@0 645 *status = U_INVALID_FORMAT_ERROR;
michael@0 646 return;
michael@0 647 }
michael@0 648 codeCount++;
michael@0 649 current += tokenLength;
michael@0 650 while(current < end && u_isWhitespace(*current)) { /* eat whitespace */
michael@0 651 ++current;
michael@0 652 }
michael@0 653 }
michael@0 654
michael@0 655 if (codeCount == 0) {
michael@0 656 *status = U_INVALID_FORMAT_ERROR;
michael@0 657 }
michael@0 658
michael@0 659 src->reorderCodesLength = codeCount;
michael@0 660 src->reorderCodes = (int32_t*)uprv_malloc(codeCount * sizeof(int32_t));
michael@0 661 current = src->current;
michael@0 662
michael@0 663 // eat leading whitespace
michael@0 664 while(current < end && u_isWhitespace(*current)) {
michael@0 665 current++;
michael@0 666 }
michael@0 667
michael@0 668 while(current < end) {
michael@0 669 space = u_memchr(current, 0x0020, end - current);
michael@0 670 space = space == 0 ? end : space;
michael@0 671 tokenLength = space - current;
michael@0 672 if (tokenLength < 4) {
michael@0 673 *status = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 674 return;
michael@0 675 } else {
michael@0 676 u_UCharsToChars(current, conversion, tokenLength);
michael@0 677 conversion[tokenLength] = '\0';
michael@0 678 src->reorderCodes[codeIndex] = ucol_findReorderingEntry(conversion);
michael@0 679 if (src->reorderCodes[codeIndex] == USCRIPT_INVALID_CODE) {
michael@0 680 src->reorderCodes[codeIndex] = u_getPropertyValueEnum(UCHAR_SCRIPT, conversion);
michael@0 681 }
michael@0 682 if (src->reorderCodes[codeIndex] == USCRIPT_INVALID_CODE) {
michael@0 683 *status = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 684 }
michael@0 685 }
michael@0 686 codeIndex++;
michael@0 687 current += tokenLength;
michael@0 688 while(current < end && u_isWhitespace(*current)) { /* eat whitespace */
michael@0 689 ++current;
michael@0 690 }
michael@0 691 }
michael@0 692 }
michael@0 693
michael@0 694 // reads and conforms to various options in rules
michael@0 695 // end is the position of the first closing ']'
michael@0 696 // However, some of the options take an UnicodeSet definition
michael@0 697 // which needs to duplicate the closing ']'
michael@0 698 // for example: '[copy [\uAC00-\uD7FF]]'
michael@0 699 // These options will move end to the second ']' and the
michael@0 700 // caller will set the current to it.
michael@0 701 static
michael@0 702 uint8_t ucol_uprv_tok_readAndSetOption(UColTokenParser *src, UErrorCode *status) {
michael@0 703 const UChar* start = src->current;
michael@0 704 int32_t i = 0;
michael@0 705 int32_t j=0;
michael@0 706 const UChar *optionArg = NULL;
michael@0 707
michael@0 708 uint8_t result = 0;
michael@0 709
michael@0 710 start++; /*skip opening '['*/
michael@0 711 i = ucol_uprv_tok_readOption(start, src->end, &optionArg);
michael@0 712 if(optionArg) {
michael@0 713 src->current = optionArg;
michael@0 714 }
michael@0 715
michael@0 716 if(i < 0) {
michael@0 717 *status = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 718 } else {
michael@0 719 int32_t noOpenBraces = 1;
michael@0 720 switch(i) {
michael@0 721 case OPTION_ALTERNATE_HANDLING:
michael@0 722 case OPTION_FRENCH_COLLATION:
michael@0 723 case OPTION_CASE_LEVEL:
michael@0 724 case OPTION_CASE_FIRST:
michael@0 725 case OPTION_NORMALIZATION_MODE:
michael@0 726 case OPTION_HIRAGANA_QUATERNARY:
michael@0 727 case OPTION_STRENGTH:
michael@0 728 case OPTION_NUMERIC_COLLATION:
michael@0 729 if(optionArg) {
michael@0 730 for(j = 0; j<rulesOptions[i].subSize; j++) {
michael@0 731 if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
michael@0 732 ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr, rulesOptions[i].subopts[j].attrVal);
michael@0 733 result = UCOL_TOK_SUCCESS;
michael@0 734 }
michael@0 735 }
michael@0 736 }
michael@0 737 if(result == 0) {
michael@0 738 *status = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 739 }
michael@0 740 break;
michael@0 741 case OPTION_VARIABLE_TOP:
michael@0 742 result = UCOL_TOK_SUCCESS | UCOL_TOK_VARIABLE_TOP;
michael@0 743 break;
michael@0 744 case OPTION_REARRANGE:
michael@0 745 result = UCOL_TOK_SUCCESS;
michael@0 746 break;
michael@0 747 case OPTION_BEFORE:
michael@0 748 if(optionArg) {
michael@0 749 for(j = 0; j<rulesOptions[i].subSize; j++) {
michael@0 750 if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
michael@0 751 result = UCOL_TOK_SUCCESS | (rulesOptions[i].subopts[j].attrVal + 1);
michael@0 752 }
michael@0 753 }
michael@0 754 }
michael@0 755 if(result == 0) {
michael@0 756 *status = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 757 }
michael@0 758 break;
michael@0 759 case OPTION_TOP: /* we are going to have an array with structures of limit CEs */
michael@0 760 /* index to this array will be src->parsedToken.indirectIndex*/
michael@0 761 src->parsedToken.indirectIndex = 0;
michael@0 762 result = UCOL_TOK_SUCCESS | UCOL_TOK_TOP;
michael@0 763 break;
michael@0 764 case OPTION_FIRST:
michael@0 765 case OPTION_LAST: /* first, last */
michael@0 766 for(j = 0; j<rulesOptions[i].subSize; j++) {
michael@0 767 if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
michael@0 768 // the calculation below assumes that OPTION_FIRST and OPTION_LAST are at i and i+1 and that the first
michael@0 769 // element of indirect boundaries is reserved for top.
michael@0 770 src->parsedToken.indirectIndex = (uint16_t)(i-OPTION_FIRST+1+j*2);
michael@0 771 result = UCOL_TOK_SUCCESS | UCOL_TOK_TOP;;
michael@0 772 }
michael@0 773 }
michael@0 774 if(result == 0) {
michael@0 775 *status = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 776 }
michael@0 777 break;
michael@0 778 case OPTION_OPTIMIZE:
michael@0 779 case OPTION_SUPPRESS_CONTRACTIONS: // copy and remove are handled before normalization
michael@0 780 // we need to move end here
michael@0 781 src->current++; // skip opening brace
michael@0 782 while(src->current < src->end && noOpenBraces != 0) {
michael@0 783 if(*src->current == 0x005b) {
michael@0 784 noOpenBraces++;
michael@0 785 } else if(*src->current == 0x005D) { // closing brace
michael@0 786 noOpenBraces--;
michael@0 787 }
michael@0 788 src->current++;
michael@0 789 }
michael@0 790 result = UCOL_TOK_SUCCESS;
michael@0 791 break;
michael@0 792 case OPTION_SCRIPTREORDER:
michael@0 793 ucol_tok_parseScriptReorder(src, status);
michael@0 794 break;
michael@0 795 default:
michael@0 796 *status = U_UNSUPPORTED_ERROR;
michael@0 797 break;
michael@0 798 }
michael@0 799 }
michael@0 800 src->current = u_memchr(src->current, 0x005d, (int32_t)(src->end-src->current));
michael@0 801 return result;
michael@0 802 }
michael@0 803
michael@0 804
michael@0 805 inline void ucol_tok_addToExtraCurrent(UColTokenParser *src, const UChar *stuff, int32_t len, UErrorCode *status) {
michael@0 806 if (stuff == NULL || len <= 0) {
michael@0 807 return;
michael@0 808 }
michael@0 809 UnicodeString tempStuff(FALSE, stuff, len);
michael@0 810 if(src->extraCurrent+len >= src->extraEnd) {
michael@0 811 /* reallocate */
michael@0 812 if (stuff >= src->source && stuff <= src->end) {
michael@0 813 // Copy the "stuff" contents into tempStuff's own buffer.
michael@0 814 // UnicodeString is copy-on-write.
michael@0 815 if (len > 0) {
michael@0 816 tempStuff.setCharAt(0, tempStuff[0]);
michael@0 817 } else {
michael@0 818 tempStuff.remove();
michael@0 819 }
michael@0 820 }
michael@0 821 UChar *newSrc = (UChar *)uprv_realloc(src->source, (src->extraEnd-src->source)*2*sizeof(UChar));
michael@0 822 if(newSrc != NULL) {
michael@0 823 src->current = newSrc + (src->current - src->source);
michael@0 824 src->extraCurrent = newSrc + (src->extraCurrent - src->source);
michael@0 825 src->end = newSrc + (src->end - src->source);
michael@0 826 src->extraEnd = newSrc + (src->extraEnd-src->source)*2;
michael@0 827 src->sourceCurrent = newSrc + (src->sourceCurrent-src->source);
michael@0 828 src->source = newSrc;
michael@0 829 } else {
michael@0 830 *status = U_MEMORY_ALLOCATION_ERROR;
michael@0 831 return;
michael@0 832 }
michael@0 833 }
michael@0 834 if(len == 1) {
michael@0 835 *src->extraCurrent++ = tempStuff[0];
michael@0 836 } else {
michael@0 837 u_memcpy(src->extraCurrent, tempStuff.getBuffer(), len);
michael@0 838 src->extraCurrent += len;
michael@0 839 }
michael@0 840 }
michael@0 841
michael@0 842 inline UBool ucol_tok_doSetTop(UColTokenParser *src, UErrorCode *status) {
michael@0 843 /*
michael@0 844 top = TRUE;
michael@0 845 */
michael@0 846 UChar buff[5];
michael@0 847 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
michael@0 848 buff[0] = 0xFFFE;
michael@0 849 buff[1] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE >> 16);
michael@0 850 buff[2] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE & 0xFFFF);
michael@0 851 if(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE == 0) {
michael@0 852 src->parsedToken.charsLen = 3;
michael@0 853 ucol_tok_addToExtraCurrent(src, buff, 3, status);
michael@0 854 } else {
michael@0 855 buff[3] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE >> 16);
michael@0 856 buff[4] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE & 0xFFFF);
michael@0 857 src->parsedToken.charsLen = 5;
michael@0 858 ucol_tok_addToExtraCurrent(src, buff, 5, status);
michael@0 859 }
michael@0 860 return TRUE;
michael@0 861 }
michael@0 862
michael@0 863 static UBool isCharNewLine(UChar c){
michael@0 864 switch(c){
michael@0 865 case 0x000A: /* LF */
michael@0 866 case 0x000D: /* CR */
michael@0 867 case 0x000C: /* FF */
michael@0 868 case 0x0085: /* NEL */
michael@0 869 case 0x2028: /* LS */
michael@0 870 case 0x2029: /* PS */
michael@0 871 return TRUE;
michael@0 872 default:
michael@0 873 return FALSE;
michael@0 874 }
michael@0 875 }
michael@0 876
michael@0 877 /*
michael@0 878 * This function is called several times when a range is processed. Each time, the next code point
michael@0 879 * is processed.
michael@0 880 * The following variables must be set before calling this function:
michael@0 881 * src->currentRangeCp: The current code point to process.
michael@0 882 * src->lastRangeCp: The last code point in the range.
michael@0 883 * Pre-requisite: src->currentRangeCp <= src->lastRangeCp.
michael@0 884 */
michael@0 885 static const UChar*
michael@0 886 ucol_tok_processNextCodePointInRange(UColTokenParser *src,
michael@0 887 UErrorCode *status)
michael@0 888 {
michael@0 889 // Append current code point to source
michael@0 890 UChar buff[U16_MAX_LENGTH];
michael@0 891 uint32_t i = 0;
michael@0 892
michael@0 893 uint32_t nChars = U16_LENGTH(src->currentRangeCp);
michael@0 894 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
michael@0 895 src->parsedToken.charsLen = nChars;
michael@0 896
michael@0 897 U16_APPEND_UNSAFE(buff, i, src->currentRangeCp);
michael@0 898 ucol_tok_addToExtraCurrent(src, buff, nChars, status);
michael@0 899
michael@0 900 ++src->currentRangeCp;
michael@0 901 if (src->currentRangeCp > src->lastRangeCp) {
michael@0 902 src->inRange = FALSE;
michael@0 903
michael@0 904 if (src->currentStarredCharIndex > src->lastStarredCharIndex) {
michael@0 905 src->isStarred = FALSE;
michael@0 906 }
michael@0 907 } else {
michael@0 908 src->previousCp = src->currentRangeCp;
michael@0 909 }
michael@0 910 return src->current;
michael@0 911 }
michael@0 912
michael@0 913 /*
michael@0 914 * This function is called several times when a starred list is processed. Each time, the next code point
michael@0 915 * in the list is processed.
michael@0 916 * The following variables must be set before calling this function:
michael@0 917 * src->currentStarredCharIndex: Index (in src->source) of the first char of the current code point.
michael@0 918 * src->lastStarredCharIndex: Index to the last character in the list.
michael@0 919 * Pre-requisite: src->currentStarredCharIndex <= src->lastStarredCharIndex.
michael@0 920 */
michael@0 921 static const UChar*
michael@0 922 ucol_tok_processNextTokenInStarredList(UColTokenParser *src)
michael@0 923 {
michael@0 924 // Extract the characters corresponding to the next code point.
michael@0 925 UChar32 cp;
michael@0 926 src->parsedToken.charsOffset = src->currentStarredCharIndex;
michael@0 927 int32_t prev = src->currentStarredCharIndex;
michael@0 928 U16_NEXT(src->source, src->currentStarredCharIndex, (uint32_t)(src->end - src->source), cp);
michael@0 929 src->parsedToken.charsLen = src->currentStarredCharIndex - prev;
michael@0 930
michael@0 931 // When we are done parsing the starred string, turn the flag off so that
michael@0 932 // the normal processing is restored.
michael@0 933 if (src->currentStarredCharIndex > src->lastStarredCharIndex) {
michael@0 934 src->isStarred = FALSE;
michael@0 935 }
michael@0 936 src->previousCp = cp;
michael@0 937 return src->current;
michael@0 938 }
michael@0 939
michael@0 940 /*
michael@0 941 * Partially parses the next token, keeps the indices in src->parsedToken, and updates the counters.
michael@0 942 *
michael@0 943 * This routine parses and separates almost all tokens. The following are the syntax characters recognized.
michael@0 944 * # : Comment character
michael@0 945 * & : Reset operator
michael@0 946 * = : Equality
michael@0 947 * < : Primary collation
michael@0 948 * << : Secondary collation
michael@0 949 * <<< : Tertiary collation
michael@0 950 * ; : Secondary collation
michael@0 951 * , : Tertiary collation
michael@0 952 * / : Expansions
michael@0 953 * | : Prefix
michael@0 954 * - : Range
michael@0 955
michael@0 956 * ! : Java Thai modifier, ignored
michael@0 957 * @ : French only
michael@0 958
michael@0 959 * [] : Options
michael@0 960 * '' : Quotes
michael@0 961 *
michael@0 962 * Along with operators =, <, <<, <<<, the operator * is supported to indicate a list. For example, &a<*bcdexyz
michael@0 963 * is equivalent to &a<b<c<d<e<x<y<z. In lists, ranges also can be given, so &a*b-ex-z is equivalent to the above.
michael@0 964 * This function do not separate the tokens in a list. Instead, &a<*b-ex-z is parsed as three tokens - "&a",
michael@0 965 * "<*b", "-ex", "-z". The strength (< in this case), whether in a list, whether in a range and the previous
michael@0 966 * character returned as cached so that the calling program can do further splitting.
michael@0 967 */
michael@0 968 static const UChar*
michael@0 969 ucol_tok_parseNextTokenInternal(UColTokenParser *src,
michael@0 970 UBool startOfRules,
michael@0 971 UParseError *parseError,
michael@0 972 UErrorCode *status)
michael@0 973 {
michael@0 974 UBool variableTop = FALSE;
michael@0 975 UBool top = FALSE;
michael@0 976 UBool inChars = TRUE;
michael@0 977 UBool inQuote = FALSE;
michael@0 978 UBool wasInQuote = FALSE;
michael@0 979 uint8_t before = 0;
michael@0 980 UBool isEscaped = FALSE;
michael@0 981
michael@0 982 // TODO: replace these variables with src->parsedToken counterparts
michael@0 983 // no need to use them anymore since we have src->parsedToken.
michael@0 984 // Ideally, token parser would be a nice class... Once, when I have
michael@0 985 // more time (around 2020 probably).
michael@0 986 uint32_t newExtensionLen = 0;
michael@0 987 uint32_t extensionOffset = 0;
michael@0 988 uint32_t newStrength = UCOL_TOK_UNSET;
michael@0 989 UChar buff[10];
michael@0 990
michael@0 991 src->parsedToken.charsOffset = 0; src->parsedToken.charsLen = 0;
michael@0 992 src->parsedToken.prefixOffset = 0; src->parsedToken.prefixLen = 0;
michael@0 993 src->parsedToken.indirectIndex = 0;
michael@0 994
michael@0 995 while (src->current < src->end) {
michael@0 996 UChar ch = *(src->current);
michael@0 997
michael@0 998 if (inQuote) {
michael@0 999 if (ch == 0x0027/*'\''*/) {
michael@0 1000 inQuote = FALSE;
michael@0 1001 } else {
michael@0 1002 if ((src->parsedToken.charsLen == 0) || inChars) {
michael@0 1003 if(src->parsedToken.charsLen == 0) {
michael@0 1004 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
michael@0 1005 }
michael@0 1006 src->parsedToken.charsLen++;
michael@0 1007 } else {
michael@0 1008 if(newExtensionLen == 0) {
michael@0 1009 extensionOffset = (uint32_t)(src->extraCurrent - src->source);
michael@0 1010 }
michael@0 1011 newExtensionLen++;
michael@0 1012 }
michael@0 1013 }
michael@0 1014 }else if(isEscaped){
michael@0 1015 isEscaped =FALSE;
michael@0 1016 if (newStrength == UCOL_TOK_UNSET) {
michael@0 1017 *status = U_INVALID_FORMAT_ERROR;
michael@0 1018 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
michael@0 1019 DBG_FORMAT_ERROR
michael@0 1020 return NULL;
michael@0 1021 // enabling rules to start with non-tokens a < b
michael@0 1022 // newStrength = UCOL_TOK_RESET;
michael@0 1023 }
michael@0 1024 if(ch != 0x0000 && src->current != src->end) {
michael@0 1025 if (inChars) {
michael@0 1026 if(src->parsedToken.charsLen == 0) {
michael@0 1027 src->parsedToken.charsOffset = (uint32_t)(src->current - src->source);
michael@0 1028 }
michael@0 1029 src->parsedToken.charsLen++;
michael@0 1030 } else {
michael@0 1031 if(newExtensionLen == 0) {
michael@0 1032 extensionOffset = (uint32_t)(src->current - src->source);
michael@0 1033 }
michael@0 1034 newExtensionLen++;
michael@0 1035 }
michael@0 1036 }
michael@0 1037 }else {
michael@0 1038 if(!PatternProps::isWhiteSpace(ch)) {
michael@0 1039 /* Sets the strength for this entry */
michael@0 1040 switch (ch) {
michael@0 1041 case 0x003D/*'='*/ :
michael@0 1042 if (newStrength != UCOL_TOK_UNSET) {
michael@0 1043 goto EndOfLoop;
michael@0 1044 }
michael@0 1045
michael@0 1046 /* if we start with strength, we'll reset to top */
michael@0 1047 if(startOfRules == TRUE) {
michael@0 1048 src->parsedToken.indirectIndex = 5;
michael@0 1049 top = ucol_tok_doSetTop(src, status);
michael@0 1050 newStrength = UCOL_TOK_RESET;
michael@0 1051 goto EndOfLoop;
michael@0 1052 }
michael@0 1053 newStrength = UCOL_IDENTICAL;
michael@0 1054 if(*(src->current+1) == 0x002A) {/*'*'*/
michael@0 1055 src->current++;
michael@0 1056 src->isStarred = TRUE;
michael@0 1057 }
michael@0 1058 break;
michael@0 1059
michael@0 1060 case 0x002C/*','*/:
michael@0 1061 if (newStrength != UCOL_TOK_UNSET) {
michael@0 1062 goto EndOfLoop;
michael@0 1063 }
michael@0 1064
michael@0 1065 /* if we start with strength, we'll reset to top */
michael@0 1066 if(startOfRules == TRUE) {
michael@0 1067 src->parsedToken.indirectIndex = 5;
michael@0 1068 top = ucol_tok_doSetTop(src, status);
michael@0 1069 newStrength = UCOL_TOK_RESET;
michael@0 1070 goto EndOfLoop;
michael@0 1071 }
michael@0 1072 newStrength = UCOL_TERTIARY;
michael@0 1073 break;
michael@0 1074
michael@0 1075 case 0x003B/*';'*/:
michael@0 1076 if (newStrength != UCOL_TOK_UNSET) {
michael@0 1077 goto EndOfLoop;
michael@0 1078 }
michael@0 1079
michael@0 1080 /* if we start with strength, we'll reset to top */
michael@0 1081 if(startOfRules == TRUE) {
michael@0 1082 src->parsedToken.indirectIndex = 5;
michael@0 1083 top = ucol_tok_doSetTop(src, status);
michael@0 1084 newStrength = UCOL_TOK_RESET;
michael@0 1085 goto EndOfLoop;
michael@0 1086 }
michael@0 1087 newStrength = UCOL_SECONDARY;
michael@0 1088 break;
michael@0 1089
michael@0 1090 case 0x003C/*'<'*/:
michael@0 1091 if (newStrength != UCOL_TOK_UNSET) {
michael@0 1092 goto EndOfLoop;
michael@0 1093 }
michael@0 1094
michael@0 1095 /* if we start with strength, we'll reset to top */
michael@0 1096 if(startOfRules == TRUE) {
michael@0 1097 src->parsedToken.indirectIndex = 5;
michael@0 1098 top = ucol_tok_doSetTop(src, status);
michael@0 1099 newStrength = UCOL_TOK_RESET;
michael@0 1100 goto EndOfLoop;
michael@0 1101 }
michael@0 1102 /* before this, do a scan to verify whether this is */
michael@0 1103 /* another strength */
michael@0 1104 if(*(src->current+1) == 0x003C) {
michael@0 1105 src->current++;
michael@0 1106 if(*(src->current+1) == 0x003C) {
michael@0 1107 src->current++; /* three in a row! */
michael@0 1108 newStrength = UCOL_TERTIARY;
michael@0 1109 } else { /* two in a row */
michael@0 1110 newStrength = UCOL_SECONDARY;
michael@0 1111 }
michael@0 1112 } else { /* just one */
michael@0 1113 newStrength = UCOL_PRIMARY;
michael@0 1114 }
michael@0 1115 if(*(src->current+1) == 0x002A) {/*'*'*/
michael@0 1116 src->current++;
michael@0 1117 src->isStarred = TRUE;
michael@0 1118 }
michael@0 1119 break;
michael@0 1120
michael@0 1121 case 0x0026/*'&'*/:
michael@0 1122 if (newStrength != UCOL_TOK_UNSET) {
michael@0 1123 /**/
michael@0 1124 goto EndOfLoop;
michael@0 1125 }
michael@0 1126
michael@0 1127 newStrength = UCOL_TOK_RESET; /* PatternEntry::RESET = 0 */
michael@0 1128 break;
michael@0 1129
michael@0 1130 case 0x005b/*'['*/:
michael@0 1131 /* options - read an option, analyze it */
michael@0 1132 if(u_strchr(src->current, 0x005d /*']'*/) != NULL) {
michael@0 1133 uint8_t result = ucol_uprv_tok_readAndSetOption(src, status);
michael@0 1134 if(U_SUCCESS(*status)) {
michael@0 1135 if(result & UCOL_TOK_TOP) {
michael@0 1136 if(newStrength == UCOL_TOK_RESET) {
michael@0 1137 top = ucol_tok_doSetTop(src, status);
michael@0 1138 if(before) { // This is a combination of before and indirection like '&[before 2][first regular]<b'
michael@0 1139 src->parsedToken.charsLen+=2;
michael@0 1140 buff[0] = 0x002d;
michael@0 1141 buff[1] = before;
michael@0 1142 ucol_tok_addToExtraCurrent(src, buff, 2, status);
michael@0 1143 }
michael@0 1144
michael@0 1145 src->current++;
michael@0 1146 goto EndOfLoop;
michael@0 1147 } else {
michael@0 1148 *status = U_INVALID_FORMAT_ERROR;
michael@0 1149 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
michael@0 1150 DBG_FORMAT_ERROR
michael@0 1151 }
michael@0 1152 } else if(result & UCOL_TOK_VARIABLE_TOP) {
michael@0 1153 if(newStrength != UCOL_TOK_RESET && newStrength != UCOL_TOK_UNSET) {
michael@0 1154 variableTop = TRUE;
michael@0 1155 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
michael@0 1156 src->parsedToken.charsLen = 1;
michael@0 1157 buff[0] = 0xFFFF;
michael@0 1158 ucol_tok_addToExtraCurrent(src, buff, 1, status);
michael@0 1159 src->current++;
michael@0 1160 goto EndOfLoop;
michael@0 1161 } else {
michael@0 1162 *status = U_INVALID_FORMAT_ERROR;
michael@0 1163 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
michael@0 1164 DBG_FORMAT_ERROR
michael@0 1165 }
michael@0 1166 } else if (result & UCOL_TOK_BEFORE){
michael@0 1167 if(newStrength == UCOL_TOK_RESET) {
michael@0 1168 before = result & UCOL_TOK_BEFORE;
michael@0 1169 } else {
michael@0 1170 *status = U_INVALID_FORMAT_ERROR;
michael@0 1171 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
michael@0 1172 DBG_FORMAT_ERROR
michael@0 1173 }
michael@0 1174 }
michael@0 1175 } else {
michael@0 1176 *status = U_INVALID_FORMAT_ERROR;
michael@0 1177 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
michael@0 1178 DBG_FORMAT_ERROR
michael@0 1179 return NULL;
michael@0 1180 }
michael@0 1181 }
michael@0 1182 break;
michael@0 1183 case 0x0021/*! skip java thai modifier reordering*/:
michael@0 1184 break;
michael@0 1185 case 0x002F/*'/'*/:
michael@0 1186 wasInQuote = FALSE; /* if we were copying source characters, we want to stop now */
michael@0 1187 inChars = FALSE; /* we're now processing expansion */
michael@0 1188 break;
michael@0 1189 case 0x005C /* back slash for escaped chars */:
michael@0 1190 isEscaped = TRUE;
michael@0 1191 break;
michael@0 1192 /* found a quote, we're gonna start copying */
michael@0 1193 case 0x0027/*'\''*/:
michael@0 1194 if (newStrength == UCOL_TOK_UNSET) { /* quote is illegal until we have a strength */
michael@0 1195 *status = U_INVALID_FORMAT_ERROR;
michael@0 1196 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
michael@0 1197 DBG_FORMAT_ERROR
michael@0 1198 return NULL;
michael@0 1199 // enabling rules to start with a non-token character a < b
michael@0 1200 // newStrength = UCOL_TOK_RESET;
michael@0 1201 }
michael@0 1202
michael@0 1203 inQuote = TRUE;
michael@0 1204
michael@0 1205 if(inChars) { /* we're doing characters */
michael@0 1206 if(wasInQuote == FALSE) {
michael@0 1207 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
michael@0 1208 }
michael@0 1209 if (src->parsedToken.charsLen != 0) {
michael@0 1210 ucol_tok_addToExtraCurrent(src, src->current - src->parsedToken.charsLen, src->parsedToken.charsLen, status);
michael@0 1211 }
michael@0 1212 src->parsedToken.charsLen++;
michael@0 1213 } else { /* we're doing an expansion */
michael@0 1214 if(wasInQuote == FALSE) {
michael@0 1215 extensionOffset = (uint32_t)(src->extraCurrent - src->source);
michael@0 1216 }
michael@0 1217 if (newExtensionLen != 0) {
michael@0 1218 ucol_tok_addToExtraCurrent(src, src->current - newExtensionLen, newExtensionLen, status);
michael@0 1219 }
michael@0 1220 newExtensionLen++;
michael@0 1221 }
michael@0 1222
michael@0 1223 wasInQuote = TRUE;
michael@0 1224
michael@0 1225 ch = *(++(src->current));
michael@0 1226 if(ch == 0x0027) { /* copy the double quote */
michael@0 1227 ucol_tok_addToExtraCurrent(src, &ch, 1, status);
michael@0 1228 inQuote = FALSE;
michael@0 1229 }
michael@0 1230 break;
michael@0 1231
michael@0 1232 /* '@' is french only if the strength is not currently set */
michael@0 1233 /* if it is, it's just a regular character in collation rules */
michael@0 1234 case 0x0040/*'@'*/:
michael@0 1235 if (newStrength == UCOL_TOK_UNSET) {
michael@0 1236 src->opts->frenchCollation = UCOL_ON;
michael@0 1237 break;
michael@0 1238 }
michael@0 1239
michael@0 1240 case 0x007C /*|*/: /* this means we have actually been reading prefix part */
michael@0 1241 // we want to store read characters to the prefix part and continue reading
michael@0 1242 // the characters (proper way would be to restart reading the chars, but in
michael@0 1243 // that case we would have to complicate the token hasher, which I do not
michael@0 1244 // intend to play with. Instead, we will do prefixes when prefixes are due
michael@0 1245 // (before adding the elements).
michael@0 1246 src->parsedToken.prefixOffset = src->parsedToken.charsOffset;
michael@0 1247 src->parsedToken.prefixLen = src->parsedToken.charsLen;
michael@0 1248
michael@0 1249 if(inChars) { /* we're doing characters */
michael@0 1250 if(wasInQuote == FALSE) {
michael@0 1251 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
michael@0 1252 }
michael@0 1253 if (src->parsedToken.charsLen != 0) {
michael@0 1254 ucol_tok_addToExtraCurrent(src, src->current - src->parsedToken.charsLen, src->parsedToken.charsLen, status);
michael@0 1255 }
michael@0 1256 src->parsedToken.charsLen++;
michael@0 1257 }
michael@0 1258
michael@0 1259 wasInQuote = TRUE;
michael@0 1260
michael@0 1261 do {
michael@0 1262 ch = *(++(src->current));
michael@0 1263 // skip whitespace between '|' and the character
michael@0 1264 } while (PatternProps::isWhiteSpace(ch));
michael@0 1265 break;
michael@0 1266
michael@0 1267 //charsOffset = 0;
michael@0 1268 //newCharsLen = 0;
michael@0 1269 //break; // We want to store the whole prefix/character sequence. If we break
michael@0 1270 // the '|' is going to get lost.
michael@0 1271
michael@0 1272 case 0x002D /*-*/: /* A range. */
michael@0 1273 if (newStrength != UCOL_TOK_UNSET) {
michael@0 1274 // While processing the pending token, the isStarred field
michael@0 1275 // is reset, so it needs to be saved for the next
michael@0 1276 // invocation.
michael@0 1277 src->savedIsStarred = src->isStarred;
michael@0 1278 goto EndOfLoop;
michael@0 1279 }
michael@0 1280 src->isStarred = src->savedIsStarred;
michael@0 1281
michael@0 1282 // Ranges are valid only in starred tokens.
michael@0 1283 if (!src->isStarred) {
michael@0 1284 *status = U_INVALID_FORMAT_ERROR;
michael@0 1285 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
michael@0 1286 DBG_FORMAT_ERROR
michael@0 1287 return NULL;
michael@0 1288 }
michael@0 1289 newStrength = src->parsedToken.strength;
michael@0 1290 src->inRange = TRUE;
michael@0 1291 break;
michael@0 1292
michael@0 1293 case 0x0023 /*#*/: /* this is a comment, skip everything through the end of line */
michael@0 1294 do {
michael@0 1295 ch = *(++(src->current));
michael@0 1296 } while (!isCharNewLine(ch));
michael@0 1297
michael@0 1298 break;
michael@0 1299 default:
michael@0 1300 if (newStrength == UCOL_TOK_UNSET) {
michael@0 1301 *status = U_INVALID_FORMAT_ERROR;
michael@0 1302 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
michael@0 1303 DBG_FORMAT_ERROR
michael@0 1304 return NULL;
michael@0 1305 }
michael@0 1306
michael@0 1307 if (ucol_tok_isSpecialChar(ch) && (inQuote == FALSE)) {
michael@0 1308 *status = U_INVALID_FORMAT_ERROR;
michael@0 1309 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
michael@0 1310 DBG_FORMAT_ERROR
michael@0 1311 return NULL;
michael@0 1312 }
michael@0 1313
michael@0 1314 if(ch == 0x0000 && src->current+1 == src->end) {
michael@0 1315 break;
michael@0 1316 }
michael@0 1317
michael@0 1318 if (inChars) {
michael@0 1319 if(src->parsedToken.charsLen == 0) {
michael@0 1320 src->parsedToken.charsOffset = (uint32_t)(src->current - src->source);
michael@0 1321 }
michael@0 1322 src->parsedToken.charsLen++;
michael@0 1323 } else {
michael@0 1324 if(newExtensionLen == 0) {
michael@0 1325 extensionOffset = (uint32_t)(src->current - src->source);
michael@0 1326 }
michael@0 1327 newExtensionLen++;
michael@0 1328 }
michael@0 1329
michael@0 1330 break;
michael@0 1331 }
michael@0 1332 }
michael@0 1333 }
michael@0 1334
michael@0 1335 if(wasInQuote) {
michael@0 1336 if(ch != 0x27) {
michael@0 1337 if(inQuote || !PatternProps::isWhiteSpace(ch)) {
michael@0 1338 ucol_tok_addToExtraCurrent(src, &ch, 1, status);
michael@0 1339 }
michael@0 1340 }
michael@0 1341 }
michael@0 1342
michael@0 1343 src->current++;
michael@0 1344 }
michael@0 1345
michael@0 1346 EndOfLoop:
michael@0 1347 wasInQuote = FALSE;
michael@0 1348 if (newStrength == UCOL_TOK_UNSET) {
michael@0 1349 return NULL;
michael@0 1350 }
michael@0 1351
michael@0 1352 if (src->parsedToken.charsLen == 0 && top == FALSE) {
michael@0 1353 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
michael@0 1354 *status = U_INVALID_FORMAT_ERROR;
michael@0 1355 DBG_FORMAT_ERROR
michael@0 1356 return NULL;
michael@0 1357 }
michael@0 1358
michael@0 1359 src->parsedToken.strength = newStrength;
michael@0 1360 src->parsedToken.extensionOffset = extensionOffset;
michael@0 1361 src->parsedToken.extensionLen = newExtensionLen;
michael@0 1362 src->parsedToken.flags = (UCOL_TOK_VARIABLE_TOP * (variableTop?1:0)) | (UCOL_TOK_TOP * (top?1:0)) | before;
michael@0 1363
michael@0 1364 return src->current;
michael@0 1365 }
michael@0 1366
michael@0 1367 /*
michael@0 1368 * Parses the next token, keeps the indices in src->parsedToken, and updates the counters.
michael@0 1369 * @see ucol_tok_parseNextTokenInternal() for the description of what operators are supported.
michael@0 1370 *
michael@0 1371 * In addition to what ucol_tok_parseNextTokenInternal() does, this function does the following:
michael@0 1372 * 1) ucol_tok_parseNextTokenInternal() returns a range as a single token. This function separates
michael@0 1373 * it to separate tokens and returns one by one. In order to do that, the necessary states are
michael@0 1374 * cached as member variables of the token parser.
michael@0 1375 * 2) When encountering a range, ucol_tok_parseNextTokenInternal() processes characters up to the
michael@0 1376 * starting character as a single list token (which is separated into individual characters here)
michael@0 1377 * and as another list token starting with the last character in the range. Before expanding it
michael@0 1378 * as a list of tokens, this function expands the range by filling the intermediate characters and
michael@0 1379 * returns them one by one as separate tokens.
michael@0 1380 * Necessary checks are done for invalid combinations.
michael@0 1381 */
michael@0 1382 U_CAPI const UChar* U_EXPORT2
michael@0 1383 ucol_tok_parseNextToken(UColTokenParser *src,
michael@0 1384 UBool startOfRules,
michael@0 1385 UParseError *parseError,
michael@0 1386 UErrorCode *status)
michael@0 1387 {
michael@0 1388 const UChar *nextToken;
michael@0 1389
michael@0 1390 if (src->inRange) {
michael@0 1391 // We are not done processing a range. Continue it.
michael@0 1392 return ucol_tok_processNextCodePointInRange(src, status);
michael@0 1393 } else if (src->isStarred) {
michael@0 1394 // We are not done processing a starred token. Continue it.
michael@0 1395 return ucol_tok_processNextTokenInStarredList(src);
michael@0 1396 }
michael@0 1397
michael@0 1398 // Get the next token.
michael@0 1399 nextToken = ucol_tok_parseNextTokenInternal(src, startOfRules, parseError, status);
michael@0 1400
michael@0 1401 if (nextToken == NULL) {
michael@0 1402 return NULL;
michael@0 1403 }
michael@0 1404
michael@0 1405 if (src->inRange) {
michael@0 1406 // A new range has started.
michael@0 1407 // Check whether it is a chain of ranges with more than one hyphen.
michael@0 1408 if (src->lastRangeCp > 0 && src->lastRangeCp == src->previousCp) {
michael@0 1409 *status = U_INVALID_FORMAT_ERROR;
michael@0 1410 syntaxError(src->source,src->parsedToken.charsOffset-1,
michael@0 1411 src->parsedToken.charsOffset+src->parsedToken.charsLen, parseError);
michael@0 1412 DBG_FORMAT_ERROR
michael@0 1413 return NULL;
michael@0 1414 }
michael@0 1415
michael@0 1416 // The current token indicates the second code point of the range.
michael@0 1417 // Process just that, and then proceed with the star.
michael@0 1418 src->currentStarredCharIndex = src->parsedToken.charsOffset;
michael@0 1419 U16_NEXT(src->source, src->currentStarredCharIndex,
michael@0 1420 (uint32_t)(src->end - src->source), src->lastRangeCp);
michael@0 1421 if (src->lastRangeCp <= src->previousCp) {
michael@0 1422 *status = U_INVALID_FORMAT_ERROR;
michael@0 1423 syntaxError(src->source,src->parsedToken.charsOffset-1,
michael@0 1424 src->parsedToken.charsOffset+src->parsedToken.charsLen,parseError);
michael@0 1425 DBG_FORMAT_ERROR
michael@0 1426 return NULL;
michael@0 1427 }
michael@0 1428
michael@0 1429 // Set current range code point to process the range loop
michael@0 1430 src->currentRangeCp = src->previousCp + 1;
michael@0 1431
michael@0 1432 src->lastStarredCharIndex = src->parsedToken.charsOffset + src->parsedToken.charsLen - 1;
michael@0 1433
michael@0 1434 return ucol_tok_processNextCodePointInRange(src, status);
michael@0 1435 } else if (src->isStarred) {
michael@0 1436 // We define two indices m_currentStarredCharIndex_ and m_lastStarredCharIndex_ so that
michael@0 1437 // [m_currentStarredCharIndex_ .. m_lastStarredCharIndex_], both inclusive, need to be
michael@0 1438 // separated into several tokens and returned.
michael@0 1439 src->currentStarredCharIndex = src->parsedToken.charsOffset;
michael@0 1440 src->lastStarredCharIndex = src->parsedToken.charsOffset + src->parsedToken.charsLen - 1;
michael@0 1441
michael@0 1442 return ucol_tok_processNextTokenInStarredList(src);
michael@0 1443 } else {
michael@0 1444 // Set previous codepoint
michael@0 1445 U16_GET(src->source, 0, src->parsedToken.charsOffset, (uint32_t)(src->end - src->source), src->previousCp);
michael@0 1446 }
michael@0 1447 return nextToken;
michael@0 1448 }
michael@0 1449
michael@0 1450
michael@0 1451 /*
michael@0 1452 Processing Description
michael@0 1453 1 Build a ListList. Each list has a header, which contains two lists (positive
michael@0 1454 and negative), a reset token, a baseCE, nextCE, and previousCE. The lists and
michael@0 1455 reset may be null.
michael@0 1456 2 As you process, you keep a LAST pointer that points to the last token you
michael@0 1457 handled.
michael@0 1458
michael@0 1459 */
michael@0 1460
michael@0 1461 static UColToken *ucol_tok_initAReset(UColTokenParser *src, const UChar *expand, uint32_t *expandNext,
michael@0 1462 UParseError *parseError, UErrorCode *status)
michael@0 1463 {
michael@0 1464 if(src->resultLen == src->listCapacity) {
michael@0 1465 // Unfortunately, this won't work, as we store addresses of lhs in token
michael@0 1466 src->listCapacity *= 2;
michael@0 1467 src->lh = (UColTokListHeader *)uprv_realloc(src->lh, src->listCapacity*sizeof(UColTokListHeader));
michael@0 1468 if(src->lh == NULL) {
michael@0 1469 *status = U_MEMORY_ALLOCATION_ERROR;
michael@0 1470 return NULL;
michael@0 1471 }
michael@0 1472 }
michael@0 1473 /* do the reset thing */
michael@0 1474 UColToken *sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken));
michael@0 1475 /* test for NULL */
michael@0 1476 if (sourceToken == NULL) {
michael@0 1477 *status = U_MEMORY_ALLOCATION_ERROR;
michael@0 1478 return NULL;
michael@0 1479 }
michael@0 1480 sourceToken->rulesToParseHdl = &(src->source);
michael@0 1481 sourceToken->source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;
michael@0 1482 sourceToken->expansion = src->parsedToken.extensionLen << 24 | src->parsedToken.extensionOffset;
michael@0 1483
michael@0 1484 sourceToken->debugSource = *(src->source + src->parsedToken.charsOffset);
michael@0 1485 sourceToken->debugExpansion = *(src->source + src->parsedToken.extensionOffset);
michael@0 1486
michael@0 1487 // keep the flags around so that we know about before
michael@0 1488 sourceToken->flags = src->parsedToken.flags;
michael@0 1489
michael@0 1490 if(src->parsedToken.prefixOffset != 0) {
michael@0 1491 // this is a syntax error
michael@0 1492 *status = U_INVALID_FORMAT_ERROR;
michael@0 1493 syntaxError(src->source,src->parsedToken.charsOffset-1,src->parsedToken.charsOffset+src->parsedToken.charsLen,parseError);
michael@0 1494 DBG_FORMAT_ERROR
michael@0 1495 uprv_free(sourceToken);
michael@0 1496 return 0;
michael@0 1497 } else {
michael@0 1498 sourceToken->prefix = 0;
michael@0 1499 }
michael@0 1500
michael@0 1501 sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should also handle reverse */
michael@0 1502 sourceToken->strength = UCOL_TOK_RESET;
michael@0 1503 sourceToken->next = NULL;
michael@0 1504 sourceToken->previous = NULL;
michael@0 1505 sourceToken->noOfCEs = 0;
michael@0 1506 sourceToken->noOfExpCEs = 0;
michael@0 1507 sourceToken->listHeader = &src->lh[src->resultLen];
michael@0 1508
michael@0 1509 src->lh[src->resultLen].first = NULL;
michael@0 1510 src->lh[src->resultLen].last = NULL;
michael@0 1511 src->lh[src->resultLen].first = NULL;
michael@0 1512 src->lh[src->resultLen].last = NULL;
michael@0 1513
michael@0 1514 src->lh[src->resultLen].reset = sourceToken;
michael@0 1515
michael@0 1516 /*
michael@0 1517 3 Consider each item: relation, source, and expansion: e.g. ...< x / y ...
michael@0 1518 First convert all expansions into normal form. Examples:
michael@0 1519 If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c *
michael@0 1520 d * ... into &x * c/y * d * ...
michael@0 1521 Note: reset values can never have expansions, although they can cause the
michael@0 1522 very next item to have one. They may be contractions, if they are found
michael@0 1523 earlier in the list.
michael@0 1524 */
michael@0 1525 *expandNext = 0;
michael@0 1526 if(expand != NULL) {
michael@0 1527 /* check to see if there is an expansion */
michael@0 1528 if(src->parsedToken.charsLen > 1) {
michael@0 1529 uint32_t resetCharsOffset;
michael@0 1530 resetCharsOffset = (uint32_t)(expand - src->source);
michael@0 1531 sourceToken->source = ((resetCharsOffset - src->parsedToken.charsOffset ) << 24) | src->parsedToken.charsOffset;
michael@0 1532 *expandNext = ((src->parsedToken.charsLen + src->parsedToken.charsOffset - resetCharsOffset)<<24) | (resetCharsOffset);
michael@0 1533 }
michael@0 1534 }
michael@0 1535
michael@0 1536 src->resultLen++;
michael@0 1537
michael@0 1538 uhash_put(src->tailored, sourceToken, sourceToken, status);
michael@0 1539
michael@0 1540 return sourceToken;
michael@0 1541 }
michael@0 1542
/**
 * Builds the reset token used for a [before N] reset whose anchor has to be
 * derived from the UCA rather than from an already tailored list.
 *
 * The CEs of the anchor character are read from src->UCA; the CEs preceding
 * them are then computed (implicit-range arithmetic, or ucol_inv_getPrevCE
 * plus a strength adjustment), stored as baseCE/baseContCE of the list header
 * at index src->resultLen, and a reset token is created for that header with
 * ucol_tok_initAReset().
 *
 * @param src          Parser state; src->parsedToken, src->extraCurrent and
 *                     src->lh[src->resultLen] are modified.
 * @param sourceToken  If non-NULL, its source offset selects the anchor
 *                     character; otherwise src->parsedToken.charsOffset is
 *                     used. The incoming value is not used for anything else.
 * @param strength     Requested "before" strength.
 * @param parseError   Passed through to ucol_tok_initAReset().
 * @param status       Checked on entry and after iterator initialization.
 * @return NULL if *status is a failure at either check; otherwise the token
 *         found in src->tailored (implicit branch only) or the result of
 *         ucol_tok_initAReset().
 */
michael@0 1543 static
michael@0 1544 inline UColToken *getVirginBefore(UColTokenParser *src, UColToken *sourceToken, uint8_t strength, UParseError *parseError, UErrorCode *status) {
michael@0 1545 if(U_FAILURE(*status)) {
michael@0 1546 return NULL;
michael@0 1547 }
michael@0 1548 /* this is a virgin before - we need to fish the anchor from the UCA */
michael@0 1549 collIterate s;
michael@0 1550 uint32_t baseCE = UCOL_NOT_FOUND, baseContCE = UCOL_NOT_FOUND;
michael@0 1551 uint32_t CE, SecondCE;
michael@0 1552 // uint32_t invPos;
// Iterate over exactly one UChar of the anchor: the low 24 bits of a token's
// source field are its offset into src->source.
michael@0 1553 if(sourceToken != NULL) {
michael@0 1554 uprv_init_collIterate(src->UCA, src->source+((sourceToken->source)&0xFFFFFF), 1, &s, status);
michael@0 1555 } else {
michael@0 1556 uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset /**charsOffset*/, 1, &s, status);
michael@0 1557 }
michael@0 1558 if(U_FAILURE(*status)) {
michael@0 1559 return NULL;
michael@0 1560 }
michael@0 1561
// First CE with the 0xC0 bits of its lowest byte cleared, then the following
// CE (0 if the iterator has no more CEs).
// NOTE(review): status is not re-checked after these two calls.
michael@0 1562 baseCE = ucol_getNextCE(src->UCA, &s, status) & 0xFFFFFF3F;
michael@0 1563 baseContCE = ucol_getNextCE(src->UCA, &s, status);
michael@0 1564 if(baseContCE == UCOL_NO_MORE_CES) {
michael@0 1565 baseContCE = 0;
michael@0 1566 }
michael@0 1567
michael@0 1568
// The UCA constants live inside the UCA image at the byte offset stored in image->UCAConsts.
michael@0 1569 UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
michael@0 1570 uint32_t ch = 0;
michael@0 1571 uint32_t expandNext = 0;
michael@0 1572 UColToken key;
michael@0 1573
// Branch 1: the top byte of baseCE lies within [UCA_PRIMARY_IMPLICIT_MIN, UCA_PRIMARY_IMPLICIT_MAX].
michael@0 1574 if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
// Combine both CEs into one primary, convert it to its raw value and build
// the code point and the CE pair for raw-1.
michael@0 1575 uint32_t primary = (baseCE & UCOL_PRIMARYMASK) | ((baseContCE & UCOL_PRIMARYMASK) >> 16);
michael@0 1576 uint32_t raw = uprv_uca_getRawFromImplicit(primary);
michael@0 1577 ch = uprv_uca_getCodePointFromRaw(raw-1);
michael@0 1578 uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw-1);
michael@0 1579 CE = (primaryCE & UCOL_PRIMARYMASK) | 0x0505;
michael@0 1580 SecondCE = ((primaryCE << 16) & UCOL_PRIMARYMASK) | UCOL_CONTINUATION_MARKER;
michael@0 1581
// Re-point the parsed token at a synthetic name appended to the extra
// buffer: 0xFFFE followed by ch.
// NOTE(review): ch is narrowed to a single UChar, so a value above 0xFFFF
// would be truncated - confirm the range uprv_uca_getCodePointFromRaw can return.
// NOTE(review): no check against the end of the extra buffer is visible here.
michael@0 1582 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
michael@0 1583 *src->extraCurrent++ = 0xFFFE;
michael@0 1584 *src->extraCurrent++ = (UChar)ch;
michael@0 1585 src->parsedToken.charsLen++;
michael@0 1586
// Look the synthetic name up among the tokens already tailored.
michael@0 1587 key.source = (src->parsedToken.charsLen/**newCharsLen*/ << 24) | src->parsedToken.charsOffset/**charsOffset*/;
michael@0 1588 key.rulesToParseHdl = &(src->source);
michael@0 1589
michael@0 1590 //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key);
michael@0 1591 sourceToken = (UColToken *)uhash_get(src->tailored, &key);
michael@0 1592
// Not tailored yet: fill the next list header and create its reset token.
michael@0 1593 if(sourceToken == NULL) {
michael@0 1594 src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F;
michael@0 1595 if(isContinuation(SecondCE)) {
michael@0 1596 src->lh[src->resultLen].baseContCE = SecondCE;
michael@0 1597 } else {
michael@0 1598 src->lh[src->resultLen].baseContCE = 0;
michael@0 1599 }
michael@0 1600 src->lh[src->resultLen].nextCE = 0;
michael@0 1601 src->lh[src->resultLen].nextContCE = 0;
michael@0 1602 src->lh[src->resultLen].previousCE = 0;
michael@0 1603 src->lh[src->resultLen].previousContCE = 0;
michael@0 1604
michael@0 1605 src->lh[src->resultLen].indirect = FALSE;
michael@0 1606
michael@0 1607 sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
michael@0 1608 }
michael@0 1609
// Branch 2: anything outside the implicit range - take the previous CE from the inverse UCA.
michael@0 1610 } else {
michael@0 1611 /* invPos = */ ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &SecondCE, strength);
michael@0 1612
michael@0 1613 // we got the previous CE. Now we need to see if the difference between
michael@0 1614 // the two CEs is really of the requested strength.
michael@0 1615 // if it's a bigger difference (we asked for secondary and got primary), we
michael@0 1616 // need to modify the CE.
michael@0 1617 if(ucol_getCEStrengthDifference(baseCE, baseContCE, CE, SecondCE) < strength) {
michael@0 1618 // adjust the strength
michael@0 1619 // now we are in the situation where our baseCE should actually be modified in
michael@0 1620 // order to get the CE in the right position.
michael@0 1621 if(strength == UCOL_SECONDARY) {
michael@0 1622 CE = baseCE - 0x0200;
michael@0 1623 } else { // strength == UCOL_TERTIARY
michael@0 1624 CE = baseCE - 0x02;
michael@0 1625 }
michael@0 1626 if(baseContCE) {
michael@0 1627 if(strength == UCOL_SECONDARY) {
michael@0 1628 SecondCE = baseContCE - 0x0200;
michael@0 1629 } else { // strength == UCOL_TERTIARY
michael@0 1630 SecondCE = baseContCE - 0x02;
michael@0 1631 }
michael@0 1632 }
michael@0 1633 }
michael@0 1634
// The region below is compiled out; it is kept for the explanation of why a
// code point lookup through the inverse table was abandoned.
michael@0 1635 #if 0
michael@0 1636 // the code below relies on getting a code point from the inverse table, in order to be
michael@0 1637 // able to merge the situations like &x < 9 &[before 1]a < d. This won't work:
michael@0 1638 // 1. There are many code points that have the same CE
michael@0 1639 // 2. The CE to codepoint table (things pointed to by CETable[3*invPos+2] are broken.
michael@0 1640 // Also, in case when there is no equivalent strength before an element, we have to actually
michael@0 1641 // construct one. For example, &[before 2]a << x won't result in x << a, because the element
michael@0 1642 // before a is a primary difference.
michael@0 1643
michael@0 1644 //uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
michael@0 1645
michael@0 1646
michael@0 1647 ch = CETable[3*invPos+2];
michael@0 1648
michael@0 1649 if((ch & UCOL_INV_SIZEMASK) != 0) {
michael@0 1650 uint16_t *conts = (uint16_t *)((uint8_t *)src->invUCA+src->invUCA->conts);
michael@0 1651 uint32_t offset = (ch & UCOL_INV_OFFSETMASK);
michael@0 1652 ch = conts[offset];
michael@0 1653 }
michael@0 1654
michael@0 1655 *src->extraCurrent++ = (UChar)ch;
michael@0 1656 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source - 1);
michael@0 1657 src->parsedToken.charsLen = 1;
michael@0 1658
michael@0 1659 // We got an UCA before. However, this might have been tailored.
michael@0 1660 // example:
michael@0 1661 // &\u30ca = \u306a
michael@0 1662 // &[before 3]\u306a<<<\u306a|\u309d
michael@0 1663
michael@0 1664
michael@0 1665 // uint32_t key = (*newCharsLen << 24) | *charsOffset;
michael@0 1666 key.source = (src->parsedToken.charsLen/**newCharsLen*/ << 24) | src->parsedToken.charsOffset/**charsOffset*/;
michael@0 1667 key.rulesToParseHdl = &(src->source);
michael@0 1668
michael@0 1669 //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key);
michael@0 1670 sourceToken = (UColToken *)uhash_get(src->tailored, &key);
michael@0 1671 #endif
michael@0 1672
michael@0 1673 // here is how it should be. The situation such as &[before 1]a < x, should be
michael@0 1674 // resolved exactly as if we wrote &a > x.
michael@0 1675 // therefore, I don't really care if the UCA value before a has been changed.
michael@0 1676 // However, I do care if the strength between my element and the previous element
michael@0 1677 // is bigger than I wanted. So, if CE < baseCE and I wanted &[before 2], then I'll
michael@0 1678 // have to construct the base CE.
michael@0 1679
michael@0 1680
michael@0 1681
michael@0 1682 // if we found a tailored thing, we have to use the UCA value and construct
michael@0 1683 // a new reset token with constructed name
michael@0 1684 //if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) {
michael@0 1685 // character to which we want to anchor is already tailored.
michael@0 1686 // We need to construct a new token which will be the anchor
michael@0 1687 // point
michael@0 1688 //*(src->extraCurrent-1) = 0xFFFE;
michael@0 1689 //*src->extraCurrent++ = (UChar)ch;
michael@0 1690 // grab before
// NOTE(review): widening the parsed token by 10 UChars to the left presumably
// gives the new reset a source key distinct from the plain anchor (the "grab
// before" above) - confirm; charsOffset is assumed to be at least 10 here.
michael@0 1691 src->parsedToken.charsOffset -= 10;
michael@0 1692 src->parsedToken.charsLen += 10;
// Fill the next list header from the computed CE pair and create its reset token.
michael@0 1693 src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F;
michael@0 1694 if(isContinuation(SecondCE)) {
michael@0 1695 src->lh[src->resultLen].baseContCE = SecondCE;
michael@0 1696 } else {
michael@0 1697 src->lh[src->resultLen].baseContCE = 0;
michael@0 1698 }
michael@0 1699 src->lh[src->resultLen].nextCE = 0;
michael@0 1700 src->lh[src->resultLen].nextContCE = 0;
michael@0 1701 src->lh[src->resultLen].previousCE = 0;
michael@0 1702 src->lh[src->resultLen].previousContCE = 0;
michael@0 1703
michael@0 1704 src->lh[src->resultLen].indirect = FALSE;
michael@0 1705
michael@0 1706 sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
michael@0 1707 //}
michael@0 1708 }
michael@0 1709
michael@0 1710 return sourceToken;
michael@0 1711
michael@0 1712 }
michael@0 1713
michael@0 1714 uint32_t ucol_tok_assembleTokenList(UColTokenParser *src, UParseError *parseError, UErrorCode *status) {
michael@0 1715 UColToken *lastToken = NULL;
michael@0 1716 const UChar *parseEnd = NULL;
michael@0 1717 uint32_t expandNext = 0;
michael@0 1718 UBool variableTop = FALSE;
michael@0 1719 UBool top = FALSE;
michael@0 1720 uint16_t specs = 0;
michael@0 1721 UColTokListHeader *ListList = NULL;
michael@0 1722
michael@0 1723 src->parsedToken.strength = UCOL_TOK_UNSET;
michael@0 1724
michael@0 1725 ListList = src->lh;
michael@0 1726
michael@0 1727 if(U_FAILURE(*status)) {
michael@0 1728 return 0;
michael@0 1729 }
michael@0 1730 #ifdef DEBUG_FOR_CODE_POINTS
michael@0 1731 char filename[35];
michael@0 1732 sprintf(filename, "/tmp/debug_for_cp_%09d.txt", getpid());
michael@0 1733 dfcp_fp = fopen(filename, "a");
michael@0 1734 fprintf(stdout, "Output is in the file %s.\n", filename);
michael@0 1735 #endif
michael@0 1736
michael@0 1737 #ifdef DEBUG_FOR_COLL_RULES
michael@0 1738 std::string s3;
michael@0 1739 UnicodeString(src->source).toUTF8String(s3);
michael@0 1740 std::cout << "src->source = " << s3 << std::endl;
michael@0 1741 #endif
michael@0 1742
michael@0 1743 while(src->current < src->end || src->isStarred) {
michael@0 1744 src->parsedToken.prefixOffset = 0;
michael@0 1745
michael@0 1746 parseEnd = ucol_tok_parseNextToken(src,
michael@0 1747 (UBool)(lastToken == NULL),
michael@0 1748 parseError,
michael@0 1749 status);
michael@0 1750
michael@0 1751 specs = src->parsedToken.flags;
michael@0 1752
michael@0 1753
michael@0 1754 variableTop = ((specs & UCOL_TOK_VARIABLE_TOP) != 0);
michael@0 1755 top = ((specs & UCOL_TOK_TOP) != 0);
michael@0 1756
michael@0 1757 if(U_SUCCESS(*status) && parseEnd != NULL) {
michael@0 1758 UColToken *sourceToken = NULL;
michael@0 1759 //uint32_t key = 0;
michael@0 1760 uint32_t lastStrength = UCOL_TOK_UNSET;
michael@0 1761
michael@0 1762 if(lastToken != NULL ) {
michael@0 1763 lastStrength = lastToken->strength;
michael@0 1764 }
michael@0 1765
michael@0 1766 #ifdef DEBUG_FOR_CODE_POINTS
michael@0 1767 UChar32 cp;
michael@0 1768 U16_GET(src->source, 0, src->parsedToken.charsOffset, (uint32_t)(src->extraEnd - src->source), cp);
michael@0 1769 fprintf(dfcp_fp, "Code point = %x, Strength = %x\n", cp, src->parsedToken.strength);
michael@0 1770 #endif
michael@0 1771 //key = newCharsLen << 24 | charsOffset;
michael@0 1772 UColToken key;
michael@0 1773 key.source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;
michael@0 1774 key.rulesToParseHdl = &(src->source);
michael@0 1775
michael@0 1776 /* 4 Lookup each source in the CharsToToken map, and find a sourceToken */
michael@0 1777 sourceToken = (UColToken *)uhash_get(src->tailored, &key);
michael@0 1778
michael@0 1779 if(src->parsedToken.strength != UCOL_TOK_RESET) {
michael@0 1780 if(lastToken == NULL) { /* this means that rules haven't started properly */
michael@0 1781 *status = U_INVALID_FORMAT_ERROR;
michael@0 1782 syntaxError(src->source,0,(int32_t)(src->end-src->source),parseError);
michael@0 1783 DBG_FORMAT_ERROR
michael@0 1784 return 0;
michael@0 1785 }
michael@0 1786 /* 6 Otherwise (when relation != reset) */
michael@0 1787 if(sourceToken == NULL) {
michael@0 1788 /* If sourceToken is null, create new one, */
michael@0 1789 sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken));
michael@0 1790 /* test for NULL */
michael@0 1791 if (sourceToken == NULL) {
michael@0 1792 *status = U_MEMORY_ALLOCATION_ERROR;
michael@0 1793 return 0;
michael@0 1794 }
michael@0 1795 sourceToken->rulesToParseHdl = &(src->source);
michael@0 1796 sourceToken->source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;
michael@0 1797
michael@0 1798 sourceToken->debugSource = *(src->source + src->parsedToken.charsOffset);
michael@0 1799
michael@0 1800 sourceToken->prefix = src->parsedToken.prefixLen << 24 | src->parsedToken.prefixOffset;
michael@0 1801 sourceToken->debugPrefix = *(src->source + src->parsedToken.prefixOffset);
michael@0 1802
michael@0 1803 sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should also handle reverse */
michael@0 1804 sourceToken->next = NULL;
michael@0 1805 sourceToken->previous = NULL;
michael@0 1806 sourceToken->noOfCEs = 0;
michael@0 1807 sourceToken->noOfExpCEs = 0;
michael@0 1808 // keep the flags around so that we know about before
michael@0 1809 sourceToken->flags = src->parsedToken.flags;
michael@0 1810 uhash_put(src->tailored, sourceToken, sourceToken, status);
michael@0 1811 if(U_FAILURE(*status)) {
michael@0 1812 return 0;
michael@0 1813 }
michael@0 1814 } else {
michael@0 1815 /* we could have fished out a reset here */
michael@0 1816 if(sourceToken->strength != UCOL_TOK_RESET && lastToken != sourceToken) {
michael@0 1817 /* otherwise remove sourceToken from where it was. */
michael@0 1818 if(sourceToken->next != NULL) {
michael@0 1819 if(sourceToken->next->strength > sourceToken->strength) {
michael@0 1820 sourceToken->next->strength = sourceToken->strength;
michael@0 1821 }
michael@0 1822 sourceToken->next->previous = sourceToken->previous;
michael@0 1823 } else {
michael@0 1824 sourceToken->listHeader->last = sourceToken->previous;
michael@0 1825 }
michael@0 1826
michael@0 1827 if(sourceToken->previous != NULL) {
michael@0 1828 sourceToken->previous->next = sourceToken->next;
michael@0 1829 } else {
michael@0 1830 sourceToken->listHeader->first = sourceToken->next;
michael@0 1831 }
michael@0 1832 sourceToken->next = NULL;
michael@0 1833 sourceToken->previous = NULL;
michael@0 1834 }
michael@0 1835 }
michael@0 1836
michael@0 1837 sourceToken->strength = src->parsedToken.strength;
michael@0 1838 sourceToken->listHeader = lastToken->listHeader;
michael@0 1839
michael@0 1840 /*
michael@0 1841 1. Find the strongest strength in each list, and set strongestP and strongestN
michael@0 1842 accordingly in the headers.
michael@0 1843 */
michael@0 1844 if(lastStrength == UCOL_TOK_RESET
michael@0 1845 || sourceToken->listHeader->first == 0) {
michael@0 1846 /* If LAST is a reset
michael@0 1847 insert sourceToken in the list. */
michael@0 1848 if(sourceToken->listHeader->first == 0) {
michael@0 1849 sourceToken->listHeader->first = sourceToken;
michael@0 1850 sourceToken->listHeader->last = sourceToken;
michael@0 1851 } else { /* we need to find a place for us */
michael@0 1852 /* and we'll get in front of the same strength */
michael@0 1853 if(sourceToken->listHeader->first->strength <= sourceToken->strength) {
michael@0 1854 sourceToken->next = sourceToken->listHeader->first;
michael@0 1855 sourceToken->next->previous = sourceToken;
michael@0 1856 sourceToken->listHeader->first = sourceToken;
michael@0 1857 sourceToken->previous = NULL;
michael@0 1858 } else {
michael@0 1859 lastToken = sourceToken->listHeader->first;
michael@0 1860 while(lastToken->next != NULL && lastToken->next->strength > sourceToken->strength) {
michael@0 1861 lastToken = lastToken->next;
michael@0 1862 }
michael@0 1863 if(lastToken->next != NULL) {
michael@0 1864 lastToken->next->previous = sourceToken;
michael@0 1865 } else {
michael@0 1866 sourceToken->listHeader->last = sourceToken;
michael@0 1867 }
michael@0 1868 sourceToken->previous = lastToken;
michael@0 1869 sourceToken->next = lastToken->next;
michael@0 1870 lastToken->next = sourceToken;
michael@0 1871 }
michael@0 1872 }
michael@0 1873 } else {
michael@0 1874 /* Otherwise (when LAST is not a reset)
michael@0 1875 if polarity (LAST) == polarity(relation), insert sourceToken after LAST,
michael@0 1876 otherwise insert before.
michael@0 1877 when inserting after or before, search to the next position with the same
michael@0 1878 strength in that direction. (This is called postpone insertion). */
michael@0 1879 if(sourceToken != lastToken) {
michael@0 1880 if(lastToken->polarity == sourceToken->polarity) {
michael@0 1881 while(lastToken->next != NULL && lastToken->next->strength > sourceToken->strength) {
michael@0 1882 lastToken = lastToken->next;
michael@0 1883 }
michael@0 1884 sourceToken->previous = lastToken;
michael@0 1885 if(lastToken->next != NULL) {
michael@0 1886 lastToken->next->previous = sourceToken;
michael@0 1887 } else {
michael@0 1888 sourceToken->listHeader->last = sourceToken;
michael@0 1889 }
michael@0 1890
michael@0 1891 sourceToken->next = lastToken->next;
michael@0 1892 lastToken->next = sourceToken;
michael@0 1893 } else {
michael@0 1894 while(lastToken->previous != NULL && lastToken->previous->strength > sourceToken->strength) {
michael@0 1895 lastToken = lastToken->previous;
michael@0 1896 }
michael@0 1897 sourceToken->next = lastToken;
michael@0 1898 if(lastToken->previous != NULL) {
michael@0 1899 lastToken->previous->next = sourceToken;
michael@0 1900 } else {
michael@0 1901 sourceToken->listHeader->first = sourceToken;
michael@0 1902 }
michael@0 1903 sourceToken->previous = lastToken->previous;
michael@0 1904 lastToken->previous = sourceToken;
michael@0 1905 }
michael@0 1906 } else { /* repeated one thing twice in rules, stay with the stronger strength */
michael@0 1907 if(lastStrength < sourceToken->strength) {
michael@0 1908 sourceToken->strength = lastStrength;
michael@0 1909 }
michael@0 1910 }
michael@0 1911 }
michael@0 1912
michael@0 1913 /* if the token was a variable top, we're gonna put it in */
michael@0 1914 if(variableTop == TRUE && src->varTop == NULL) {
michael@0 1915 variableTop = FALSE;
michael@0 1916 src->varTop = sourceToken;
michael@0 1917 }
michael@0 1918
michael@0 1919 // Treat the expansions.
michael@0 1920 // There are two types of expansions: explicit (x / y) and reset based propagating expansions
michael@0 1921 // (&abc * d * e <=> &ab * d / c * e / c)
michael@0 1922 // if both of them are in effect for a token, they are combined.
michael@0 1923
michael@0 1924 sourceToken->expansion = src->parsedToken.extensionLen << 24 | src->parsedToken.extensionOffset;
michael@0 1925
michael@0 1926 if(expandNext != 0) {
michael@0 1927 if(sourceToken->strength == UCOL_PRIMARY) { /* primary strength kills off the implicit expansion */
michael@0 1928 expandNext = 0;
michael@0 1929 } else if(sourceToken->expansion == 0) { /* if there is no expansion, implicit is just added to the token */
michael@0 1930 sourceToken->expansion = expandNext;
michael@0 1931 } else { /* there is both explicit and implicit expansion. We need to make a combination */
michael@0 1932 uprv_memcpy(src->extraCurrent, src->source + (expandNext & 0xFFFFFF), (expandNext >> 24)*sizeof(UChar));
michael@0 1933 uprv_memcpy(src->extraCurrent+(expandNext >> 24), src->source + src->parsedToken.extensionOffset, src->parsedToken.extensionLen*sizeof(UChar));
michael@0 1934 sourceToken->expansion = (uint32_t)(((expandNext >> 24) + src->parsedToken.extensionLen)<<24 | (uint32_t)(src->extraCurrent - src->source));
michael@0 1935 src->extraCurrent += (expandNext >> 24) + src->parsedToken.extensionLen;
michael@0 1936 }
michael@0 1937 }
michael@0 1938
michael@0 1939 // This is just for debugging purposes
michael@0 1940 if(sourceToken->expansion != 0) {
michael@0 1941 sourceToken->debugExpansion = *(src->source + src->parsedToken.extensionOffset);
michael@0 1942 } else {
michael@0 1943 sourceToken->debugExpansion = 0;
michael@0 1944 }
michael@0 1945 // if the previous token was a reset before, the strength of this
michael@0 1946 // token must match the strength of before. Otherwise we have an
michael@0 1947 // undefined situation.
michael@0 1948 // In other words, we currently have a cludge which we use to
michael@0 1949 // represent &a >> x. This is written as &[before 2]a << x.
michael@0 1950 if((lastToken->flags & UCOL_TOK_BEFORE) != 0) {
michael@0 1951 uint8_t beforeStrength = (lastToken->flags & UCOL_TOK_BEFORE) - 1;
michael@0 1952 if(beforeStrength != sourceToken->strength) {
michael@0 1953 *status = U_INVALID_FORMAT_ERROR;
michael@0 1954 syntaxError(src->source,0,(int32_t)(src->end-src->source),parseError);
michael@0 1955 DBG_FORMAT_ERROR
michael@0 1956 return 0;
michael@0 1957 }
michael@0 1958 }
michael@0 1959 } else {
michael@0 1960 if(lastToken != NULL && lastStrength == UCOL_TOK_RESET) {
michael@0 1961 /* if the previous token was also a reset, */
michael@0 1962 /*this means that we have two consecutive resets */
michael@0 1963 /* and we want to remove the previous one if empty*/
michael@0 1964 if(src->resultLen > 0 && ListList[src->resultLen-1].first == NULL) {
michael@0 1965 src->resultLen--;
michael@0 1966 }
michael@0 1967 }
michael@0 1968
michael@0 1969 if(sourceToken == NULL) { /* this is a reset, but it might still be somewhere in the tailoring, in shorter form */
michael@0 1970 uint32_t searchCharsLen = src->parsedToken.charsLen;
michael@0 1971 while(searchCharsLen > 1 && sourceToken == NULL) {
michael@0 1972 searchCharsLen--;
michael@0 1973 //key = searchCharsLen << 24 | charsOffset;
michael@0 1974 UColToken key;
michael@0 1975 key.source = searchCharsLen << 24 | src->parsedToken.charsOffset;
michael@0 1976 key.rulesToParseHdl = &(src->source);
michael@0 1977 sourceToken = (UColToken *)uhash_get(src->tailored, &key);
michael@0 1978 }
michael@0 1979 if(sourceToken != NULL) {
michael@0 1980 expandNext = (src->parsedToken.charsLen - searchCharsLen) << 24 | (src->parsedToken.charsOffset + searchCharsLen);
michael@0 1981 }
michael@0 1982 }
michael@0 1983
michael@0 1984 if((specs & UCOL_TOK_BEFORE) != 0) { /* we're doing before */
michael@0 1985 if(top == FALSE) { /* there is no indirection */
michael@0 1986 uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1;
michael@0 1987 if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) {
michael@0 1988 /* this is a before that is already ordered in the UCA - so we need to get the previous with good strength */
michael@0 1989 while(sourceToken->strength > strength && sourceToken->previous != NULL) {
michael@0 1990 sourceToken = sourceToken->previous;
michael@0 1991 }
michael@0 1992 /* here, either we hit the strength or NULL */
michael@0 1993 if(sourceToken->strength == strength) {
michael@0 1994 if(sourceToken->previous != NULL) {
michael@0 1995 sourceToken = sourceToken->previous;
michael@0 1996 } else { /* start of list */
michael@0 1997 sourceToken = sourceToken->listHeader->reset;
michael@0 1998 }
michael@0 1999 } else { /* we hit NULL */
michael@0 2000 /* we should be doing the else part */
michael@0 2001 sourceToken = sourceToken->listHeader->reset;
michael@0 2002 sourceToken = getVirginBefore(src, sourceToken, strength, parseError, status);
michael@0 2003 }
michael@0 2004 } else {
michael@0 2005 sourceToken = getVirginBefore(src, sourceToken, strength, parseError, status);
michael@0 2006 }
michael@0 2007 } else { /* this is both before and indirection */
michael@0 2008 top = FALSE;
michael@0 2009 ListList[src->resultLen].previousCE = 0;
michael@0 2010 ListList[src->resultLen].previousContCE = 0;
michael@0 2011 ListList[src->resultLen].indirect = TRUE;
michael@0 2012 /* we need to do slightly more work. we need to get the baseCE using the */
michael@0 2013 /* inverse UCA & getPrevious. The next bound is not set, and will be decided */
michael@0 2014 /* in ucol_bld */
michael@0 2015 uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1;
michael@0 2016 uint32_t baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE;
michael@0 2017 uint32_t baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE;//&0xFFFFFF3F;
michael@0 2018 uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND;
michael@0 2019
michael@0 2020 UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
michael@0 2021 if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) &&
michael@0 2022 (baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
michael@0 2023 uint32_t primary = (baseCE & UCOL_PRIMARYMASK) | ((baseContCE & UCOL_PRIMARYMASK) >> 16);
michael@0 2024 uint32_t raw = uprv_uca_getRawFromImplicit(primary);
michael@0 2025 uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw-1);
michael@0 2026 CE = (primaryCE & UCOL_PRIMARYMASK) | 0x0505;
michael@0 2027 SecondCE = ((primaryCE << 16) & UCOL_PRIMARYMASK) | UCOL_CONTINUATION_MARKER;
michael@0 2028 } else {
michael@0 2029 /*int32_t invPos = ucol_inv_getPrevCE(baseCE, baseContCE, &CE, &SecondCE, strength);*/
michael@0 2030 ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &SecondCE, strength);
michael@0 2031 }
michael@0 2032
michael@0 2033 ListList[src->resultLen].baseCE = CE;
michael@0 2034 ListList[src->resultLen].baseContCE = SecondCE;
michael@0 2035 ListList[src->resultLen].nextCE = 0;
michael@0 2036 ListList[src->resultLen].nextContCE = 0;
michael@0 2037
michael@0 2038 sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
michael@0 2039 }
michael@0 2040 }
michael@0 2041
michael@0 2042
michael@0 2043 /* 5 If the relation is a reset:
michael@0 2044 If sourceToken is null
michael@0 2045 Create new list, create new sourceToken, make the baseCE from source, put
michael@0 2046 the sourceToken in ListHeader of the new list */
michael@0 2047 if(sourceToken == NULL) {
michael@0 2048 /*
michael@0 2049 3 Consider each item: relation, source, and expansion: e.g. ...< x / y ...
michael@0 2050 First convert all expansions into normal form. Examples:
michael@0 2051 If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c *
michael@0 2052 d * ... into &x * c/y * d * ...
michael@0 2053 Note: reset values can never have expansions, although they can cause the
michael@0 2054 very next item to have one. They may be contractions, if they are found
michael@0 2055 earlier in the list.
michael@0 2056 */
michael@0 2057 if(top == FALSE) {
michael@0 2058 collIterate s;
michael@0 2059 uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND;
michael@0 2060
michael@0 2061 uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset, src->parsedToken.charsLen, &s, status);
michael@0 2062
michael@0 2063 CE = ucol_getNextCE(src->UCA, &s, status);
michael@0 2064 const UChar *expand = s.pos;
michael@0 2065 SecondCE = ucol_getNextCE(src->UCA, &s, status);
michael@0 2066
michael@0 2067 ListList[src->resultLen].baseCE = CE & 0xFFFFFF3F;
michael@0 2068 if(isContinuation(SecondCE)) {
michael@0 2069 ListList[src->resultLen].baseContCE = SecondCE;
michael@0 2070 } else {
michael@0 2071 ListList[src->resultLen].baseContCE = 0;
michael@0 2072 }
michael@0 2073 ListList[src->resultLen].nextCE = 0;
michael@0 2074 ListList[src->resultLen].nextContCE = 0;
michael@0 2075 ListList[src->resultLen].previousCE = 0;
michael@0 2076 ListList[src->resultLen].previousContCE = 0;
michael@0 2077 ListList[src->resultLen].indirect = FALSE;
michael@0 2078 sourceToken = ucol_tok_initAReset(src, expand, &expandNext, parseError, status);
michael@0 2079 } else { /* top == TRUE */
michael@0 2080 /* just use the supplied values */
michael@0 2081 top = FALSE;
michael@0 2082 ListList[src->resultLen].previousCE = 0;
michael@0 2083 ListList[src->resultLen].previousContCE = 0;
michael@0 2084 ListList[src->resultLen].indirect = TRUE;
michael@0 2085 ListList[src->resultLen].baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE;
michael@0 2086 ListList[src->resultLen].baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE;
michael@0 2087 ListList[src->resultLen].nextCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitCE;
michael@0 2088 ListList[src->resultLen].nextContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitContCE;
michael@0 2089
michael@0 2090 sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
michael@0 2091
michael@0 2092 }
michael@0 2093 } else { /* reset to something already in rules */
michael@0 2094 top = FALSE;
michael@0 2095 }
michael@0 2096 }
michael@0 2097 /* 7 After all this, set LAST to point to sourceToken, and goto step 3. */
michael@0 2098 lastToken = sourceToken;
michael@0 2099 } else {
michael@0 2100 if(U_FAILURE(*status)) {
michael@0 2101 return 0;
michael@0 2102 }
michael@0 2103 }
michael@0 2104 }
michael@0 2105 #ifdef DEBUG_FOR_CODE_POINTS
michael@0 2106 fclose(dfcp_fp);
michael@0 2107 #endif
michael@0 2108
michael@0 2109
michael@0 2110 if(src->resultLen > 0 && ListList[src->resultLen-1].first == NULL) {
michael@0 2111 src->resultLen--;
michael@0 2112 }
michael@0 2113 return src->resultLen;
michael@0 2114 }
michael@0 2115
michael@0 2116 const UChar* ucol_tok_getRulesFromBundle(
michael@0 2117 void* /*context*/,
michael@0 2118 const char* locale,
michael@0 2119 const char* type,
michael@0 2120 int32_t* pLength,
michael@0 2121 UErrorCode* status)
michael@0 2122 {
michael@0 2123 const UChar* rules = NULL;
michael@0 2124 UResourceBundle* bundle;
michael@0 2125 UResourceBundle* collations;
michael@0 2126 UResourceBundle* collation;
michael@0 2127
michael@0 2128 *pLength = 0;
michael@0 2129
michael@0 2130 bundle = ures_open(U_ICUDATA_COLL, locale, status);
michael@0 2131 if(U_SUCCESS(*status)){
michael@0 2132 collations = ures_getByKey(bundle, "collations", NULL, status);
michael@0 2133 if(U_SUCCESS(*status)){
michael@0 2134 collation = ures_getByKey(collations, type, NULL, status);
michael@0 2135 if(U_SUCCESS(*status)){
michael@0 2136 rules = ures_getStringByKey(collation, "Sequence", pLength, status);
michael@0 2137 if(U_FAILURE(*status)){
michael@0 2138 *pLength = 0;
michael@0 2139 rules = NULL;
michael@0 2140 }
michael@0 2141 ures_close(collation);
michael@0 2142 }
michael@0 2143 ures_close(collations);
michael@0 2144 }
michael@0 2145 }
michael@0 2146
michael@0 2147 ures_close(bundle);
michael@0 2148
michael@0 2149 return rules;
michael@0 2150 }
michael@0 2151
michael@0 2152 void ucol_tok_initTokenList(
michael@0 2153 UColTokenParser *src,
michael@0 2154 const UChar *rules,
michael@0 2155 uint32_t rulesLength,
michael@0 2156 const UCollator *UCA,
michael@0 2157 GetCollationRulesFunction importFunc,
michael@0 2158 void* context,
michael@0 2159 UErrorCode *status) {
michael@0 2160 U_NAMESPACE_USE
michael@0 2161
michael@0 2162 uint32_t nSize = 0;
michael@0 2163 uint32_t estimatedSize = (2*rulesLength+UCOL_TOK_EXTRA_RULE_SPACE_SIZE);
michael@0 2164
michael@0 2165 bool needToDeallocRules = false;
michael@0 2166
michael@0 2167 if(U_FAILURE(*status)) {
michael@0 2168 return;
michael@0 2169 }
michael@0 2170
michael@0 2171 // set everything to zero, so that we can clean up gracefully
michael@0 2172 uprv_memset(src, 0, sizeof(UColTokenParser));
michael@0 2173
michael@0 2174 // first we need to find options that don't like to be normalized,
michael@0 2175 // like copy and remove...
michael@0 2176 //const UChar *openBrace = rules;
michael@0 2177 int32_t optionNumber = -1;
michael@0 2178 const UChar *setStart = NULL;
michael@0 2179 uint32_t i = 0;
michael@0 2180 while(i < rulesLength) {
michael@0 2181 if(rules[i] == 0x005B) { // '[': start of an option
michael@0 2182 /* Gets the following:
michael@0 2183 optionNumber: The index of the option.
michael@0 2184 setStart: The pointer at which the option arguments start.
michael@0 2185 */
michael@0 2186 optionNumber = ucol_uprv_tok_readOption(rules+i+1, rules+rulesLength, &setStart);
michael@0 2187
michael@0 2188 if(optionNumber == OPTION_OPTIMIZE) { /* copy - parts of UCA to tailoring */
michael@0 2189 // [optimize]
michael@0 2190 USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rules+rulesLength, status);
michael@0 2191 if(U_SUCCESS(*status)) {
michael@0 2192 if(src->copySet == NULL) {
michael@0 2193 src->copySet = newSet;
michael@0 2194 } else {
michael@0 2195 uset_addAll(src->copySet, newSet);
michael@0 2196 uset_close(newSet);
michael@0 2197 }
michael@0 2198 } else {
michael@0 2199 return;
michael@0 2200 }
michael@0 2201 } else if(optionNumber == OPTION_SUPPRESS_CONTRACTIONS) {
michael@0 2202 USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rules+rulesLength, status);
michael@0 2203 if(U_SUCCESS(*status)) {
michael@0 2204 if(src->removeSet == NULL) {
michael@0 2205 src->removeSet = newSet;
michael@0 2206 } else {
michael@0 2207 uset_addAll(src->removeSet, newSet);
michael@0 2208 uset_close(newSet);
michael@0 2209 }
michael@0 2210 } else {
michael@0 2211 return;
michael@0 2212 }
michael@0 2213 } else if(optionNumber == OPTION_IMPORT){
michael@0 2214 // [import <collation-name>]
michael@0 2215
michael@0 2216 // Find the address of the closing ].
michael@0 2217 UChar* import_end = u_strchr(setStart, 0x005D);
michael@0 2218 int32_t optionEndOffset = (int32_t)(import_end + 1 - rules);
michael@0 2219 // Ignore trailing whitespace.
michael@0 2220 while(PatternProps::isWhiteSpace(*(import_end-1))) {
michael@0 2221 --import_end;
michael@0 2222 }
michael@0 2223
michael@0 2224 int32_t optionLength = (int32_t)(import_end - setStart);
michael@0 2225 char option[50];
michael@0 2226 if(optionLength >= (int32_t)sizeof(option)) {
michael@0 2227 *status = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 2228 return;
michael@0 2229 }
michael@0 2230 u_UCharsToChars(setStart, option, optionLength);
michael@0 2231 option[optionLength] = 0;
michael@0 2232
michael@0 2233 *status = U_ZERO_ERROR;
michael@0 2234 char locale[50];
michael@0 2235 int32_t templ;
michael@0 2236 uloc_forLanguageTag(option, locale, (int32_t)sizeof(locale), &templ, status);
michael@0 2237 if(U_FAILURE(*status)) {
michael@0 2238 *status = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 2239 return;
michael@0 2240 }
michael@0 2241
michael@0 2242 char type[50];
michael@0 2243 if (uloc_getKeywordValue(locale, "collation", type, (int32_t)sizeof(type), status) <= 0 ||
michael@0 2244 U_FAILURE(*status)
michael@0 2245 ) {
michael@0 2246 *status = U_ZERO_ERROR;
michael@0 2247 uprv_strcpy(type, "standard");
michael@0 2248 }
michael@0 2249
michael@0 2250 // TODO: Use public functions when available, see ticket #8134.
michael@0 2251 char *keywords = (char *)locale_getKeywordsStart(locale);
michael@0 2252 if(keywords != NULL) {
michael@0 2253 *keywords = 0;
michael@0 2254 }
michael@0 2255
michael@0 2256 int32_t importRulesLength = 0;
michael@0 2257 const UChar* importRules = importFunc(context, locale, type, &importRulesLength, status);
michael@0 2258
michael@0 2259 #ifdef DEBUG_FOR_COLL_RULES
michael@0 2260 std::string s;
michael@0 2261 UnicodeString(importRules).toUTF8String(s);
michael@0 2262 std::cout << "Import rules = " << s << std::endl;
michael@0 2263 #endif
michael@0 2264
michael@0 2265 // Add the length of the imported rules to length of the original rules,
michael@0 2266 // and subtract the length of the import option.
michael@0 2267 uint32_t newRulesLength = rulesLength + importRulesLength - (optionEndOffset - i);
michael@0 2268
michael@0 2269 UChar* newRules = (UChar*)uprv_malloc(newRulesLength*sizeof(UChar));
michael@0 2270
michael@0 2271 #ifdef DEBUG_FOR_COLL_RULES
michael@0 2272 std::string s1;
michael@0 2273 UnicodeString(rules).toUTF8String(s1);
michael@0 2274 std::cout << "Original rules = " << s1 << std::endl;
michael@0 2275 #endif
michael@0 2276
michael@0 2277
michael@0 2278 // Copy the section of the original rules leading up to the import
michael@0 2279 uprv_memcpy(newRules, rules, i*sizeof(UChar));
michael@0 2280 // Copy the imported rules
michael@0 2281 uprv_memcpy(newRules+i, importRules, importRulesLength*sizeof(UChar));
michael@0 2282 // Copy the rest of the original rules (minus the import option itself)
michael@0 2283 uprv_memcpy(newRules+i+importRulesLength,
michael@0 2284 rules+optionEndOffset,
michael@0 2285 (rulesLength-optionEndOffset)*sizeof(UChar));
michael@0 2286
michael@0 2287 #ifdef DEBUG_FOR_COLL_RULES
michael@0 2288 std::string s2;
michael@0 2289 UnicodeString(newRules).toUTF8String(s2);
michael@0 2290 std::cout << "Resulting rules = " << s2 << std::endl;
michael@0 2291 #endif
michael@0 2292
michael@0 2293 if(needToDeallocRules){
michael@0 2294 // if needToDeallocRules is set, then we allocated rules, so it's safe to cast and free
michael@0 2295 uprv_free((void*)rules);
michael@0 2296 }
michael@0 2297 needToDeallocRules = true;
michael@0 2298 rules = newRules;
michael@0 2299 rulesLength = newRulesLength;
michael@0 2300
michael@0 2301 estimatedSize += importRulesLength*2;
michael@0 2302
michael@0 2303 // First character of the new rules needs to be processed
michael@0 2304 i--;
michael@0 2305 }
michael@0 2306 }
michael@0 2307 //openBrace++;
michael@0 2308 i++;
michael@0 2309 }
michael@0 2310
michael@0 2311 src->source = (UChar *)uprv_malloc(estimatedSize*sizeof(UChar));
michael@0 2312 /* test for NULL */
michael@0 2313 if (src->source == NULL) {
michael@0 2314 *status = U_MEMORY_ALLOCATION_ERROR;
michael@0 2315 return;
michael@0 2316 }
michael@0 2317 uprv_memset(src->source, 0, estimatedSize*sizeof(UChar));
michael@0 2318 nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, estimatedSize, status);
michael@0 2319 if(nSize > estimatedSize || *status == U_BUFFER_OVERFLOW_ERROR) {
michael@0 2320 *status = U_ZERO_ERROR;
michael@0 2321 src->source = (UChar *)uprv_realloc(src->source, (nSize+UCOL_TOK_EXTRA_RULE_SPACE_SIZE)*sizeof(UChar));
michael@0 2322 /* test for NULL */
michael@0 2323 if (src->source == NULL) {
michael@0 2324 *status = U_MEMORY_ALLOCATION_ERROR;
michael@0 2325 return;
michael@0 2326 }
michael@0 2327 nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, nSize+UCOL_TOK_EXTRA_RULE_SPACE_SIZE, status);
michael@0 2328 }
michael@0 2329 if(needToDeallocRules){
michael@0 2330 // if needToDeallocRules is set, then we allocated rules, so it's safe to cast and free
michael@0 2331 uprv_free((void*)rules);
michael@0 2332 }
michael@0 2333
michael@0 2334
michael@0 2335 src->current = src->source;
michael@0 2336 src->end = src->source+nSize;
michael@0 2337 src->sourceCurrent = src->source;
michael@0 2338 src->extraCurrent = src->end+1; // Preserve terminating zero in the rule string so that option scanning works correctly
michael@0 2339 src->extraEnd = src->source+estimatedSize; //src->end+UCOL_TOK_EXTRA_RULE_SPACE_SIZE;
michael@0 2340 src->varTop = NULL;
michael@0 2341 src->UCA = UCA;
michael@0 2342 src->invUCA = ucol_initInverseUCA(status);
michael@0 2343 src->parsedToken.charsLen = 0;
michael@0 2344 src->parsedToken.charsOffset = 0;
michael@0 2345 src->parsedToken.extensionLen = 0;
michael@0 2346 src->parsedToken.extensionOffset = 0;
michael@0 2347 src->parsedToken.prefixLen = 0;
michael@0 2348 src->parsedToken.prefixOffset = 0;
michael@0 2349 src->parsedToken.flags = 0;
michael@0 2350 src->parsedToken.strength = UCOL_TOK_UNSET;
michael@0 2351 src->buildCCTabFlag = FALSE;
michael@0 2352 src->isStarred = FALSE;
michael@0 2353 src->inRange = FALSE;
michael@0 2354 src->lastRangeCp = 0;
michael@0 2355 src->previousCp = 0;
michael@0 2356
michael@0 2357 if(U_FAILURE(*status)) {
michael@0 2358 return;
michael@0 2359 }
michael@0 2360 src->tailored = uhash_open(uhash_hashTokens, uhash_compareTokens, NULL, status);
michael@0 2361 if(U_FAILURE(*status)) {
michael@0 2362 return;
michael@0 2363 }
michael@0 2364 uhash_setValueDeleter(src->tailored, uprv_free);
michael@0 2365
michael@0 2366 src->opts = (UColOptionSet *)uprv_malloc(sizeof(UColOptionSet));
michael@0 2367 /* test for NULL */
michael@0 2368 if (src->opts == NULL) {
michael@0 2369 *status = U_MEMORY_ALLOCATION_ERROR;
michael@0 2370 return;
michael@0 2371 }
michael@0 2372
michael@0 2373 uprv_memcpy(src->opts, UCA->options, sizeof(UColOptionSet));
michael@0 2374
michael@0 2375 src->lh = 0;
michael@0 2376 src->listCapacity = 1024;
michael@0 2377 src->lh = (UColTokListHeader *)uprv_malloc(src->listCapacity*sizeof(UColTokListHeader));
michael@0 2378 //Test for NULL
michael@0 2379 if (src->lh == NULL) {
michael@0 2380 *status = U_MEMORY_ALLOCATION_ERROR;
michael@0 2381 return;
michael@0 2382 }
michael@0 2383 uprv_memset(src->lh, 0, src->listCapacity*sizeof(UColTokListHeader));
michael@0 2384 src->resultLen = 0;
michael@0 2385
michael@0 2386 UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
michael@0 2387
michael@0 2388 // UCOL_RESET_TOP_VALUE
michael@0 2389 setIndirectBoundaries(0, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IMPLICIT);
michael@0 2390 // UCOL_FIRST_PRIMARY_IGNORABLE
michael@0 2391 setIndirectBoundaries(1, consts->UCA_FIRST_PRIMARY_IGNORABLE, 0);
michael@0 2392 // UCOL_LAST_PRIMARY_IGNORABLE
michael@0 2393 setIndirectBoundaries(2, consts->UCA_LAST_PRIMARY_IGNORABLE, 0);
michael@0 2394 // UCOL_FIRST_SECONDARY_IGNORABLE
michael@0 2395 setIndirectBoundaries(3, consts->UCA_FIRST_SECONDARY_IGNORABLE, 0);
michael@0 2396 // UCOL_LAST_SECONDARY_IGNORABLE
michael@0 2397 setIndirectBoundaries(4, consts->UCA_LAST_SECONDARY_IGNORABLE, 0);
michael@0 2398 // UCOL_FIRST_TERTIARY_IGNORABLE
michael@0 2399 setIndirectBoundaries(5, consts->UCA_FIRST_TERTIARY_IGNORABLE, 0);
michael@0 2400 // UCOL_LAST_TERTIARY_IGNORABLE
michael@0 2401 setIndirectBoundaries(6, consts->UCA_LAST_TERTIARY_IGNORABLE, 0);
michael@0 2402 // UCOL_FIRST_VARIABLE
michael@0 2403 setIndirectBoundaries(7, consts->UCA_FIRST_VARIABLE, 0);
michael@0 2404 // UCOL_LAST_VARIABLE
michael@0 2405 setIndirectBoundaries(8, consts->UCA_LAST_VARIABLE, 0);
michael@0 2406 // UCOL_FIRST_NON_VARIABLE
michael@0 2407 setIndirectBoundaries(9, consts->UCA_FIRST_NON_VARIABLE, 0);
michael@0 2408 // UCOL_LAST_NON_VARIABLE
michael@0 2409 setIndirectBoundaries(10, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IMPLICIT);
michael@0 2410 // UCOL_FIRST_IMPLICIT
michael@0 2411 setIndirectBoundaries(11, consts->UCA_FIRST_IMPLICIT, 0);
michael@0 2412 // UCOL_LAST_IMPLICIT
michael@0 2413 setIndirectBoundaries(12, consts->UCA_LAST_IMPLICIT, consts->UCA_FIRST_TRAILING);
michael@0 2414 // UCOL_FIRST_TRAILING
michael@0 2415 setIndirectBoundaries(13, consts->UCA_FIRST_TRAILING, 0);
michael@0 2416 // UCOL_LAST_TRAILING
michael@0 2417 setIndirectBoundaries(14, consts->UCA_LAST_TRAILING, 0);
michael@0 2418 ucolIndirectBoundaries[14].limitCE = (consts->UCA_PRIMARY_SPECIAL_MIN<<24);
michael@0 2419 }
michael@0 2420
michael@0 2421
michael@0 2422 void ucol_tok_closeTokenList(UColTokenParser *src) {
michael@0 2423 if(src->copySet != NULL) {
michael@0 2424 uset_close(src->copySet);
michael@0 2425 }
michael@0 2426 if(src->removeSet != NULL) {
michael@0 2427 uset_close(src->removeSet);
michael@0 2428 }
michael@0 2429 if(src->tailored != NULL) {
michael@0 2430 uhash_close(src->tailored);
michael@0 2431 }
michael@0 2432 if(src->lh != NULL) {
michael@0 2433 uprv_free(src->lh);
michael@0 2434 }
michael@0 2435 if(src->source != NULL) {
michael@0 2436 uprv_free(src->source);
michael@0 2437 }
michael@0 2438 if(src->opts != NULL) {
michael@0 2439 uprv_free(src->opts);
michael@0 2440 }
michael@0 2441 if (src->reorderCodes != NULL) {
michael@0 2442 uprv_free(src->reorderCodes);
michael@0 2443 }
michael@0 2444 }
michael@0 2445
michael@0 2446 #endif /* #if !UCONFIG_NO_COLLATION */

mercurial