Wed, 31 Dec 2014 07:22:50 +0100
Correct previous dual key logic pending first delivery installment.
michael@0 | 1 | /* |
michael@0 | 2 | ******************************************************************************* |
michael@0 | 3 | * |
michael@0 | 4 | * Copyright (C) 2001-2012, International Business Machines |
michael@0 | 5 | * Corporation and others. All Rights Reserved. |
michael@0 | 6 | * |
michael@0 | 7 | ******************************************************************************* |
michael@0 | 8 | * file name: ucol_tok.cpp |
michael@0 | 9 | * encoding: US-ASCII |
michael@0 | 10 | * tab size: 8 (not used) |
michael@0 | 11 | * indentation:4 |
michael@0 | 12 | * |
michael@0 | 13 | * created 02/22/2001 |
michael@0 | 14 | * created by: Vladimir Weinstein |
michael@0 | 15 | * |
michael@0 | 16 | * This module reads a tailoring rule string and produces a list of |
michael@0 | 17 | * tokens that will be turned into collation elements |
michael@0 | 18 | * |
michael@0 | 19 | */ |
michael@0 | 20 | |
michael@0 | 21 | #include "unicode/utypes.h" |
michael@0 | 22 | |
michael@0 | 23 | #if !UCONFIG_NO_COLLATION |
michael@0 | 24 | |
michael@0 | 25 | #include "unicode/uscript.h" |
michael@0 | 26 | #include "unicode/ustring.h" |
michael@0 | 27 | #include "unicode/uchar.h" |
michael@0 | 28 | #include "unicode/uniset.h" |
michael@0 | 29 | |
michael@0 | 30 | #include "cmemory.h" |
michael@0 | 31 | #include "cstring.h" |
michael@0 | 32 | #include "patternprops.h" |
michael@0 | 33 | #include "ucol_bld.h" |
michael@0 | 34 | #include "ucol_tok.h" |
michael@0 | 35 | #include "ulocimp.h" |
michael@0 | 36 | #include "uresimp.h" |
michael@0 | 37 | |
michael@0 | 38 | // Define this only for debugging. |
michael@0 | 39 | // #define DEBUG_FOR_COLL_RULES 1 |
michael@0 | 40 | |
michael@0 | 41 | #ifdef DEBUG_FOR_COLL_RULES |
michael@0 | 42 | #include <iostream> |
michael@0 | 43 | #endif |
michael@0 | 44 | |
michael@0 | 45 | U_NAMESPACE_USE |
michael@0 | 46 | |
michael@0 | 47 | U_CDECL_BEGIN |
michael@0 | 48 | static int32_t U_CALLCONV |
michael@0 | 49 | uhash_hashTokens(const UHashTok k) |
michael@0 | 50 | { |
michael@0 | 51 | int32_t hash = 0; |
michael@0 | 52 | //uint32_t key = (uint32_t)k.integer; |
michael@0 | 53 | UColToken *key = (UColToken *)k.pointer; |
michael@0 | 54 | if (key != 0) { |
michael@0 | 55 | int32_t len = (key->source & 0xFF000000)>>24; |
michael@0 | 56 | int32_t inc = ((len - 32) / 32) + 1; |
michael@0 | 57 | |
michael@0 | 58 | const UChar *p = (key->source & 0x00FFFFFF) + *(key->rulesToParseHdl); |
michael@0 | 59 | const UChar *limit = p + len; |
michael@0 | 60 | |
michael@0 | 61 | while (p<limit) { |
michael@0 | 62 | hash = (hash * 37) + *p; |
michael@0 | 63 | p += inc; |
michael@0 | 64 | } |
michael@0 | 65 | } |
michael@0 | 66 | return hash; |
michael@0 | 67 | } |
michael@0 | 68 | |
michael@0 | 69 | static UBool U_CALLCONV |
michael@0 | 70 | uhash_compareTokens(const UHashTok key1, const UHashTok key2) |
michael@0 | 71 | { |
michael@0 | 72 | //uint32_t p1 = (uint32_t) key1.integer; |
michael@0 | 73 | //uint32_t p2 = (uint32_t) key2.integer; |
michael@0 | 74 | UColToken *p1 = (UColToken *)key1.pointer; |
michael@0 | 75 | UColToken *p2 = (UColToken *)key2.pointer; |
michael@0 | 76 | const UChar *s1 = (p1->source & 0x00FFFFFF) + *(p1->rulesToParseHdl); |
michael@0 | 77 | const UChar *s2 = (p2->source & 0x00FFFFFF) + *(p2->rulesToParseHdl); |
michael@0 | 78 | uint32_t s1L = ((p1->source & 0xFF000000) >> 24); |
michael@0 | 79 | uint32_t s2L = ((p2->source & 0xFF000000) >> 24); |
michael@0 | 80 | const UChar *end = s1+s1L-1; |
michael@0 | 81 | |
michael@0 | 82 | if (p1 == p2) { |
michael@0 | 83 | return TRUE; |
michael@0 | 84 | } |
michael@0 | 85 | if (p1->source == 0 || p2->source == 0) { |
michael@0 | 86 | return FALSE; |
michael@0 | 87 | } |
michael@0 | 88 | if(s1L != s2L) { |
michael@0 | 89 | return FALSE; |
michael@0 | 90 | } |
michael@0 | 91 | if(p1->source == p2->source) { |
michael@0 | 92 | return TRUE; |
michael@0 | 93 | } |
michael@0 | 94 | while((s1 < end) && *s1 == *s2) { |
michael@0 | 95 | ++s1; |
michael@0 | 96 | ++s2; |
michael@0 | 97 | } |
michael@0 | 98 | if(*s1 == *s2) { |
michael@0 | 99 | return TRUE; |
michael@0 | 100 | } else { |
michael@0 | 101 | return FALSE; |
michael@0 | 102 | } |
michael@0 | 103 | } |
michael@0 | 104 | U_CDECL_END |
michael@0 | 105 | |
/*
 * DBG_FORMAT_ERROR prints the source line at which a U_INVALID_FORMAT_ERROR
 * was raised, to help pinpoint where a format error occurred.
 * (Including context-sensitive information in syntaxError() would be a
 * better long-term solution.)
 *
 * To enable it, either uncomment the #define below or pass
 * -DDEBUG_FOR_FORMAT_ERROR on the compile line.  When disabled the macro
 * expands to nothing.
 */
/* #define DEBUG_FOR_FORMAT_ERROR 1 */

#ifdef DEBUG_FOR_FORMAT_ERROR
#define DBG_FORMAT_ERROR { printf("U_INVALID_FORMAT_ERROR at line %d", __LINE__);}
#else
#define DBG_FORMAT_ERROR
#endif


/*
 * DEBUG_FOR_CODE_POINTS controls debug output that lets the parser's results
 * be compared before and after a big change: the information of every code
 * point that comes out of the collation parser, together with its strength,
 * is printed to a file.  After a big change in format, the files produced
 * before and after the change should be identical.
 *
 * To enable it, either uncomment the #define below or pass
 * -DDEBUG_FOR_CODE_POINTS on the compile line.
 */
// #define DEBUG_FOR_CODE_POINTS 1

#ifdef DEBUG_FOR_CODE_POINTS
    FILE* dfcp_fp = NULL;
#endif
michael@0 | 136 | |
michael@0 | 137 | |
/*
 * One entry of the indirect-positioning table: a start CE with its
 * continuation, and a limit CE with its continuation (both limit fields are
 * 0 when no limit was supplied).
 */
typedef struct {
    uint32_t startCE;
    uint32_t startContCE;
    uint32_t limitCE;
    uint32_t limitContCE;
} indirectBoundaries;

/* These values are used for finding CE values for indirect positioning. */
/* Indirect positioning is a mechanism for allowing resets on symbolic   */
/* values. It only works for resets, and you cannot tailor indirect      */
/* names. An indirect name can define either an anchor point or a range. */
/* An anchor point behaves in exactly the same way as a code point in a  */
/* reset would, except that it cannot be tailored. A range (we currently */
/* only know of the [top] range) will explicitly set the upper bound for */
/* generated CEs, thus allowing for better control over how many CEs can */
/* be squeezed into the range without performance penalty.               */
/* In that respect, we use [top] for tailoring of locales that use CJK   */
/* characters. Other indirect values are currently a pure convenience:   */
/* they can be used to ensure that the CEs will always be positioned in  */
/* the same place relative to a point with known properties (e.g. first  */
/* primary ignorable).                                                   */
static indirectBoundaries ucolIndirectBoundaries[15];
/*
Disabled static initializer:

static indirectBoundaries ucolIndirectBoundaries[11] = {
{ UCOL_RESET_TOP_VALUE,               0,
UCOL_NEXT_TOP_VALUE,                0 },
{ UCOL_FIRST_PRIMARY_IGNORABLE,       0,
0,                                  0 },
{ UCOL_LAST_PRIMARY_IGNORABLE,        UCOL_LAST_PRIMARY_IGNORABLE_CONT,
0,                                  0 },
{ UCOL_FIRST_SECONDARY_IGNORABLE,     0,
0,                                  0 },
{ UCOL_LAST_SECONDARY_IGNORABLE,      0,
0,                                  0 },
{ UCOL_FIRST_TERTIARY_IGNORABLE,      0,
0,                                  0 },
{ UCOL_LAST_TERTIARY_IGNORABLE,       0,
0,                                  0 },
{ UCOL_FIRST_VARIABLE,                0,
0,                                  0 },
{ UCOL_LAST_VARIABLE,                 0,
0,                                  0 },
{ UCOL_FIRST_NON_VARIABLE,            0,
0,                                  0 },
{ UCOL_LAST_NON_VARIABLE,             0,
0,                                  0 },
};
*/

/*
 * Stores one entry of ucolIndirectBoundaries.
 *
 * indexR  Slot to fill.  An index outside the table is ignored, so a bad
 *         index can no longer write past the end of the static array.
 * start   Two values: start CE and its continuation.  A NULL pointer is
 *         ignored (the entry is left unchanged).
 * end     Two values: limit CE and its continuation, or NULL to store 0
 *         for both limit fields.
 */
static void setIndirectBoundaries(uint32_t indexR, uint32_t *start, uint32_t *end) {
    const uint32_t capacity =
        (uint32_t)(sizeof(ucolIndirectBoundaries) / sizeof(ucolIndirectBoundaries[0]));

    if(indexR >= capacity || start == NULL) {
        return;
    }

    // Set values for the top - TODO: once we have values for all the indirects, we are going
    // to initialize here.
    indirectBoundaries *entry = &ucolIndirectBoundaries[indexR];
    entry->startCE = start[0];
    entry->startContCE = start[1];
    if(end != NULL) {
        entry->limitCE = end[0];
        entry->limitContCE = end[1];
    } else {
        entry->limitCE = 0;
        entry->limitContCE = 0;
    }
}
michael@0 | 201 | |
michael@0 | 202 | |
michael@0 | 203 | static inline |
michael@0 | 204 | void syntaxError(const UChar* rules, |
michael@0 | 205 | int32_t pos, |
michael@0 | 206 | int32_t rulesLen, |
michael@0 | 207 | UParseError* parseError) |
michael@0 | 208 | { |
michael@0 | 209 | parseError->offset = pos; |
michael@0 | 210 | parseError->line = 0 ; /* we are not using line numbers */ |
michael@0 | 211 | |
michael@0 | 212 | // for pre-context |
michael@0 | 213 | int32_t start = (pos < U_PARSE_CONTEXT_LEN)? 0 : (pos - (U_PARSE_CONTEXT_LEN-1)); |
michael@0 | 214 | int32_t stop = pos; |
michael@0 | 215 | |
michael@0 | 216 | u_memcpy(parseError->preContext,rules+start,stop-start); |
michael@0 | 217 | //null terminate the buffer |
michael@0 | 218 | parseError->preContext[stop-start] = 0; |
michael@0 | 219 | |
michael@0 | 220 | //for post-context |
michael@0 | 221 | start = pos+1; |
michael@0 | 222 | stop = ((pos+U_PARSE_CONTEXT_LEN)<= rulesLen )? (pos+(U_PARSE_CONTEXT_LEN-1)) : |
michael@0 | 223 | rulesLen; |
michael@0 | 224 | |
michael@0 | 225 | if(start < stop) { |
michael@0 | 226 | u_memcpy(parseError->postContext,rules+start,stop-start); |
michael@0 | 227 | //null terminate the buffer |
michael@0 | 228 | parseError->postContext[stop-start]= 0; |
michael@0 | 229 | } else { |
michael@0 | 230 | parseError->postContext[0] = 0; |
michael@0 | 231 | } |
michael@0 | 232 | } |
michael@0 | 233 | |
michael@0 | 234 | static |
michael@0 | 235 | void ucol_uprv_tok_setOptionInImage(UColOptionSet *opts, UColAttribute attrib, UColAttributeValue value) { |
michael@0 | 236 | switch(attrib) { |
michael@0 | 237 | case UCOL_HIRAGANA_QUATERNARY_MODE: |
michael@0 | 238 | opts->hiraganaQ = value; |
michael@0 | 239 | break; |
michael@0 | 240 | case UCOL_FRENCH_COLLATION: |
michael@0 | 241 | opts->frenchCollation = value; |
michael@0 | 242 | break; |
michael@0 | 243 | case UCOL_ALTERNATE_HANDLING: |
michael@0 | 244 | opts->alternateHandling = value; |
michael@0 | 245 | break; |
michael@0 | 246 | case UCOL_CASE_FIRST: |
michael@0 | 247 | opts->caseFirst = value; |
michael@0 | 248 | break; |
michael@0 | 249 | case UCOL_CASE_LEVEL: |
michael@0 | 250 | opts->caseLevel = value; |
michael@0 | 251 | break; |
michael@0 | 252 | case UCOL_NORMALIZATION_MODE: |
michael@0 | 253 | opts->normalizationMode = value; |
michael@0 | 254 | break; |
michael@0 | 255 | case UCOL_STRENGTH: |
michael@0 | 256 | opts->strength = value; |
michael@0 | 257 | break; |
michael@0 | 258 | case UCOL_NUMERIC_COLLATION: |
michael@0 | 259 | opts->numericCollation = value; |
michael@0 | 260 | break; |
michael@0 | 261 | case UCOL_ATTRIBUTE_COUNT: |
michael@0 | 262 | default: |
michael@0 | 263 | break; |
michael@0 | 264 | } |
michael@0 | 265 | } |
michael@0 | 266 | |
michael@0 | 267 | #define UTOK_OPTION_COUNT 22 |
michael@0 | 268 | |
michael@0 | 269 | static UBool didInit = FALSE; |
michael@0 | 270 | /* we can be strict, or we can be lenient */ |
michael@0 | 271 | /* I'd surely be lenient with the option arguments */ |
michael@0 | 272 | /* maybe even with options */ |
michael@0 | 273 | U_STRING_DECL(suboption_00, "non-ignorable", 13); |
michael@0 | 274 | U_STRING_DECL(suboption_01, "shifted", 7); |
michael@0 | 275 | |
michael@0 | 276 | U_STRING_DECL(suboption_02, "lower", 5); |
michael@0 | 277 | U_STRING_DECL(suboption_03, "upper", 5); |
michael@0 | 278 | U_STRING_DECL(suboption_04, "off", 3); |
michael@0 | 279 | U_STRING_DECL(suboption_05, "on", 2); |
michael@0 | 280 | U_STRING_DECL(suboption_06, "1", 1); |
michael@0 | 281 | U_STRING_DECL(suboption_07, "2", 1); |
michael@0 | 282 | U_STRING_DECL(suboption_08, "3", 1); |
michael@0 | 283 | U_STRING_DECL(suboption_09, "4", 1); |
michael@0 | 284 | U_STRING_DECL(suboption_10, "I", 1); |
michael@0 | 285 | |
michael@0 | 286 | U_STRING_DECL(suboption_11, "primary", 7); |
michael@0 | 287 | U_STRING_DECL(suboption_12, "secondary", 9); |
michael@0 | 288 | U_STRING_DECL(suboption_13, "tertiary", 8); |
michael@0 | 289 | U_STRING_DECL(suboption_14, "variable", 8); |
michael@0 | 290 | U_STRING_DECL(suboption_15, "regular", 7); |
michael@0 | 291 | U_STRING_DECL(suboption_16, "implicit", 8); |
michael@0 | 292 | U_STRING_DECL(suboption_17, "trailing", 8); |
michael@0 | 293 | |
michael@0 | 294 | |
michael@0 | 295 | U_STRING_DECL(option_00, "undefined", 9); |
michael@0 | 296 | U_STRING_DECL(option_01, "rearrange", 9); |
michael@0 | 297 | U_STRING_DECL(option_02, "alternate", 9); |
michael@0 | 298 | U_STRING_DECL(option_03, "backwards", 9); |
michael@0 | 299 | U_STRING_DECL(option_04, "variable top", 12); |
michael@0 | 300 | U_STRING_DECL(option_05, "top", 3); |
michael@0 | 301 | U_STRING_DECL(option_06, "normalization", 13); |
michael@0 | 302 | U_STRING_DECL(option_07, "caseLevel", 9); |
michael@0 | 303 | U_STRING_DECL(option_08, "caseFirst", 9); |
michael@0 | 304 | U_STRING_DECL(option_09, "scriptOrder", 11); |
michael@0 | 305 | U_STRING_DECL(option_10, "charsetname", 11); |
michael@0 | 306 | U_STRING_DECL(option_11, "charset", 7); |
michael@0 | 307 | U_STRING_DECL(option_12, "before", 6); |
michael@0 | 308 | U_STRING_DECL(option_13, "hiraganaQ", 9); |
michael@0 | 309 | U_STRING_DECL(option_14, "strength", 8); |
michael@0 | 310 | U_STRING_DECL(option_15, "first", 5); |
michael@0 | 311 | U_STRING_DECL(option_16, "last", 4); |
michael@0 | 312 | U_STRING_DECL(option_17, "optimize", 8); |
michael@0 | 313 | U_STRING_DECL(option_18, "suppressContractions", 20); |
michael@0 | 314 | U_STRING_DECL(option_19, "numericOrdering", 15); |
michael@0 | 315 | U_STRING_DECL(option_20, "import", 6); |
michael@0 | 316 | U_STRING_DECL(option_21, "reorder", 7); |
michael@0 | 317 | |
michael@0 | 318 | /* |
michael@0 | 319 | [last variable] last variable value |
michael@0 | 320 | [last primary ignorable] largest CE for primary ignorable |
michael@0 | 321 | [last secondary ignorable] largest CE for secondary ignorable |
michael@0 | 322 | [last tertiary ignorable] largest CE for tertiary ignorable |
michael@0 | 323 | [top] guaranteed to be above all implicit CEs, for now and in the future (in 1.8) |
michael@0 | 324 | */ |
michael@0 | 325 | |
michael@0 | 326 | |
michael@0 | 327 | static const ucolTokSuboption alternateSub[2] = { |
michael@0 | 328 | {suboption_00, 13, UCOL_NON_IGNORABLE}, |
michael@0 | 329 | {suboption_01, 7, UCOL_SHIFTED} |
michael@0 | 330 | }; |
michael@0 | 331 | |
michael@0 | 332 | static const ucolTokSuboption caseFirstSub[3] = { |
michael@0 | 333 | {suboption_02, 5, UCOL_LOWER_FIRST}, |
michael@0 | 334 | {suboption_03, 5, UCOL_UPPER_FIRST}, |
michael@0 | 335 | {suboption_04, 3, UCOL_OFF}, |
michael@0 | 336 | }; |
michael@0 | 337 | |
michael@0 | 338 | static const ucolTokSuboption onOffSub[2] = { |
michael@0 | 339 | {suboption_04, 3, UCOL_OFF}, |
michael@0 | 340 | {suboption_05, 2, UCOL_ON} |
michael@0 | 341 | }; |
michael@0 | 342 | |
michael@0 | 343 | static const ucolTokSuboption frenchSub[1] = { |
michael@0 | 344 | {suboption_07, 1, UCOL_ON} |
michael@0 | 345 | }; |
michael@0 | 346 | |
michael@0 | 347 | static const ucolTokSuboption beforeSub[3] = { |
michael@0 | 348 | {suboption_06, 1, UCOL_PRIMARY}, |
michael@0 | 349 | {suboption_07, 1, UCOL_SECONDARY}, |
michael@0 | 350 | {suboption_08, 1, UCOL_TERTIARY} |
michael@0 | 351 | }; |
michael@0 | 352 | |
michael@0 | 353 | static const ucolTokSuboption strengthSub[5] = { |
michael@0 | 354 | {suboption_06, 1, UCOL_PRIMARY}, |
michael@0 | 355 | {suboption_07, 1, UCOL_SECONDARY}, |
michael@0 | 356 | {suboption_08, 1, UCOL_TERTIARY}, |
michael@0 | 357 | {suboption_09, 1, UCOL_QUATERNARY}, |
michael@0 | 358 | {suboption_10, 1, UCOL_IDENTICAL}, |
michael@0 | 359 | }; |
michael@0 | 360 | |
michael@0 | 361 | static const ucolTokSuboption firstLastSub[7] = { |
michael@0 | 362 | {suboption_11, 7, UCOL_PRIMARY}, |
michael@0 | 363 | {suboption_12, 9, UCOL_PRIMARY}, |
michael@0 | 364 | {suboption_13, 8, UCOL_PRIMARY}, |
michael@0 | 365 | {suboption_14, 8, UCOL_PRIMARY}, |
michael@0 | 366 | {suboption_15, 7, UCOL_PRIMARY}, |
michael@0 | 367 | {suboption_16, 8, UCOL_PRIMARY}, |
michael@0 | 368 | {suboption_17, 8, UCOL_PRIMARY}, |
michael@0 | 369 | }; |
michael@0 | 370 | |
/*
 * Indices into rulesOptions; the enumerator order must match the order of
 * the rows in that table.  OPTION_NORMAL_OPTIONS_LIMIT aliases the last of
 * the leading options, which are the rows that carry a real UColAttribute.
 */
enum OptionNumber {
    /*  0 */ OPTION_ALTERNATE_HANDLING = 0,
    /*  1 */ OPTION_FRENCH_COLLATION,
    /*  2 */ OPTION_CASE_LEVEL,
    /*  3 */ OPTION_CASE_FIRST,
    /*  4 */ OPTION_NORMALIZATION_MODE,
    /*  5 */ OPTION_HIRAGANA_QUATERNARY,
    /*  6 */ OPTION_STRENGTH,
    /*  7 */ OPTION_NUMERIC_COLLATION,
    /*  7 */ OPTION_NORMAL_OPTIONS_LIMIT = OPTION_NUMERIC_COLLATION,
    /*  8 */ OPTION_VARIABLE_TOP,
    /*  9 */ OPTION_REARRANGE,
    /* 10 */ OPTION_BEFORE,
    /* 11 */ OPTION_TOP,
    /* 12 */ OPTION_FIRST,
    /* 13 */ OPTION_LAST,
    /* 14 */ OPTION_OPTIMIZE,
    /* 15 */ OPTION_SUPPRESS_CONTRACTIONS,
    /* 16 */ OPTION_UNDEFINED,
    /* 17 */ OPTION_SCRIPT_ORDER,
    /* 18 */ OPTION_CHARSET_NAME,
    /* 19 */ OPTION_CHARSET,
    /* 20 */ OPTION_IMPORT,
    /* 21 */ OPTION_SCRIPTREORDER
} ;
michael@0 | 396 | |
michael@0 | 397 | static const ucolTokOption rulesOptions[UTOK_OPTION_COUNT] = { |
michael@0 | 398 | /*00*/ {option_02, 9, alternateSub, 2, UCOL_ALTERNATE_HANDLING}, /*"alternate" */ |
michael@0 | 399 | /*01*/ {option_03, 9, frenchSub, 1, UCOL_FRENCH_COLLATION}, /*"backwards" */ |
michael@0 | 400 | /*02*/ {option_07, 9, onOffSub, 2, UCOL_CASE_LEVEL}, /*"caseLevel" */ |
michael@0 | 401 | /*03*/ {option_08, 9, caseFirstSub, 3, UCOL_CASE_FIRST}, /*"caseFirst" */ |
michael@0 | 402 | /*04*/ {option_06, 13, onOffSub, 2, UCOL_NORMALIZATION_MODE}, /*"normalization" */ |
michael@0 | 403 | /*05*/ {option_13, 9, onOffSub, 2, UCOL_HIRAGANA_QUATERNARY_MODE}, /*"hiraganaQ" */ |
michael@0 | 404 | /*06*/ {option_14, 8, strengthSub, 5, UCOL_STRENGTH}, /*"strength" */ |
michael@0 | 405 | /*07*/ {option_19, 15, onOffSub, 2, UCOL_NUMERIC_COLLATION}, /*"numericOrdering"*/ |
michael@0 | 406 | /*08*/ {option_04, 12, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"variable top" */ |
michael@0 | 407 | /*09*/ {option_01, 9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"rearrange" */ |
michael@0 | 408 | /*10*/ {option_12, 6, beforeSub, 3, UCOL_ATTRIBUTE_COUNT}, /*"before" */ |
michael@0 | 409 | /*11*/ {option_05, 3, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"top" */ |
michael@0 | 410 | /*12*/ {option_15, 5, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"first" */ |
michael@0 | 411 | /*13*/ {option_16, 4, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"last" */ |
michael@0 | 412 | /*14*/ {option_17, 8, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"optimize" */ |
michael@0 | 413 | /*15*/ {option_18, 20, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"suppressContractions" */ |
michael@0 | 414 | /*16*/ {option_00, 9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"undefined" */ |
michael@0 | 415 | /*17*/ {option_09, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"scriptOrder" */ |
michael@0 | 416 | /*18*/ {option_10, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"charsetname" */ |
michael@0 | 417 | /*19*/ {option_11, 7, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"charset" */ |
michael@0 | 418 | /*20*/ {option_20, 6, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"import" */ |
michael@0 | 419 | /*21*/ {option_21, 7, NULL, 0, UCOL_ATTRIBUTE_COUNT} /*"reorder" */ |
michael@0 | 420 | }; |
michael@0 | 421 | |
michael@0 | 422 | static |
michael@0 | 423 | int32_t u_strncmpNoCase(const UChar *s1, |
michael@0 | 424 | const UChar *s2, |
michael@0 | 425 | int32_t n) |
michael@0 | 426 | { |
michael@0 | 427 | if(n > 0) { |
michael@0 | 428 | int32_t rc; |
michael@0 | 429 | for(;;) { |
michael@0 | 430 | rc = (int32_t)u_tolower(*s1) - (int32_t)u_tolower(*s2); |
michael@0 | 431 | if(rc != 0 || *s1 == 0 || --n == 0) { |
michael@0 | 432 | return rc; |
michael@0 | 433 | } |
michael@0 | 434 | ++s1; |
michael@0 | 435 | ++s2; |
michael@0 | 436 | } |
michael@0 | 437 | } |
michael@0 | 438 | return 0; |
michael@0 | 439 | } |
michael@0 | 440 | |
michael@0 | 441 | static |
michael@0 | 442 | void ucol_uprv_tok_initData() { |
michael@0 | 443 | if(!didInit) { |
michael@0 | 444 | U_STRING_INIT(suboption_00, "non-ignorable", 13); |
michael@0 | 445 | U_STRING_INIT(suboption_01, "shifted", 7); |
michael@0 | 446 | |
michael@0 | 447 | U_STRING_INIT(suboption_02, "lower", 5); |
michael@0 | 448 | U_STRING_INIT(suboption_03, "upper", 5); |
michael@0 | 449 | U_STRING_INIT(suboption_04, "off", 3); |
michael@0 | 450 | U_STRING_INIT(suboption_05, "on", 2); |
michael@0 | 451 | |
michael@0 | 452 | U_STRING_INIT(suboption_06, "1", 1); |
michael@0 | 453 | U_STRING_INIT(suboption_07, "2", 1); |
michael@0 | 454 | U_STRING_INIT(suboption_08, "3", 1); |
michael@0 | 455 | U_STRING_INIT(suboption_09, "4", 1); |
michael@0 | 456 | U_STRING_INIT(suboption_10, "I", 1); |
michael@0 | 457 | |
michael@0 | 458 | U_STRING_INIT(suboption_11, "primary", 7); |
michael@0 | 459 | U_STRING_INIT(suboption_12, "secondary", 9); |
michael@0 | 460 | U_STRING_INIT(suboption_13, "tertiary", 8); |
michael@0 | 461 | U_STRING_INIT(suboption_14, "variable", 8); |
michael@0 | 462 | U_STRING_INIT(suboption_15, "regular", 7); |
michael@0 | 463 | U_STRING_INIT(suboption_16, "implicit", 8); |
michael@0 | 464 | U_STRING_INIT(suboption_17, "trailing", 8); |
michael@0 | 465 | |
michael@0 | 466 | |
michael@0 | 467 | U_STRING_INIT(option_00, "undefined", 9); |
michael@0 | 468 | U_STRING_INIT(option_01, "rearrange", 9); |
michael@0 | 469 | U_STRING_INIT(option_02, "alternate", 9); |
michael@0 | 470 | U_STRING_INIT(option_03, "backwards", 9); |
michael@0 | 471 | U_STRING_INIT(option_04, "variable top", 12); |
michael@0 | 472 | U_STRING_INIT(option_05, "top", 3); |
michael@0 | 473 | U_STRING_INIT(option_06, "normalization", 13); |
michael@0 | 474 | U_STRING_INIT(option_07, "caseLevel", 9); |
michael@0 | 475 | U_STRING_INIT(option_08, "caseFirst", 9); |
michael@0 | 476 | U_STRING_INIT(option_09, "scriptOrder", 11); |
michael@0 | 477 | U_STRING_INIT(option_10, "charsetname", 11); |
michael@0 | 478 | U_STRING_INIT(option_11, "charset", 7); |
michael@0 | 479 | U_STRING_INIT(option_12, "before", 6); |
michael@0 | 480 | U_STRING_INIT(option_13, "hiraganaQ", 9); |
michael@0 | 481 | U_STRING_INIT(option_14, "strength", 8); |
michael@0 | 482 | U_STRING_INIT(option_15, "first", 5); |
michael@0 | 483 | U_STRING_INIT(option_16, "last", 4); |
michael@0 | 484 | U_STRING_INIT(option_17, "optimize", 8); |
michael@0 | 485 | U_STRING_INIT(option_18, "suppressContractions", 20); |
michael@0 | 486 | U_STRING_INIT(option_19, "numericOrdering", 15); |
michael@0 | 487 | U_STRING_INIT(option_20, "import ", 6); |
michael@0 | 488 | U_STRING_INIT(option_21, "reorder", 7); |
michael@0 | 489 | didInit = TRUE; |
michael@0 | 490 | } |
michael@0 | 491 | } |
michael@0 | 492 | |
michael@0 | 493 | |
michael@0 | 494 | // This function reads basic options to set in the runtime collator |
michael@0 | 495 | // used by data driven tests. Should not support build time options |
michael@0 | 496 | U_CAPI const UChar * U_EXPORT2 |
michael@0 | 497 | ucol_tok_getNextArgument(const UChar *start, const UChar *end, |
michael@0 | 498 | UColAttribute *attrib, UColAttributeValue *value, |
michael@0 | 499 | UErrorCode *status) |
michael@0 | 500 | { |
michael@0 | 501 | uint32_t i = 0; |
michael@0 | 502 | int32_t j=0; |
michael@0 | 503 | UBool foundOption = FALSE; |
michael@0 | 504 | const UChar *optionArg = NULL; |
michael@0 | 505 | |
michael@0 | 506 | ucol_uprv_tok_initData(); |
michael@0 | 507 | |
michael@0 | 508 | while(start < end && PatternProps::isWhiteSpace(*start)) { /* eat whitespace */ |
michael@0 | 509 | start++; |
michael@0 | 510 | } |
michael@0 | 511 | if(start >= end) { |
michael@0 | 512 | return NULL; |
michael@0 | 513 | } |
michael@0 | 514 | /* skip opening '[' */ |
michael@0 | 515 | if(*start == 0x005b) { |
michael@0 | 516 | start++; |
michael@0 | 517 | } else { |
michael@0 | 518 | *status = U_ILLEGAL_ARGUMENT_ERROR; // no opening '[' |
michael@0 | 519 | return NULL; |
michael@0 | 520 | } |
michael@0 | 521 | |
michael@0 | 522 | while(i < UTOK_OPTION_COUNT) { |
michael@0 | 523 | if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].optionLen) == 0) { |
michael@0 | 524 | foundOption = TRUE; |
michael@0 | 525 | if(end - start > rulesOptions[i].optionLen) { |
michael@0 | 526 | optionArg = start+rulesOptions[i].optionLen+1; /* start of the options, skip space */ |
michael@0 | 527 | while(PatternProps::isWhiteSpace(*optionArg)) { /* eat whitespace */ |
michael@0 | 528 | optionArg++; |
michael@0 | 529 | } |
michael@0 | 530 | } |
michael@0 | 531 | break; |
michael@0 | 532 | } |
michael@0 | 533 | i++; |
michael@0 | 534 | } |
michael@0 | 535 | |
michael@0 | 536 | if(!foundOption) { |
michael@0 | 537 | *status = U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 538 | return NULL; |
michael@0 | 539 | } |
michael@0 | 540 | |
michael@0 | 541 | if(optionArg) { |
michael@0 | 542 | for(j = 0; j<rulesOptions[i].subSize; j++) { |
michael@0 | 543 | if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) { |
michael@0 | 544 | //ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr, rulesOptions[i].subopts[j].attrVal); |
michael@0 | 545 | *attrib = rulesOptions[i].attr; |
michael@0 | 546 | *value = rulesOptions[i].subopts[j].attrVal; |
michael@0 | 547 | optionArg += rulesOptions[i].subopts[j].subLen; |
michael@0 | 548 | while(PatternProps::isWhiteSpace(*optionArg)) { /* eat whitespace */ |
michael@0 | 549 | optionArg++; |
michael@0 | 550 | } |
michael@0 | 551 | if(*optionArg == 0x005d) { |
michael@0 | 552 | optionArg++; |
michael@0 | 553 | return optionArg; |
michael@0 | 554 | } else { |
michael@0 | 555 | *status = U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 556 | return NULL; |
michael@0 | 557 | } |
michael@0 | 558 | } |
michael@0 | 559 | } |
michael@0 | 560 | } |
michael@0 | 561 | *status = U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 562 | return NULL; |
michael@0 | 563 | } |
michael@0 | 564 | |
michael@0 | 565 | static |
michael@0 | 566 | USet *ucol_uprv_tok_readAndSetUnicodeSet(const UChar *start, const UChar *end, UErrorCode *status) { |
michael@0 | 567 | while(*start != 0x005b) { /* advance while we find the first '[' */ |
michael@0 | 568 | start++; |
michael@0 | 569 | } |
michael@0 | 570 | // now we need to get a balanced set of '[]'. The problem is that a set can have |
michael@0 | 571 | // many, and *end point to the first closing '[' |
michael@0 | 572 | int32_t noOpenBraces = 1; |
michael@0 | 573 | int32_t current = 1; // skip the opening brace |
michael@0 | 574 | while(start+current < end && noOpenBraces != 0) { |
michael@0 | 575 | if(start[current] == 0x005b) { |
michael@0 | 576 | noOpenBraces++; |
michael@0 | 577 | } else if(start[current] == 0x005D) { // closing brace |
michael@0 | 578 | noOpenBraces--; |
michael@0 | 579 | } |
michael@0 | 580 | current++; |
michael@0 | 581 | } |
michael@0 | 582 | |
michael@0 | 583 | if(noOpenBraces != 0 || u_strchr(start+current, 0x005d /*']'*/) == NULL) { |
michael@0 | 584 | *status = U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 585 | return NULL; |
michael@0 | 586 | } |
michael@0 | 587 | return uset_openPattern(start, current, status); |
michael@0 | 588 | } |
michael@0 | 589 | |
michael@0 | 590 | /** |
michael@0 | 591 | * Reads an option and matches the option name with the predefined options. (Case-insensitive.) |
michael@0 | 592 | * @param start Pointer to the start UChar. |
michael@0 | 593 | * @param end Pointer to the last valid pointer beyond which the option will not extend. |
michael@0 | 594 | * @param optionArg Address of the pointer at which the options start (after the option name) |
michael@0 | 595 | * @return The index of the option, or -1 if the option is not valid. |
michael@0 | 596 | */ |
michael@0 | 597 | static |
michael@0 | 598 | int32_t ucol_uprv_tok_readOption(const UChar *start, const UChar *end, const UChar **optionArg) { |
michael@0 | 599 | int32_t i = 0; |
michael@0 | 600 | ucol_uprv_tok_initData(); |
michael@0 | 601 | |
michael@0 | 602 | while(PatternProps::isWhiteSpace(*start)) { /* eat whitespace */ |
michael@0 | 603 | start++; |
michael@0 | 604 | } |
michael@0 | 605 | while(i < UTOK_OPTION_COUNT) { |
michael@0 | 606 | if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].optionLen) == 0) { |
michael@0 | 607 | if(end - start > rulesOptions[i].optionLen) { |
michael@0 | 608 | *optionArg = start+rulesOptions[i].optionLen; /* End of option name; start of the options */ |
michael@0 | 609 | while(PatternProps::isWhiteSpace(**optionArg)) { /* eat whitespace */ |
michael@0 | 610 | (*optionArg)++; |
michael@0 | 611 | } |
michael@0 | 612 | } |
michael@0 | 613 | break; |
michael@0 | 614 | } |
michael@0 | 615 | i++; |
michael@0 | 616 | } |
michael@0 | 617 | if(i == UTOK_OPTION_COUNT) { |
michael@0 | 618 | i = -1; // didn't find an option |
michael@0 | 619 | } |
michael@0 | 620 | return i; |
michael@0 | 621 | } |
michael@0 | 622 | |
michael@0 | 623 | |
michael@0 | 624 | static |
michael@0 | 625 | void ucol_tok_parseScriptReorder(UColTokenParser *src, UErrorCode *status) { |
michael@0 | 626 | int32_t codeCount = 0; |
michael@0 | 627 | int32_t codeIndex = 0; |
michael@0 | 628 | char conversion[64]; |
michael@0 | 629 | int32_t tokenLength = 0; |
michael@0 | 630 | const UChar* space; |
michael@0 | 631 | |
michael@0 | 632 | const UChar* current = src->current; |
michael@0 | 633 | const UChar* end = u_memchr(src->current, 0x005d, src->end - src->current); |
michael@0 | 634 | |
michael@0 | 635 | // eat leading whitespace |
michael@0 | 636 | while(current < end && u_isWhitespace(*current)) { |
michael@0 | 637 | current++; |
michael@0 | 638 | } |
michael@0 | 639 | |
michael@0 | 640 | while(current < end) { |
michael@0 | 641 | space = u_memchr(current, 0x0020, end - current); |
michael@0 | 642 | space = space == 0 ? end : space; |
michael@0 | 643 | tokenLength = space - current; |
michael@0 | 644 | if (tokenLength < 4) { |
michael@0 | 645 | *status = U_INVALID_FORMAT_ERROR; |
michael@0 | 646 | return; |
michael@0 | 647 | } |
michael@0 | 648 | codeCount++; |
michael@0 | 649 | current += tokenLength; |
michael@0 | 650 | while(current < end && u_isWhitespace(*current)) { /* eat whitespace */ |
michael@0 | 651 | ++current; |
michael@0 | 652 | } |
michael@0 | 653 | } |
michael@0 | 654 | |
michael@0 | 655 | if (codeCount == 0) { |
michael@0 | 656 | *status = U_INVALID_FORMAT_ERROR; |
michael@0 | 657 | } |
michael@0 | 658 | |
michael@0 | 659 | src->reorderCodesLength = codeCount; |
michael@0 | 660 | src->reorderCodes = (int32_t*)uprv_malloc(codeCount * sizeof(int32_t)); |
michael@0 | 661 | current = src->current; |
michael@0 | 662 | |
michael@0 | 663 | // eat leading whitespace |
michael@0 | 664 | while(current < end && u_isWhitespace(*current)) { |
michael@0 | 665 | current++; |
michael@0 | 666 | } |
michael@0 | 667 | |
michael@0 | 668 | while(current < end) { |
michael@0 | 669 | space = u_memchr(current, 0x0020, end - current); |
michael@0 | 670 | space = space == 0 ? end : space; |
michael@0 | 671 | tokenLength = space - current; |
michael@0 | 672 | if (tokenLength < 4) { |
michael@0 | 673 | *status = U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 674 | return; |
michael@0 | 675 | } else { |
michael@0 | 676 | u_UCharsToChars(current, conversion, tokenLength); |
michael@0 | 677 | conversion[tokenLength] = '\0'; |
michael@0 | 678 | src->reorderCodes[codeIndex] = ucol_findReorderingEntry(conversion); |
michael@0 | 679 | if (src->reorderCodes[codeIndex] == USCRIPT_INVALID_CODE) { |
michael@0 | 680 | src->reorderCodes[codeIndex] = u_getPropertyValueEnum(UCHAR_SCRIPT, conversion); |
michael@0 | 681 | } |
michael@0 | 682 | if (src->reorderCodes[codeIndex] == USCRIPT_INVALID_CODE) { |
michael@0 | 683 | *status = U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 684 | } |
michael@0 | 685 | } |
michael@0 | 686 | codeIndex++; |
michael@0 | 687 | current += tokenLength; |
michael@0 | 688 | while(current < end && u_isWhitespace(*current)) { /* eat whitespace */ |
michael@0 | 689 | ++current; |
michael@0 | 690 | } |
michael@0 | 691 | } |
michael@0 | 692 | } |
michael@0 | 693 | |
michael@0 | 694 | // reads and conforms to various options in rules |
michael@0 | 695 | // end is the position of the first closing ']' |
michael@0 | 696 | // However, some of the options take an UnicodeSet definition |
michael@0 | 697 | // which needs to duplicate the closing ']' |
michael@0 | 698 | // for example: '[copy [\uAC00-\uD7FF]]' |
michael@0 | 699 | // These options will move end to the second ']' and the |
michael@0 | 700 | // caller will set the current to it. |
michael@0 | 701 | static |
michael@0 | 702 | uint8_t ucol_uprv_tok_readAndSetOption(UColTokenParser *src, UErrorCode *status) { |
michael@0 | 703 | const UChar* start = src->current; |
michael@0 | 704 | int32_t i = 0; |
michael@0 | 705 | int32_t j=0; |
michael@0 | 706 | const UChar *optionArg = NULL; |
michael@0 | 707 | |
michael@0 | 708 | uint8_t result = 0; |
michael@0 | 709 | |
michael@0 | 710 | start++; /*skip opening '['*/ |
michael@0 | 711 | i = ucol_uprv_tok_readOption(start, src->end, &optionArg); |
michael@0 | 712 | if(optionArg) { |
michael@0 | 713 | src->current = optionArg; |
michael@0 | 714 | } |
michael@0 | 715 | |
michael@0 | 716 | if(i < 0) { |
michael@0 | 717 | *status = U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 718 | } else { |
michael@0 | 719 | int32_t noOpenBraces = 1; |
michael@0 | 720 | switch(i) { |
michael@0 | 721 | case OPTION_ALTERNATE_HANDLING: |
michael@0 | 722 | case OPTION_FRENCH_COLLATION: |
michael@0 | 723 | case OPTION_CASE_LEVEL: |
michael@0 | 724 | case OPTION_CASE_FIRST: |
michael@0 | 725 | case OPTION_NORMALIZATION_MODE: |
michael@0 | 726 | case OPTION_HIRAGANA_QUATERNARY: |
michael@0 | 727 | case OPTION_STRENGTH: |
michael@0 | 728 | case OPTION_NUMERIC_COLLATION: |
michael@0 | 729 | if(optionArg) { |
michael@0 | 730 | for(j = 0; j<rulesOptions[i].subSize; j++) { |
michael@0 | 731 | if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) { |
michael@0 | 732 | ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr, rulesOptions[i].subopts[j].attrVal); |
michael@0 | 733 | result = UCOL_TOK_SUCCESS; |
michael@0 | 734 | } |
michael@0 | 735 | } |
michael@0 | 736 | } |
michael@0 | 737 | if(result == 0) { |
michael@0 | 738 | *status = U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 739 | } |
michael@0 | 740 | break; |
michael@0 | 741 | case OPTION_VARIABLE_TOP: |
michael@0 | 742 | result = UCOL_TOK_SUCCESS | UCOL_TOK_VARIABLE_TOP; |
michael@0 | 743 | break; |
michael@0 | 744 | case OPTION_REARRANGE: |
michael@0 | 745 | result = UCOL_TOK_SUCCESS; |
michael@0 | 746 | break; |
michael@0 | 747 | case OPTION_BEFORE: |
michael@0 | 748 | if(optionArg) { |
michael@0 | 749 | for(j = 0; j<rulesOptions[i].subSize; j++) { |
michael@0 | 750 | if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) { |
michael@0 | 751 | result = UCOL_TOK_SUCCESS | (rulesOptions[i].subopts[j].attrVal + 1); |
michael@0 | 752 | } |
michael@0 | 753 | } |
michael@0 | 754 | } |
michael@0 | 755 | if(result == 0) { |
michael@0 | 756 | *status = U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 757 | } |
michael@0 | 758 | break; |
michael@0 | 759 | case OPTION_TOP: /* we are going to have an array with structures of limit CEs */ |
michael@0 | 760 | /* index to this array will be src->parsedToken.indirectIndex*/ |
michael@0 | 761 | src->parsedToken.indirectIndex = 0; |
michael@0 | 762 | result = UCOL_TOK_SUCCESS | UCOL_TOK_TOP; |
michael@0 | 763 | break; |
michael@0 | 764 | case OPTION_FIRST: |
michael@0 | 765 | case OPTION_LAST: /* first, last */ |
michael@0 | 766 | for(j = 0; j<rulesOptions[i].subSize; j++) { |
michael@0 | 767 | if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) { |
michael@0 | 768 | // the calculation below assumes that OPTION_FIRST and OPTION_LAST are at i and i+1 and that the first |
michael@0 | 769 | // element of indirect boundaries is reserved for top. |
michael@0 | 770 | src->parsedToken.indirectIndex = (uint16_t)(i-OPTION_FIRST+1+j*2); |
michael@0 | 771 | result = UCOL_TOK_SUCCESS | UCOL_TOK_TOP;; |
michael@0 | 772 | } |
michael@0 | 773 | } |
michael@0 | 774 | if(result == 0) { |
michael@0 | 775 | *status = U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 776 | } |
michael@0 | 777 | break; |
michael@0 | 778 | case OPTION_OPTIMIZE: |
michael@0 | 779 | case OPTION_SUPPRESS_CONTRACTIONS: // copy and remove are handled before normalization |
michael@0 | 780 | // we need to move end here |
michael@0 | 781 | src->current++; // skip opening brace |
michael@0 | 782 | while(src->current < src->end && noOpenBraces != 0) { |
michael@0 | 783 | if(*src->current == 0x005b) { |
michael@0 | 784 | noOpenBraces++; |
michael@0 | 785 | } else if(*src->current == 0x005D) { // closing brace |
michael@0 | 786 | noOpenBraces--; |
michael@0 | 787 | } |
michael@0 | 788 | src->current++; |
michael@0 | 789 | } |
michael@0 | 790 | result = UCOL_TOK_SUCCESS; |
michael@0 | 791 | break; |
michael@0 | 792 | case OPTION_SCRIPTREORDER: |
michael@0 | 793 | ucol_tok_parseScriptReorder(src, status); |
michael@0 | 794 | break; |
michael@0 | 795 | default: |
michael@0 | 796 | *status = U_UNSUPPORTED_ERROR; |
michael@0 | 797 | break; |
michael@0 | 798 | } |
michael@0 | 799 | } |
michael@0 | 800 | src->current = u_memchr(src->current, 0x005d, (int32_t)(src->end-src->current)); |
michael@0 | 801 | return result; |
michael@0 | 802 | } |
michael@0 | 803 | |
michael@0 | 804 | |
michael@0 | 805 | /* Appends len UChars from stuff at src->extraCurrent. When they would not fit below src->extraEnd, the |
michael@0 | 806 | * src->source buffer is reallocated first and all parser pointers into it are rebased. Does nothing for |
michael@0 | 807 | * NULL or empty input; sets *status to U_MEMORY_ALLOCATION_ERROR if the reallocation fails. */ |
michael@0 | 808 | inline void ucol_tok_addToExtraCurrent(UColTokenParser *src, const UChar *stuff, int32_t len, UErrorCode *status) { |
michael@0 | 809 | if (stuff == NULL || len <= 0) { |
michael@0 | 810 | return; |
michael@0 | 811 | } |
michael@0 | 812 | UnicodeString tempStuff(FALSE, stuff, len); |
michael@0 | 813 | if(src->extraCurrent+len >= src->extraEnd) { |
michael@0 | 814 | if (stuff >= src->source && stuff <= src->end) { |
michael@0 | 815 | // stuff points into the buffer that may move: writing a char makes the copy-on-write tempStuff own a copy. |
michael@0 | 816 | tempStuff.setCharAt(0, tempStuff[0]); |
michael@0 | 817 | } |
michael@0 | 818 | // Double the capacity, and grow further if even that would not leave room for len more units. |
michael@0 | 819 | int32_t newCapacity = (int32_t)(src->extraEnd-src->source)*2; |
michael@0 | 820 | int32_t needed = (int32_t)(src->extraCurrent-src->source) + len + 1; |
michael@0 | 821 | if(newCapacity < needed) { newCapacity = needed; } |
michael@0 | 822 | UChar *newSrc = (UChar *)uprv_realloc(src->source, newCapacity*sizeof(UChar)); |
michael@0 | 823 | if(newSrc == NULL) { |
michael@0 | 824 | *status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 825 | return; |
michael@0 | 826 | } |
michael@0 | 827 | src->current = newSrc + (src->current - src->source); |
michael@0 | 828 | src->extraCurrent = newSrc + (src->extraCurrent - src->source); |
michael@0 | 829 | src->end = newSrc + (src->end - src->source); |
michael@0 | 830 | src->extraEnd = newSrc + newCapacity; |
michael@0 | 831 | src->sourceCurrent = newSrc + (src->sourceCurrent-src->source); |
michael@0 | 832 | src->source = newSrc; |
michael@0 | 833 | } |
michael@0 | 834 | if(len == 1) { |
michael@0 | 835 | *src->extraCurrent++ = tempStuff[0]; |
michael@0 | 836 | } else { |
michael@0 | 837 | u_memcpy(src->extraCurrent, tempStuff.getBuffer(), len); |
michael@0 | 838 | src->extraCurrent += len; |
michael@0 | 839 | } |
michael@0 | 840 | } |
michael@0 | 841 | |
michael@0 | 842 | /* Emits the marker token for the indirect boundary src->parsedToken.indirectIndex: 0xFFFE, then the |
michael@0 | 843 | * start CE split as (>> 16, & 0xFFFF), then the continuation CE split the same way if it is non-zero. |
michael@0 | 844 | * The units go to the extra buffer and become the parsed token's chars. Always returns TRUE. */ |
michael@0 | 845 | inline UBool ucol_tok_doSetTop(UColTokenParser *src, UErrorCode *status) { |
michael@0 | 846 | UChar units[5]; |
michael@0 | 847 | int32_t unitCount = 3; |
michael@0 | 848 | src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source); |
michael@0 | 849 | units[0] = 0xFFFE; |
michael@0 | 850 | units[1] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE >> 16); |
michael@0 | 851 | units[2] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE & 0xFFFF); |
michael@0 | 852 | if(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE != 0) { |
michael@0 | 853 | /* a continuation CE is present: emit its two parts as well */ |
michael@0 | 854 | units[3] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE >> 16); |
michael@0 | 855 | units[4] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE & 0xFFFF); |
michael@0 | 856 | unitCount = 5; |
michael@0 | 857 | } |
michael@0 | 858 | src->parsedToken.charsLen = unitCount; |
michael@0 | 859 | ucol_tok_addToExtraCurrent(src, units, unitCount, status); |
michael@0 | 860 | return TRUE; |
michael@0 | 861 | } |
michael@0 | 862 | |
michael@0 | 863 | /* Returns TRUE when c is a line terminator (LF, FF, CR, NEL, LS or PS), FALSE otherwise. */ |
michael@0 | 864 | static UBool isCharNewLine(UChar c){ |
michael@0 | 865 | /* ASCII line breaks: LF, FF, CR */ |
michael@0 | 866 | if(c == 0x000A || c == 0x000C || c == 0x000D) { |
michael@0 | 867 | return TRUE; |
michael@0 | 868 | } |
michael@0 | 869 | /* NEL */ |
michael@0 | 870 | if(c == 0x0085) { |
michael@0 | 871 | return TRUE; |
michael@0 | 872 | } |
michael@0 | 873 | /* LS or PS; anything else is not a line terminator */ |
michael@0 | 874 | return (UBool)(c == 0x2028 || c == 0x2029); |
michael@0 | 875 | } |
michael@0 | 876 | |
michael@0 | 877 | /* |
michael@0 | 878 | * Emits the next code point of the range being expanded as a parsed token. |
michael@0 | 879 | * The caller must have set up, before the first call for a range: |
michael@0 | 880 | * src->currentRangeCp: the code point to emit now. |
michael@0 | 881 | * src->lastRangeCp: the final code point of the range. |
michael@0 | 882 | * with src->currentRangeCp <= src->lastRangeCp. Each call advances src->currentRangeCp by one; |
michael@0 | 883 | * once it passes src->lastRangeCp, src->inRange is cleared. Returns src->current. |
michael@0 | 884 | */ |
michael@0 | 885 | static const UChar* |
michael@0 | 886 | ucol_tok_processNextCodePointInRange(UColTokenParser *src, |
michael@0 | 887 | UErrorCode *status) |
michael@0 | 888 | { |
michael@0 | 889 | UChar units[U16_MAX_LENGTH]; |
michael@0 | 890 | uint32_t written = 0; |
michael@0 | 891 | const uint32_t unitCount = U16_LENGTH(src->currentRangeCp); |
michael@0 | 892 | |
michael@0 | 893 | // Encode the code point as UTF-16 and append it to the extra buffer, which the token refers to. |
michael@0 | 894 | U16_APPEND_UNSAFE(units, written, src->currentRangeCp); |
michael@0 | 895 | src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source); |
michael@0 | 896 | src->parsedToken.charsLen = unitCount; |
michael@0 | 897 | ucol_tok_addToExtraCurrent(src, units, unitCount, status); |
michael@0 | 898 | |
michael@0 | 899 | ++src->currentRangeCp; |
michael@0 | 900 | if (src->currentRangeCp <= src->lastRangeCp) { |
michael@0 | 901 | // More code points remain in the range: record the next one as the previous code point. |
michael@0 | 902 | src->previousCp = src->currentRangeCp; |
michael@0 | 903 | return src->current; |
michael@0 | 904 | } |
michael@0 | 905 | // The range is exhausted. |
michael@0 | 906 | src->inRange = FALSE; |
michael@0 | 907 | if (src->currentStarredCharIndex > src->lastStarredCharIndex) { |
michael@0 | 908 | src->isStarred = FALSE; // the starred list has been consumed as well |
michael@0 | 909 | } |
michael@0 | 910 | return src->current; |
michael@0 | 911 | } |
michael@0 | 912 | |
michael@0 | 913 | /* |
michael@0 | 914 | * Emits the next code point of the starred list being expanded as a parsed token. |
michael@0 | 915 | * The caller must have set up, before the first call for a list: |
michael@0 | 916 | * src->currentStarredCharIndex: index (in src->source) of the first UChar of the code point to emit. |
michael@0 | 917 | * src->lastStarredCharIndex: index of the last UChar of the list. |
michael@0 | 918 | * with src->currentStarredCharIndex <= src->lastStarredCharIndex. The token refers to the code |
michael@0 | 919 | * point in place; nothing is copied. Returns src->current. |
michael@0 | 920 | */ |
michael@0 | 921 | static const UChar* |
michael@0 | 922 | ucol_tok_processNextTokenInStarredList(UColTokenParser *src) |
michael@0 | 923 | { |
michael@0 | 924 | const int32_t firstUnit = src->currentStarredCharIndex; |
michael@0 | 925 | UChar32 codePoint; |
michael@0 | 926 | |
michael@0 | 927 | // Decode one code point; U16_NEXT moves src->currentStarredCharIndex past it. |
michael@0 | 928 | src->parsedToken.charsOffset = src->currentStarredCharIndex; |
michael@0 | 929 | U16_NEXT(src->source, src->currentStarredCharIndex, (uint32_t)(src->end - src->source), codePoint); |
michael@0 | 930 | src->parsedToken.charsLen = src->currentStarredCharIndex - firstUnit; |
michael@0 | 931 | src->previousCp = codePoint; |
michael@0 | 932 | |
michael@0 | 933 | // Past the last character of the list: leave starred mode so that normal parsing resumes. |
michael@0 | 934 | if (src->currentStarredCharIndex > src->lastStarredCharIndex) { |
michael@0 | 935 | src->isStarred = FALSE; |
michael@0 | 936 | } |
michael@0 | 937 | return src->current; |
michael@0 | 938 | } |
michael@0 | 939 | |
michael@0 | 940 | /* |
michael@0 | 941 | * Partially parses the next token, keeps the indices in src->parsedToken, and updates the counters. |
michael@0 | 942 | * |
michael@0 | 943 | * This routine parses and separates almost all tokens. The following are the syntax characters recognized. |
michael@0 | 944 | * # : Comment character |
michael@0 | 945 | * & : Reset operator |
michael@0 | 946 | * = : Equality |
michael@0 | 947 | * < : Primary collation |
michael@0 | 948 | * << : Secondary collation |
michael@0 | 949 | * <<< : Tertiary collation |
michael@0 | 950 | * ; : Secondary collation |
michael@0 | 951 | * , : Tertiary collation |
michael@0 | 952 | * / : Expansions |
michael@0 | 953 | * | : Prefix |
michael@0 | 954 | * - : Range |
michael@0 | 955 | |
michael@0 | 956 | * ! : Java Thai modifier, ignored |
michael@0 | 957 | * @ : French only |
michael@0 | 958 | |
michael@0 | 959 | * [] : Options |
michael@0 | 960 | * '' : Quotes |
michael@0 | 961 | * |
michael@0 | 962 | * Along with the operators =, <, <<, <<<, the operator * is supported to indicate a list. For example, &a<*bcdexyz |
michael@0 | 963 | * is equivalent to &a<b<c<d<e<x<y<z. In lists, ranges can also be given, so &a<*b-ex-z is equivalent to the above. |
michael@0 | 964 | * This function does not separate the tokens in a list. Instead, &a<*b-ex-z is parsed as four tokens - "&a", |
michael@0 | 965 | * "<*b", "-ex", "-z". The strength (< in this case), whether in a list, whether in a range, and the previous |
michael@0 | 966 | * character are cached so that the calling program can do further splitting. |
michael@0 | 967 | */ |
michael@0 | 968 | static const UChar* |
michael@0 | 969 | ucol_tok_parseNextTokenInternal(UColTokenParser *src, |
michael@0 | 970 | UBool startOfRules, |
michael@0 | 971 | UParseError *parseError, |
michael@0 | 972 | UErrorCode *status) |
michael@0 | 973 | { |
michael@0 | 974 | UBool variableTop = FALSE; |
michael@0 | 975 | UBool top = FALSE; |
michael@0 | 976 | UBool inChars = TRUE; |
michael@0 | 977 | UBool inQuote = FALSE; |
michael@0 | 978 | UBool wasInQuote = FALSE; |
michael@0 | 979 | uint8_t before = 0; |
michael@0 | 980 | UBool isEscaped = FALSE; |
michael@0 | 981 | |
michael@0 | 982 | // TODO: replace these variables with src->parsedToken counterparts |
michael@0 | 983 | // no need to use them anymore since we have src->parsedToken. |
michael@0 | 984 | // Ideally, token parser would be a nice class... Once, when I have |
michael@0 | 985 | // more time (around 2020 probably). |
michael@0 | 986 | uint32_t newExtensionLen = 0; |
michael@0 | 987 | uint32_t extensionOffset = 0; |
michael@0 | 988 | uint32_t newStrength = UCOL_TOK_UNSET; |
michael@0 | 989 | UChar buff[10]; |
michael@0 | 990 | |
michael@0 | 991 | src->parsedToken.charsOffset = 0; src->parsedToken.charsLen = 0; |
michael@0 | 992 | src->parsedToken.prefixOffset = 0; src->parsedToken.prefixLen = 0; |
michael@0 | 993 | src->parsedToken.indirectIndex = 0; |
michael@0 | 994 | |
michael@0 | 995 | while (src->current < src->end) { |
michael@0 | 996 | UChar ch = *(src->current); |
michael@0 | 997 | |
michael@0 | 998 | if (inQuote) { |
michael@0 | 999 | if (ch == 0x0027/*'\''*/) { |
michael@0 | 1000 | inQuote = FALSE; |
michael@0 | 1001 | } else { |
michael@0 | 1002 | if ((src->parsedToken.charsLen == 0) || inChars) { |
michael@0 | 1003 | if(src->parsedToken.charsLen == 0) { |
michael@0 | 1004 | src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source); |
michael@0 | 1005 | } |
michael@0 | 1006 | src->parsedToken.charsLen++; |
michael@0 | 1007 | } else { |
michael@0 | 1008 | if(newExtensionLen == 0) { |
michael@0 | 1009 | extensionOffset = (uint32_t)(src->extraCurrent - src->source); |
michael@0 | 1010 | } |
michael@0 | 1011 | newExtensionLen++; |
michael@0 | 1012 | } |
michael@0 | 1013 | } |
michael@0 | 1014 | }else if(isEscaped){ |
michael@0 | 1015 | isEscaped =FALSE; |
michael@0 | 1016 | if (newStrength == UCOL_TOK_UNSET) { |
michael@0 | 1017 | *status = U_INVALID_FORMAT_ERROR; |
michael@0 | 1018 | syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); |
michael@0 | 1019 | DBG_FORMAT_ERROR |
michael@0 | 1020 | return NULL; |
michael@0 | 1021 | // enabling rules to start with non-tokens a < b |
michael@0 | 1022 | // newStrength = UCOL_TOK_RESET; |
michael@0 | 1023 | } |
michael@0 | 1024 | if(ch != 0x0000 && src->current != src->end) { |
michael@0 | 1025 | if (inChars) { |
michael@0 | 1026 | if(src->parsedToken.charsLen == 0) { |
michael@0 | 1027 | src->parsedToken.charsOffset = (uint32_t)(src->current - src->source); |
michael@0 | 1028 | } |
michael@0 | 1029 | src->parsedToken.charsLen++; |
michael@0 | 1030 | } else { |
michael@0 | 1031 | if(newExtensionLen == 0) { |
michael@0 | 1032 | extensionOffset = (uint32_t)(src->current - src->source); |
michael@0 | 1033 | } |
michael@0 | 1034 | newExtensionLen++; |
michael@0 | 1035 | } |
michael@0 | 1036 | } |
michael@0 | 1037 | }else { |
michael@0 | 1038 | if(!PatternProps::isWhiteSpace(ch)) { |
michael@0 | 1039 | /* Sets the strength for this entry */ |
michael@0 | 1040 | switch (ch) { |
michael@0 | 1041 | case 0x003D/*'='*/ : |
michael@0 | 1042 | if (newStrength != UCOL_TOK_UNSET) { |
michael@0 | 1043 | goto EndOfLoop; |
michael@0 | 1044 | } |
michael@0 | 1045 | |
michael@0 | 1046 | /* if we start with strength, we'll reset to top */ |
michael@0 | 1047 | if(startOfRules == TRUE) { |
michael@0 | 1048 | src->parsedToken.indirectIndex = 5; |
michael@0 | 1049 | top = ucol_tok_doSetTop(src, status); |
michael@0 | 1050 | newStrength = UCOL_TOK_RESET; |
michael@0 | 1051 | goto EndOfLoop; |
michael@0 | 1052 | } |
michael@0 | 1053 | newStrength = UCOL_IDENTICAL; |
michael@0 | 1054 | if(*(src->current+1) == 0x002A) {/*'*'*/ |
michael@0 | 1055 | src->current++; |
michael@0 | 1056 | src->isStarred = TRUE; |
michael@0 | 1057 | } |
michael@0 | 1058 | break; |
michael@0 | 1059 | |
michael@0 | 1060 | case 0x002C/*','*/: |
michael@0 | 1061 | if (newStrength != UCOL_TOK_UNSET) { |
michael@0 | 1062 | goto EndOfLoop; |
michael@0 | 1063 | } |
michael@0 | 1064 | |
michael@0 | 1065 | /* if we start with strength, we'll reset to top */ |
michael@0 | 1066 | if(startOfRules == TRUE) { |
michael@0 | 1067 | src->parsedToken.indirectIndex = 5; |
michael@0 | 1068 | top = ucol_tok_doSetTop(src, status); |
michael@0 | 1069 | newStrength = UCOL_TOK_RESET; |
michael@0 | 1070 | goto EndOfLoop; |
michael@0 | 1071 | } |
michael@0 | 1072 | newStrength = UCOL_TERTIARY; |
michael@0 | 1073 | break; |
michael@0 | 1074 | |
michael@0 | 1075 | case 0x003B/*';'*/: |
michael@0 | 1076 | if (newStrength != UCOL_TOK_UNSET) { |
michael@0 | 1077 | goto EndOfLoop; |
michael@0 | 1078 | } |
michael@0 | 1079 | |
michael@0 | 1080 | /* if we start with strength, we'll reset to top */ |
michael@0 | 1081 | if(startOfRules == TRUE) { |
michael@0 | 1082 | src->parsedToken.indirectIndex = 5; |
michael@0 | 1083 | top = ucol_tok_doSetTop(src, status); |
michael@0 | 1084 | newStrength = UCOL_TOK_RESET; |
michael@0 | 1085 | goto EndOfLoop; |
michael@0 | 1086 | } |
michael@0 | 1087 | newStrength = UCOL_SECONDARY; |
michael@0 | 1088 | break; |
michael@0 | 1089 | |
michael@0 | 1090 | case 0x003C/*'<'*/: |
michael@0 | 1091 | if (newStrength != UCOL_TOK_UNSET) { |
michael@0 | 1092 | goto EndOfLoop; |
michael@0 | 1093 | } |
michael@0 | 1094 | |
michael@0 | 1095 | /* if we start with strength, we'll reset to top */ |
michael@0 | 1096 | if(startOfRules == TRUE) { |
michael@0 | 1097 | src->parsedToken.indirectIndex = 5; |
michael@0 | 1098 | top = ucol_tok_doSetTop(src, status); |
michael@0 | 1099 | newStrength = UCOL_TOK_RESET; |
michael@0 | 1100 | goto EndOfLoop; |
michael@0 | 1101 | } |
michael@0 | 1102 | /* before this, do a scan to verify whether this is */ |
michael@0 | 1103 | /* another strength */ |
michael@0 | 1104 | if(*(src->current+1) == 0x003C) { |
michael@0 | 1105 | src->current++; |
michael@0 | 1106 | if(*(src->current+1) == 0x003C) { |
michael@0 | 1107 | src->current++; /* three in a row! */ |
michael@0 | 1108 | newStrength = UCOL_TERTIARY; |
michael@0 | 1109 | } else { /* two in a row */ |
michael@0 | 1110 | newStrength = UCOL_SECONDARY; |
michael@0 | 1111 | } |
michael@0 | 1112 | } else { /* just one */ |
michael@0 | 1113 | newStrength = UCOL_PRIMARY; |
michael@0 | 1114 | } |
michael@0 | 1115 | if(*(src->current+1) == 0x002A) {/*'*'*/ |
michael@0 | 1116 | src->current++; |
michael@0 | 1117 | src->isStarred = TRUE; |
michael@0 | 1118 | } |
michael@0 | 1119 | break; |
michael@0 | 1120 | |
michael@0 | 1121 | case 0x0026/*'&'*/: |
michael@0 | 1122 | if (newStrength != UCOL_TOK_UNSET) { |
michael@0 | 1123 | /**/ |
michael@0 | 1124 | goto EndOfLoop; |
michael@0 | 1125 | } |
michael@0 | 1126 | |
michael@0 | 1127 | newStrength = UCOL_TOK_RESET; /* PatternEntry::RESET = 0 */ |
michael@0 | 1128 | break; |
michael@0 | 1129 | |
michael@0 | 1130 | case 0x005b/*'['*/: |
michael@0 | 1131 | /* options - read an option, analyze it */ |
michael@0 | 1132 | if(u_strchr(src->current, 0x005d /*']'*/) != NULL) { |
michael@0 | 1133 | uint8_t result = ucol_uprv_tok_readAndSetOption(src, status); |
michael@0 | 1134 | if(U_SUCCESS(*status)) { |
michael@0 | 1135 | if(result & UCOL_TOK_TOP) { |
michael@0 | 1136 | if(newStrength == UCOL_TOK_RESET) { |
michael@0 | 1137 | top = ucol_tok_doSetTop(src, status); |
michael@0 | 1138 | if(before) { // This is a combination of before and indirection like '&[before 2][first regular]<b' |
michael@0 | 1139 | src->parsedToken.charsLen+=2; |
michael@0 | 1140 | buff[0] = 0x002d; |
michael@0 | 1141 | buff[1] = before; |
michael@0 | 1142 | ucol_tok_addToExtraCurrent(src, buff, 2, status); |
michael@0 | 1143 | } |
michael@0 | 1144 | |
michael@0 | 1145 | src->current++; |
michael@0 | 1146 | goto EndOfLoop; |
michael@0 | 1147 | } else { |
michael@0 | 1148 | *status = U_INVALID_FORMAT_ERROR; |
michael@0 | 1149 | syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); |
michael@0 | 1150 | DBG_FORMAT_ERROR |
michael@0 | 1151 | } |
michael@0 | 1152 | } else if(result & UCOL_TOK_VARIABLE_TOP) { |
michael@0 | 1153 | if(newStrength != UCOL_TOK_RESET && newStrength != UCOL_TOK_UNSET) { |
michael@0 | 1154 | variableTop = TRUE; |
michael@0 | 1155 | src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source); |
michael@0 | 1156 | src->parsedToken.charsLen = 1; |
michael@0 | 1157 | buff[0] = 0xFFFF; |
michael@0 | 1158 | ucol_tok_addToExtraCurrent(src, buff, 1, status); |
michael@0 | 1159 | src->current++; |
michael@0 | 1160 | goto EndOfLoop; |
michael@0 | 1161 | } else { |
michael@0 | 1162 | *status = U_INVALID_FORMAT_ERROR; |
michael@0 | 1163 | syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); |
michael@0 | 1164 | DBG_FORMAT_ERROR |
michael@0 | 1165 | } |
michael@0 | 1166 | } else if (result & UCOL_TOK_BEFORE){ |
michael@0 | 1167 | if(newStrength == UCOL_TOK_RESET) { |
michael@0 | 1168 | before = result & UCOL_TOK_BEFORE; |
michael@0 | 1169 | } else { |
michael@0 | 1170 | *status = U_INVALID_FORMAT_ERROR; |
michael@0 | 1171 | syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); |
michael@0 | 1172 | DBG_FORMAT_ERROR |
michael@0 | 1173 | } |
michael@0 | 1174 | } |
michael@0 | 1175 | } else { |
michael@0 | 1176 | *status = U_INVALID_FORMAT_ERROR; |
michael@0 | 1177 | syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); |
michael@0 | 1178 | DBG_FORMAT_ERROR |
michael@0 | 1179 | return NULL; |
michael@0 | 1180 | } |
michael@0 | 1181 | } |
michael@0 | 1182 | break; |
michael@0 | 1183 | case 0x0021/*! skip java thai modifier reordering*/: |
michael@0 | 1184 | break; |
michael@0 | 1185 | case 0x002F/*'/'*/: |
michael@0 | 1186 | wasInQuote = FALSE; /* if we were copying source characters, we want to stop now */ |
michael@0 | 1187 | inChars = FALSE; /* we're now processing expansion */ |
michael@0 | 1188 | break; |
michael@0 | 1189 | case 0x005C /* back slash for escaped chars */: |
michael@0 | 1190 | isEscaped = TRUE; |
michael@0 | 1191 | break; |
michael@0 | 1192 | /* found a quote, we're gonna start copying */ |
michael@0 | 1193 | case 0x0027/*'\''*/: |
michael@0 | 1194 | if (newStrength == UCOL_TOK_UNSET) { /* quote is illegal until we have a strength */ |
michael@0 | 1195 | *status = U_INVALID_FORMAT_ERROR; |
michael@0 | 1196 | syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); |
michael@0 | 1197 | DBG_FORMAT_ERROR |
michael@0 | 1198 | return NULL; |
michael@0 | 1199 | // enabling rules to start with a non-token character a < b |
michael@0 | 1200 | // newStrength = UCOL_TOK_RESET; |
michael@0 | 1201 | } |
michael@0 | 1202 | |
michael@0 | 1203 | inQuote = TRUE; |
michael@0 | 1204 | |
michael@0 | 1205 | if(inChars) { /* we're doing characters */ |
michael@0 | 1206 | if(wasInQuote == FALSE) { |
michael@0 | 1207 | src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source); |
michael@0 | 1208 | } |
michael@0 | 1209 | if (src->parsedToken.charsLen != 0) { |
michael@0 | 1210 | ucol_tok_addToExtraCurrent(src, src->current - src->parsedToken.charsLen, src->parsedToken.charsLen, status); |
michael@0 | 1211 | } |
michael@0 | 1212 | src->parsedToken.charsLen++; |
michael@0 | 1213 | } else { /* we're doing an expansion */ |
michael@0 | 1214 | if(wasInQuote == FALSE) { |
michael@0 | 1215 | extensionOffset = (uint32_t)(src->extraCurrent - src->source); |
michael@0 | 1216 | } |
michael@0 | 1217 | if (newExtensionLen != 0) { |
michael@0 | 1218 | ucol_tok_addToExtraCurrent(src, src->current - newExtensionLen, newExtensionLen, status); |
michael@0 | 1219 | } |
michael@0 | 1220 | newExtensionLen++; |
michael@0 | 1221 | } |
michael@0 | 1222 | |
michael@0 | 1223 | wasInQuote = TRUE; |
michael@0 | 1224 | |
michael@0 | 1225 | ch = *(++(src->current)); |
michael@0 | 1226 | if(ch == 0x0027) { /* copy the double quote */ |
michael@0 | 1227 | ucol_tok_addToExtraCurrent(src, &ch, 1, status); |
michael@0 | 1228 | inQuote = FALSE; |
michael@0 | 1229 | } |
michael@0 | 1230 | break; |
michael@0 | 1231 | |
michael@0 | 1232 | /* '@' is french only if the strength is not currently set */ |
michael@0 | 1233 | /* if it is, it's just a regular character in collation rules */ |
michael@0 | 1234 | case 0x0040/*'@'*/: |
michael@0 | 1235 | if (newStrength == UCOL_TOK_UNSET) { |
michael@0 | 1236 | src->opts->frenchCollation = UCOL_ON; |
michael@0 | 1237 | break; |
michael@0 | 1238 | } |
michael@0 | 1239 | |
michael@0 | 1240 | case 0x007C /*|*/: /* this means we have actually been reading prefix part */ |
michael@0 | 1241 | // we want to store read characters to the prefix part and continue reading |
michael@0 | 1242 | // the characters (proper way would be to restart reading the chars, but in |
michael@0 | 1243 | // that case we would have to complicate the token hasher, which I do not |
michael@0 | 1244 | // intend to play with. Instead, we will do prefixes when prefixes are due |
michael@0 | 1245 | // (before adding the elements). |
michael@0 | 1246 | src->parsedToken.prefixOffset = src->parsedToken.charsOffset; |
michael@0 | 1247 | src->parsedToken.prefixLen = src->parsedToken.charsLen; |
michael@0 | 1248 | |
michael@0 | 1249 | if(inChars) { /* we're doing characters */ |
michael@0 | 1250 | if(wasInQuote == FALSE) { |
michael@0 | 1251 | src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source); |
michael@0 | 1252 | } |
michael@0 | 1253 | if (src->parsedToken.charsLen != 0) { |
michael@0 | 1254 | ucol_tok_addToExtraCurrent(src, src->current - src->parsedToken.charsLen, src->parsedToken.charsLen, status); |
michael@0 | 1255 | } |
michael@0 | 1256 | src->parsedToken.charsLen++; |
michael@0 | 1257 | } |
michael@0 | 1258 | |
michael@0 | 1259 | wasInQuote = TRUE; |
michael@0 | 1260 | |
michael@0 | 1261 | do { |
michael@0 | 1262 | ch = *(++(src->current)); |
michael@0 | 1263 | // skip whitespace between '|' and the character |
michael@0 | 1264 | } while (PatternProps::isWhiteSpace(ch)); |
michael@0 | 1265 | break; |
michael@0 | 1266 | |
michael@0 | 1267 | //charsOffset = 0; |
michael@0 | 1268 | //newCharsLen = 0; |
michael@0 | 1269 | //break; // We want to store the whole prefix/character sequence. If we break |
michael@0 | 1270 | // the '|' is going to get lost. |
michael@0 | 1271 | |
michael@0 | 1272 | case 0x002D /*-*/: /* A range. */ |
michael@0 | 1273 | if (newStrength != UCOL_TOK_UNSET) { |
michael@0 | 1274 | // While processing the pending token, the isStarred field |
michael@0 | 1275 | // is reset, so it needs to be saved for the next |
michael@0 | 1276 | // invocation. |
michael@0 | 1277 | src->savedIsStarred = src->isStarred; |
michael@0 | 1278 | goto EndOfLoop; |
michael@0 | 1279 | } |
michael@0 | 1280 | src->isStarred = src->savedIsStarred; |
michael@0 | 1281 | |
michael@0 | 1282 | // Ranges are valid only in starred tokens. |
michael@0 | 1283 | if (!src->isStarred) { |
michael@0 | 1284 | *status = U_INVALID_FORMAT_ERROR; |
michael@0 | 1285 | syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); |
michael@0 | 1286 | DBG_FORMAT_ERROR |
michael@0 | 1287 | return NULL; |
michael@0 | 1288 | } |
michael@0 | 1289 | newStrength = src->parsedToken.strength; |
michael@0 | 1290 | src->inRange = TRUE; |
michael@0 | 1291 | break; |
michael@0 | 1292 | |
michael@0 | 1293 | case 0x0023 /*#*/: /* this is a comment, skip everything through the end of line */ |
michael@0 | 1294 | do { |
michael@0 | 1295 | ch = *(++(src->current)); |
michael@0 | 1296 | } while (!isCharNewLine(ch)); |
michael@0 | 1297 | |
michael@0 | 1298 | break; |
michael@0 | 1299 | default: |
michael@0 | 1300 | if (newStrength == UCOL_TOK_UNSET) { |
michael@0 | 1301 | *status = U_INVALID_FORMAT_ERROR; |
michael@0 | 1302 | syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); |
michael@0 | 1303 | DBG_FORMAT_ERROR |
michael@0 | 1304 | return NULL; |
michael@0 | 1305 | } |
michael@0 | 1306 | |
michael@0 | 1307 | if (ucol_tok_isSpecialChar(ch) && (inQuote == FALSE)) { |
michael@0 | 1308 | *status = U_INVALID_FORMAT_ERROR; |
michael@0 | 1309 | syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); |
michael@0 | 1310 | DBG_FORMAT_ERROR |
michael@0 | 1311 | return NULL; |
michael@0 | 1312 | } |
michael@0 | 1313 | |
michael@0 | 1314 | if(ch == 0x0000 && src->current+1 == src->end) { |
michael@0 | 1315 | break; |
michael@0 | 1316 | } |
michael@0 | 1317 | |
michael@0 | 1318 | if (inChars) { |
michael@0 | 1319 | if(src->parsedToken.charsLen == 0) { |
michael@0 | 1320 | src->parsedToken.charsOffset = (uint32_t)(src->current - src->source); |
michael@0 | 1321 | } |
michael@0 | 1322 | src->parsedToken.charsLen++; |
michael@0 | 1323 | } else { |
michael@0 | 1324 | if(newExtensionLen == 0) { |
michael@0 | 1325 | extensionOffset = (uint32_t)(src->current - src->source); |
michael@0 | 1326 | } |
michael@0 | 1327 | newExtensionLen++; |
michael@0 | 1328 | } |
michael@0 | 1329 | |
michael@0 | 1330 | break; |
michael@0 | 1331 | } |
michael@0 | 1332 | } |
michael@0 | 1333 | } |
michael@0 | 1334 | |
michael@0 | 1335 | if(wasInQuote) { |
michael@0 | 1336 | if(ch != 0x27) { |
michael@0 | 1337 | if(inQuote || !PatternProps::isWhiteSpace(ch)) { |
michael@0 | 1338 | ucol_tok_addToExtraCurrent(src, &ch, 1, status); |
michael@0 | 1339 | } |
michael@0 | 1340 | } |
michael@0 | 1341 | } |
michael@0 | 1342 | |
michael@0 | 1343 | src->current++; |
michael@0 | 1344 | } |
michael@0 | 1345 | |
michael@0 | 1346 | EndOfLoop: |
michael@0 | 1347 | wasInQuote = FALSE; |
michael@0 | 1348 | if (newStrength == UCOL_TOK_UNSET) { |
michael@0 | 1349 | return NULL; |
michael@0 | 1350 | } |
michael@0 | 1351 | |
michael@0 | 1352 | if (src->parsedToken.charsLen == 0 && top == FALSE) { |
michael@0 | 1353 | syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); |
michael@0 | 1354 | *status = U_INVALID_FORMAT_ERROR; |
michael@0 | 1355 | DBG_FORMAT_ERROR |
michael@0 | 1356 | return NULL; |
michael@0 | 1357 | } |
michael@0 | 1358 | |
michael@0 | 1359 | src->parsedToken.strength = newStrength; |
michael@0 | 1360 | src->parsedToken.extensionOffset = extensionOffset; |
michael@0 | 1361 | src->parsedToken.extensionLen = newExtensionLen; |
michael@0 | 1362 | src->parsedToken.flags = (UCOL_TOK_VARIABLE_TOP * (variableTop?1:0)) | (UCOL_TOK_TOP * (top?1:0)) | before; |
michael@0 | 1363 | |
michael@0 | 1364 | return src->current; |
michael@0 | 1365 | } |
michael@0 | 1366 | |
michael@0 | 1367 | /* |
michael@0 | 1368 | * Parses the next token, keeps the indices in src->parsedToken, and updates the counters. |
michael@0 | 1369 | * @see ucol_tok_parseNextTokenInternal() for the description of what operators are supported. |
michael@0 | 1370 | * |
michael@0 | 1371 | * In addition to what ucol_tok_parseNextTokenInternal() does, this function does the following: |
michael@0 | 1372 | * 1) ucol_tok_parseNextTokenInternal() returns a range as a single token. This function separates |
michael@0 | 1373 | * it to separate tokens and returns one by one. In order to do that, the necessary states are |
michael@0 | 1374 | * cached as member variables of the token parser. |
michael@0 | 1375 | * 2) When encountering a range, ucol_tok_parseNextTokenInternal() processes characters up to the |
michael@0 | 1376 | * starting character as a single list token (which is separated into individual characters here) |
michael@0 | 1377 | * and as another list token starting with the last character in the range. Before expanding it |
michael@0 | 1378 | * as a list of tokens, this function expands the range by filling the intermediate characters and |
michael@0 | 1379 | * returns them one by one as separate tokens. |
michael@0 | 1380 | * Necessary checks are done for invalid combinations. |
michael@0 | 1381 | */ |
michael@0 | 1382 | U_CAPI const UChar* U_EXPORT2 |
michael@0 | 1383 | ucol_tok_parseNextToken(UColTokenParser *src, |
michael@0 | 1384 | UBool startOfRules, |
michael@0 | 1385 | UParseError *parseError, |
michael@0 | 1386 | UErrorCode *status) |
michael@0 | 1387 | { |
michael@0 | 1388 | const UChar *nextToken; |
michael@0 | 1389 | |
michael@0 | 1390 | if (src->inRange) { |
michael@0 | 1391 | // We are not done processing a range. Continue it. |
michael@0 | 1392 | return ucol_tok_processNextCodePointInRange(src, status); |
michael@0 | 1393 | } else if (src->isStarred) { |
michael@0 | 1394 | // We are not done processing a starred token. Continue it. |
michael@0 | 1395 | return ucol_tok_processNextTokenInStarredList(src); |
michael@0 | 1396 | } |
michael@0 | 1397 | |
michael@0 | 1398 | // Get the next token. |
michael@0 | 1399 | nextToken = ucol_tok_parseNextTokenInternal(src, startOfRules, parseError, status); |
michael@0 | 1400 | |
michael@0 | 1401 | if (nextToken == NULL) { |
michael@0 | 1402 | return NULL; |
michael@0 | 1403 | } |
michael@0 | 1404 | |
michael@0 | 1405 | if (src->inRange) { |
michael@0 | 1406 | // A new range has started. |
michael@0 | 1407 | // Check whether it is a chain of ranges with more than one hyphen. |
michael@0 | 1408 | if (src->lastRangeCp > 0 && src->lastRangeCp == src->previousCp) { |
michael@0 | 1409 | *status = U_INVALID_FORMAT_ERROR; |
michael@0 | 1410 | syntaxError(src->source,src->parsedToken.charsOffset-1, |
michael@0 | 1411 | src->parsedToken.charsOffset+src->parsedToken.charsLen, parseError); |
michael@0 | 1412 | DBG_FORMAT_ERROR |
michael@0 | 1413 | return NULL; |
michael@0 | 1414 | } |
michael@0 | 1415 | |
michael@0 | 1416 | // The current token indicates the second code point of the range. |
michael@0 | 1417 | // Process just that, and then proceed with the star. |
michael@0 | 1418 | src->currentStarredCharIndex = src->parsedToken.charsOffset; |
michael@0 | 1419 | U16_NEXT(src->source, src->currentStarredCharIndex, |
michael@0 | 1420 | (uint32_t)(src->end - src->source), src->lastRangeCp); |
michael@0 | 1421 | if (src->lastRangeCp <= src->previousCp) { |
michael@0 | 1422 | *status = U_INVALID_FORMAT_ERROR; |
michael@0 | 1423 | syntaxError(src->source,src->parsedToken.charsOffset-1, |
michael@0 | 1424 | src->parsedToken.charsOffset+src->parsedToken.charsLen,parseError); |
michael@0 | 1425 | DBG_FORMAT_ERROR |
michael@0 | 1426 | return NULL; |
michael@0 | 1427 | } |
michael@0 | 1428 | |
michael@0 | 1429 | // Set current range code point to process the range loop |
michael@0 | 1430 | src->currentRangeCp = src->previousCp + 1; |
michael@0 | 1431 | |
michael@0 | 1432 | src->lastStarredCharIndex = src->parsedToken.charsOffset + src->parsedToken.charsLen - 1; |
michael@0 | 1433 | |
michael@0 | 1434 | return ucol_tok_processNextCodePointInRange(src, status); |
michael@0 | 1435 | } else if (src->isStarred) { |
michael@0 | 1436 | // We define two indices m_currentStarredCharIndex_ and m_lastStarredCharIndex_ so that |
michael@0 | 1437 | // [m_currentStarredCharIndex_ .. m_lastStarredCharIndex_], both inclusive, need to be |
michael@0 | 1438 | // separated into several tokens and returned. |
michael@0 | 1439 | src->currentStarredCharIndex = src->parsedToken.charsOffset; |
michael@0 | 1440 | src->lastStarredCharIndex = src->parsedToken.charsOffset + src->parsedToken.charsLen - 1; |
michael@0 | 1441 | |
michael@0 | 1442 | return ucol_tok_processNextTokenInStarredList(src); |
michael@0 | 1443 | } else { |
michael@0 | 1444 | // Set previous codepoint |
michael@0 | 1445 | U16_GET(src->source, 0, src->parsedToken.charsOffset, (uint32_t)(src->end - src->source), src->previousCp); |
michael@0 | 1446 | } |
michael@0 | 1447 | return nextToken; |
michael@0 | 1448 | } |
michael@0 | 1449 | |
michael@0 | 1450 | |
michael@0 | 1451 | /* |
michael@0 | 1452 | Processing Description |
michael@0 | 1453 | 1 Build a ListList. Each list has a header, which contains two lists (positive |
michael@0 | 1454 | and negative), a reset token, a baseCE, nextCE, and previousCE. The lists and |
michael@0 | 1455 | reset may be null. |
michael@0 | 1456 | 2 As you process, you keep a LAST pointer that points to the last token you |
michael@0 | 1457 | handled. |
michael@0 | 1458 | |
michael@0 | 1459 | */ |
michael@0 | 1460 | |
michael@0 | 1461 | static UColToken *ucol_tok_initAReset(UColTokenParser *src, const UChar *expand, uint32_t *expandNext, |
michael@0 | 1462 | UParseError *parseError, UErrorCode *status) |
michael@0 | 1463 | { |
michael@0 | 1464 | if(src->resultLen == src->listCapacity) { |
michael@0 | 1465 | // Unfortunately, this won't work, as we store addresses of lhs in token |
michael@0 | 1466 | src->listCapacity *= 2; |
michael@0 | 1467 | src->lh = (UColTokListHeader *)uprv_realloc(src->lh, src->listCapacity*sizeof(UColTokListHeader)); |
michael@0 | 1468 | if(src->lh == NULL) { |
michael@0 | 1469 | *status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 1470 | return NULL; |
michael@0 | 1471 | } |
michael@0 | 1472 | } |
michael@0 | 1473 | /* do the reset thing */ |
michael@0 | 1474 | UColToken *sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken)); |
michael@0 | 1475 | /* test for NULL */ |
michael@0 | 1476 | if (sourceToken == NULL) { |
michael@0 | 1477 | *status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 1478 | return NULL; |
michael@0 | 1479 | } |
michael@0 | 1480 | sourceToken->rulesToParseHdl = &(src->source); |
michael@0 | 1481 | sourceToken->source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset; |
michael@0 | 1482 | sourceToken->expansion = src->parsedToken.extensionLen << 24 | src->parsedToken.extensionOffset; |
michael@0 | 1483 | |
michael@0 | 1484 | sourceToken->debugSource = *(src->source + src->parsedToken.charsOffset); |
michael@0 | 1485 | sourceToken->debugExpansion = *(src->source + src->parsedToken.extensionOffset); |
michael@0 | 1486 | |
michael@0 | 1487 | // keep the flags around so that we know about before |
michael@0 | 1488 | sourceToken->flags = src->parsedToken.flags; |
michael@0 | 1489 | |
michael@0 | 1490 | if(src->parsedToken.prefixOffset != 0) { |
michael@0 | 1491 | // this is a syntax error |
michael@0 | 1492 | *status = U_INVALID_FORMAT_ERROR; |
michael@0 | 1493 | syntaxError(src->source,src->parsedToken.charsOffset-1,src->parsedToken.charsOffset+src->parsedToken.charsLen,parseError); |
michael@0 | 1494 | DBG_FORMAT_ERROR |
michael@0 | 1495 | uprv_free(sourceToken); |
michael@0 | 1496 | return 0; |
michael@0 | 1497 | } else { |
michael@0 | 1498 | sourceToken->prefix = 0; |
michael@0 | 1499 | } |
michael@0 | 1500 | |
michael@0 | 1501 | sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should also handle reverse */ |
michael@0 | 1502 | sourceToken->strength = UCOL_TOK_RESET; |
michael@0 | 1503 | sourceToken->next = NULL; |
michael@0 | 1504 | sourceToken->previous = NULL; |
michael@0 | 1505 | sourceToken->noOfCEs = 0; |
michael@0 | 1506 | sourceToken->noOfExpCEs = 0; |
michael@0 | 1507 | sourceToken->listHeader = &src->lh[src->resultLen]; |
michael@0 | 1508 | |
michael@0 | 1509 | src->lh[src->resultLen].first = NULL; |
michael@0 | 1510 | src->lh[src->resultLen].last = NULL; |
michael@0 | 1511 | src->lh[src->resultLen].first = NULL; |
michael@0 | 1512 | src->lh[src->resultLen].last = NULL; |
michael@0 | 1513 | |
michael@0 | 1514 | src->lh[src->resultLen].reset = sourceToken; |
michael@0 | 1515 | |
michael@0 | 1516 | /* |
michael@0 | 1517 | 3 Consider each item: relation, source, and expansion: e.g. ...< x / y ... |
michael@0 | 1518 | First convert all expansions into normal form. Examples: |
michael@0 | 1519 | If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c * |
michael@0 | 1520 | d * ... into &x * c/y * d * ... |
michael@0 | 1521 | Note: reset values can never have expansions, although they can cause the |
michael@0 | 1522 | very next item to have one. They may be contractions, if they are found |
michael@0 | 1523 | earlier in the list. |
michael@0 | 1524 | */ |
michael@0 | 1525 | *expandNext = 0; |
michael@0 | 1526 | if(expand != NULL) { |
michael@0 | 1527 | /* check to see if there is an expansion */ |
michael@0 | 1528 | if(src->parsedToken.charsLen > 1) { |
michael@0 | 1529 | uint32_t resetCharsOffset; |
michael@0 | 1530 | resetCharsOffset = (uint32_t)(expand - src->source); |
michael@0 | 1531 | sourceToken->source = ((resetCharsOffset - src->parsedToken.charsOffset ) << 24) | src->parsedToken.charsOffset; |
michael@0 | 1532 | *expandNext = ((src->parsedToken.charsLen + src->parsedToken.charsOffset - resetCharsOffset)<<24) | (resetCharsOffset); |
michael@0 | 1533 | } |
michael@0 | 1534 | } |
michael@0 | 1535 | |
michael@0 | 1536 | src->resultLen++; |
michael@0 | 1537 | |
michael@0 | 1538 | uhash_put(src->tailored, sourceToken, sourceToken, status); |
michael@0 | 1539 | |
michael@0 | 1540 | return sourceToken; |
michael@0 | 1541 | } |
michael@0 | 1542 | |
michael@0 | 1543 | static |
michael@0 | 1544 | inline UColToken *getVirginBefore(UColTokenParser *src, UColToken *sourceToken, uint8_t strength, UParseError *parseError, UErrorCode *status) { |
michael@0 | 1545 | if(U_FAILURE(*status)) { |
michael@0 | 1546 | return NULL; |
michael@0 | 1547 | } |
michael@0 | 1548 | /* this is a virgin before - we need to fish the anchor from the UCA */ |
michael@0 | 1549 | collIterate s; |
michael@0 | 1550 | uint32_t baseCE = UCOL_NOT_FOUND, baseContCE = UCOL_NOT_FOUND; |
michael@0 | 1551 | uint32_t CE, SecondCE; |
michael@0 | 1552 | // uint32_t invPos; |
michael@0 | 1553 | if(sourceToken != NULL) { |
michael@0 | 1554 | uprv_init_collIterate(src->UCA, src->source+((sourceToken->source)&0xFFFFFF), 1, &s, status); |
michael@0 | 1555 | } else { |
michael@0 | 1556 | uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset /**charsOffset*/, 1, &s, status); |
michael@0 | 1557 | } |
michael@0 | 1558 | if(U_FAILURE(*status)) { |
michael@0 | 1559 | return NULL; |
michael@0 | 1560 | } |
michael@0 | 1561 | |
michael@0 | 1562 | baseCE = ucol_getNextCE(src->UCA, &s, status) & 0xFFFFFF3F; |
michael@0 | 1563 | baseContCE = ucol_getNextCE(src->UCA, &s, status); |
michael@0 | 1564 | if(baseContCE == UCOL_NO_MORE_CES) { |
michael@0 | 1565 | baseContCE = 0; |
michael@0 | 1566 | } |
michael@0 | 1567 | |
michael@0 | 1568 | |
michael@0 | 1569 | UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts); |
michael@0 | 1570 | uint32_t ch = 0; |
michael@0 | 1571 | uint32_t expandNext = 0; |
michael@0 | 1572 | UColToken key; |
michael@0 | 1573 | |
michael@0 | 1574 | if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */ |
michael@0 | 1575 | uint32_t primary = (baseCE & UCOL_PRIMARYMASK) | ((baseContCE & UCOL_PRIMARYMASK) >> 16); |
michael@0 | 1576 | uint32_t raw = uprv_uca_getRawFromImplicit(primary); |
michael@0 | 1577 | ch = uprv_uca_getCodePointFromRaw(raw-1); |
michael@0 | 1578 | uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw-1); |
michael@0 | 1579 | CE = (primaryCE & UCOL_PRIMARYMASK) | 0x0505; |
michael@0 | 1580 | SecondCE = ((primaryCE << 16) & UCOL_PRIMARYMASK) | UCOL_CONTINUATION_MARKER; |
michael@0 | 1581 | |
michael@0 | 1582 | src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source); |
michael@0 | 1583 | *src->extraCurrent++ = 0xFFFE; |
michael@0 | 1584 | *src->extraCurrent++ = (UChar)ch; |
michael@0 | 1585 | src->parsedToken.charsLen++; |
michael@0 | 1586 | |
michael@0 | 1587 | key.source = (src->parsedToken.charsLen/**newCharsLen*/ << 24) | src->parsedToken.charsOffset/**charsOffset*/; |
michael@0 | 1588 | key.rulesToParseHdl = &(src->source); |
michael@0 | 1589 | |
michael@0 | 1590 | //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key); |
michael@0 | 1591 | sourceToken = (UColToken *)uhash_get(src->tailored, &key); |
michael@0 | 1592 | |
michael@0 | 1593 | if(sourceToken == NULL) { |
michael@0 | 1594 | src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F; |
michael@0 | 1595 | if(isContinuation(SecondCE)) { |
michael@0 | 1596 | src->lh[src->resultLen].baseContCE = SecondCE; |
michael@0 | 1597 | } else { |
michael@0 | 1598 | src->lh[src->resultLen].baseContCE = 0; |
michael@0 | 1599 | } |
michael@0 | 1600 | src->lh[src->resultLen].nextCE = 0; |
michael@0 | 1601 | src->lh[src->resultLen].nextContCE = 0; |
michael@0 | 1602 | src->lh[src->resultLen].previousCE = 0; |
michael@0 | 1603 | src->lh[src->resultLen].previousContCE = 0; |
michael@0 | 1604 | |
michael@0 | 1605 | src->lh[src->resultLen].indirect = FALSE; |
michael@0 | 1606 | |
michael@0 | 1607 | sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status); |
michael@0 | 1608 | } |
michael@0 | 1609 | |
michael@0 | 1610 | } else { |
michael@0 | 1611 | /* invPos = */ ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &SecondCE, strength); |
michael@0 | 1612 | |
michael@0 | 1613 | // we got the previous CE. Now we need to see if the difference between |
michael@0 | 1614 | // the two CEs is really of the requested strength. |
michael@0 | 1615 | // if it's a bigger difference (we asked for secondary and got primary), we |
michael@0 | 1616 | // need to modify the CE. |
michael@0 | 1617 | if(ucol_getCEStrengthDifference(baseCE, baseContCE, CE, SecondCE) < strength) { |
michael@0 | 1618 | // adjust the strength |
michael@0 | 1619 | // now we are in the situation where our baseCE should actually be modified in |
michael@0 | 1620 | // order to get the CE in the right position. |
michael@0 | 1621 | if(strength == UCOL_SECONDARY) { |
michael@0 | 1622 | CE = baseCE - 0x0200; |
michael@0 | 1623 | } else { // strength == UCOL_TERTIARY |
michael@0 | 1624 | CE = baseCE - 0x02; |
michael@0 | 1625 | } |
michael@0 | 1626 | if(baseContCE) { |
michael@0 | 1627 | if(strength == UCOL_SECONDARY) { |
michael@0 | 1628 | SecondCE = baseContCE - 0x0200; |
michael@0 | 1629 | } else { // strength == UCOL_TERTIARY |
michael@0 | 1630 | SecondCE = baseContCE - 0x02; |
michael@0 | 1631 | } |
michael@0 | 1632 | } |
michael@0 | 1633 | } |
michael@0 | 1634 | |
michael@0 | 1635 | #if 0 |
michael@0 | 1636 | // the code below relies on getting a code point from the inverse table, in order to be |
michael@0 | 1637 | // able to merge the situations like &x < 9 &[before 1]a < d. This won't work: |
michael@0 | 1638 | // 1. There are many code points that have the same CE |
michael@0 | 1639 | // 2. The CE to codepoint table (things pointed to by CETable[3*invPos+2] are broken. |
michael@0 | 1640 | // Also, in case when there is no equivalent strength before an element, we have to actually |
michael@0 | 1641 | // construct one. For example, &[before 2]a << x won't result in x << a, because the element |
michael@0 | 1642 | // before a is a primary difference. |
michael@0 | 1643 | |
michael@0 | 1644 | //uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); |
michael@0 | 1645 | |
michael@0 | 1646 | |
michael@0 | 1647 | ch = CETable[3*invPos+2]; |
michael@0 | 1648 | |
michael@0 | 1649 | if((ch & UCOL_INV_SIZEMASK) != 0) { |
michael@0 | 1650 | uint16_t *conts = (uint16_t *)((uint8_t *)src->invUCA+src->invUCA->conts); |
michael@0 | 1651 | uint32_t offset = (ch & UCOL_INV_OFFSETMASK); |
michael@0 | 1652 | ch = conts[offset]; |
michael@0 | 1653 | } |
michael@0 | 1654 | |
michael@0 | 1655 | *src->extraCurrent++ = (UChar)ch; |
michael@0 | 1656 | src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source - 1); |
michael@0 | 1657 | src->parsedToken.charsLen = 1; |
michael@0 | 1658 | |
michael@0 | 1659 | // We got an UCA before. However, this might have been tailored. |
michael@0 | 1660 | // example: |
michael@0 | 1661 | // &\u30ca = \u306a |
michael@0 | 1662 | // &[before 3]\u306a<<<\u306a|\u309d |
michael@0 | 1663 | |
michael@0 | 1664 | |
michael@0 | 1665 | // uint32_t key = (*newCharsLen << 24) | *charsOffset; |
michael@0 | 1666 | key.source = (src->parsedToken.charsLen/**newCharsLen*/ << 24) | src->parsedToken.charsOffset/**charsOffset*/; |
michael@0 | 1667 | key.rulesToParseHdl = &(src->source); |
michael@0 | 1668 | |
michael@0 | 1669 | //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key); |
michael@0 | 1670 | sourceToken = (UColToken *)uhash_get(src->tailored, &key); |
michael@0 | 1671 | #endif |
michael@0 | 1672 | |
michael@0 | 1673 | // here is how it should be. The situation such as &[before 1]a < x, should be |
michael@0 | 1674 | // resolved exactly as if we wrote &a > x. |
michael@0 | 1675 | // therefore, I don't really care if the UCA value before a has been changed. |
michael@0 | 1676 | // However, I do care if the strength between my element and the previous element |
michael@0 | 1677 | // is bigger then I wanted. So, if CE < baseCE and I wanted &[before 2], then i'll |
michael@0 | 1678 | // have to construct the base CE. |
michael@0 | 1679 | |
michael@0 | 1680 | |
michael@0 | 1681 | |
michael@0 | 1682 | // if we found a tailored thing, we have to use the UCA value and construct |
michael@0 | 1683 | // a new reset token with constructed name |
michael@0 | 1684 | //if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) { |
michael@0 | 1685 | // character to which we want to anchor is already tailored. |
michael@0 | 1686 | // We need to construct a new token which will be the anchor |
michael@0 | 1687 | // point |
michael@0 | 1688 | //*(src->extraCurrent-1) = 0xFFFE; |
michael@0 | 1689 | //*src->extraCurrent++ = (UChar)ch; |
michael@0 | 1690 | // grab before |
michael@0 | 1691 | src->parsedToken.charsOffset -= 10; |
michael@0 | 1692 | src->parsedToken.charsLen += 10; |
michael@0 | 1693 | src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F; |
michael@0 | 1694 | if(isContinuation(SecondCE)) { |
michael@0 | 1695 | src->lh[src->resultLen].baseContCE = SecondCE; |
michael@0 | 1696 | } else { |
michael@0 | 1697 | src->lh[src->resultLen].baseContCE = 0; |
michael@0 | 1698 | } |
michael@0 | 1699 | src->lh[src->resultLen].nextCE = 0; |
michael@0 | 1700 | src->lh[src->resultLen].nextContCE = 0; |
michael@0 | 1701 | src->lh[src->resultLen].previousCE = 0; |
michael@0 | 1702 | src->lh[src->resultLen].previousContCE = 0; |
michael@0 | 1703 | |
michael@0 | 1704 | src->lh[src->resultLen].indirect = FALSE; |
michael@0 | 1705 | |
michael@0 | 1706 | sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status); |
michael@0 | 1707 | //} |
michael@0 | 1708 | } |
michael@0 | 1709 | |
michael@0 | 1710 | return sourceToken; |
michael@0 | 1711 | |
michael@0 | 1712 | } |
michael@0 | 1713 | |
michael@0 | 1714 | uint32_t ucol_tok_assembleTokenList(UColTokenParser *src, UParseError *parseError, UErrorCode *status) { |
michael@0 | 1715 | UColToken *lastToken = NULL; |
michael@0 | 1716 | const UChar *parseEnd = NULL; |
michael@0 | 1717 | uint32_t expandNext = 0; |
michael@0 | 1718 | UBool variableTop = FALSE; |
michael@0 | 1719 | UBool top = FALSE; |
michael@0 | 1720 | uint16_t specs = 0; |
michael@0 | 1721 | UColTokListHeader *ListList = NULL; |
michael@0 | 1722 | |
michael@0 | 1723 | src->parsedToken.strength = UCOL_TOK_UNSET; |
michael@0 | 1724 | |
michael@0 | 1725 | ListList = src->lh; |
michael@0 | 1726 | |
michael@0 | 1727 | if(U_FAILURE(*status)) { |
michael@0 | 1728 | return 0; |
michael@0 | 1729 | } |
michael@0 | 1730 | #ifdef DEBUG_FOR_CODE_POINTS |
michael@0 | 1731 | char filename[35]; |
michael@0 | 1732 | sprintf(filename, "/tmp/debug_for_cp_%09d.txt", getpid()); |
michael@0 | 1733 | dfcp_fp = fopen(filename, "a"); |
michael@0 | 1734 | fprintf(stdout, "Output is in the file %s.\n", filename); |
michael@0 | 1735 | #endif |
michael@0 | 1736 | |
michael@0 | 1737 | #ifdef DEBUG_FOR_COLL_RULES |
michael@0 | 1738 | std::string s3; |
michael@0 | 1739 | UnicodeString(src->source).toUTF8String(s3); |
michael@0 | 1740 | std::cout << "src->source = " << s3 << std::endl; |
michael@0 | 1741 | #endif |
michael@0 | 1742 | |
michael@0 | 1743 | while(src->current < src->end || src->isStarred) { |
michael@0 | 1744 | src->parsedToken.prefixOffset = 0; |
michael@0 | 1745 | |
michael@0 | 1746 | parseEnd = ucol_tok_parseNextToken(src, |
michael@0 | 1747 | (UBool)(lastToken == NULL), |
michael@0 | 1748 | parseError, |
michael@0 | 1749 | status); |
michael@0 | 1750 | |
michael@0 | 1751 | specs = src->parsedToken.flags; |
michael@0 | 1752 | |
michael@0 | 1753 | |
michael@0 | 1754 | variableTop = ((specs & UCOL_TOK_VARIABLE_TOP) != 0); |
michael@0 | 1755 | top = ((specs & UCOL_TOK_TOP) != 0); |
michael@0 | 1756 | |
michael@0 | 1757 | if(U_SUCCESS(*status) && parseEnd != NULL) { |
michael@0 | 1758 | UColToken *sourceToken = NULL; |
michael@0 | 1759 | //uint32_t key = 0; |
michael@0 | 1760 | uint32_t lastStrength = UCOL_TOK_UNSET; |
michael@0 | 1761 | |
michael@0 | 1762 | if(lastToken != NULL ) { |
michael@0 | 1763 | lastStrength = lastToken->strength; |
michael@0 | 1764 | } |
michael@0 | 1765 | |
michael@0 | 1766 | #ifdef DEBUG_FOR_CODE_POINTS |
michael@0 | 1767 | UChar32 cp; |
michael@0 | 1768 | U16_GET(src->source, 0, src->parsedToken.charsOffset, (uint32_t)(src->extraEnd - src->source), cp); |
michael@0 | 1769 | fprintf(dfcp_fp, "Code point = %x, Strength = %x\n", cp, src->parsedToken.strength); |
michael@0 | 1770 | #endif |
michael@0 | 1771 | //key = newCharsLen << 24 | charsOffset; |
michael@0 | 1772 | UColToken key; |
michael@0 | 1773 | key.source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset; |
michael@0 | 1774 | key.rulesToParseHdl = &(src->source); |
michael@0 | 1775 | |
michael@0 | 1776 | /* 4 Lookup each source in the CharsToToken map, and find a sourceToken */ |
michael@0 | 1777 | sourceToken = (UColToken *)uhash_get(src->tailored, &key); |
michael@0 | 1778 | |
michael@0 | 1779 | if(src->parsedToken.strength != UCOL_TOK_RESET) { |
michael@0 | 1780 | if(lastToken == NULL) { /* this means that rules haven't started properly */ |
michael@0 | 1781 | *status = U_INVALID_FORMAT_ERROR; |
michael@0 | 1782 | syntaxError(src->source,0,(int32_t)(src->end-src->source),parseError); |
michael@0 | 1783 | DBG_FORMAT_ERROR |
michael@0 | 1784 | return 0; |
michael@0 | 1785 | } |
michael@0 | 1786 | /* 6 Otherwise (when relation != reset) */ |
michael@0 | 1787 | if(sourceToken == NULL) { |
michael@0 | 1788 | /* If sourceToken is null, create new one, */ |
michael@0 | 1789 | sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken)); |
michael@0 | 1790 | /* test for NULL */ |
michael@0 | 1791 | if (sourceToken == NULL) { |
michael@0 | 1792 | *status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 1793 | return 0; |
michael@0 | 1794 | } |
michael@0 | 1795 | sourceToken->rulesToParseHdl = &(src->source); |
michael@0 | 1796 | sourceToken->source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset; |
michael@0 | 1797 | |
michael@0 | 1798 | sourceToken->debugSource = *(src->source + src->parsedToken.charsOffset); |
michael@0 | 1799 | |
michael@0 | 1800 | sourceToken->prefix = src->parsedToken.prefixLen << 24 | src->parsedToken.prefixOffset; |
michael@0 | 1801 | sourceToken->debugPrefix = *(src->source + src->parsedToken.prefixOffset); |
michael@0 | 1802 | |
michael@0 | 1803 | sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should also handle reverse */ |
michael@0 | 1804 | sourceToken->next = NULL; |
michael@0 | 1805 | sourceToken->previous = NULL; |
michael@0 | 1806 | sourceToken->noOfCEs = 0; |
michael@0 | 1807 | sourceToken->noOfExpCEs = 0; |
michael@0 | 1808 | // keep the flags around so that we know about before |
michael@0 | 1809 | sourceToken->flags = src->parsedToken.flags; |
michael@0 | 1810 | uhash_put(src->tailored, sourceToken, sourceToken, status); |
michael@0 | 1811 | if(U_FAILURE(*status)) { |
michael@0 | 1812 | return 0; |
michael@0 | 1813 | } |
michael@0 | 1814 | } else { |
michael@0 | 1815 | /* we could have fished out a reset here */ |
michael@0 | 1816 | if(sourceToken->strength != UCOL_TOK_RESET && lastToken != sourceToken) { |
michael@0 | 1817 | /* otherwise remove sourceToken from where it was. */ |
michael@0 | 1818 | if(sourceToken->next != NULL) { |
michael@0 | 1819 | if(sourceToken->next->strength > sourceToken->strength) { |
michael@0 | 1820 | sourceToken->next->strength = sourceToken->strength; |
michael@0 | 1821 | } |
michael@0 | 1822 | sourceToken->next->previous = sourceToken->previous; |
michael@0 | 1823 | } else { |
michael@0 | 1824 | sourceToken->listHeader->last = sourceToken->previous; |
michael@0 | 1825 | } |
michael@0 | 1826 | |
michael@0 | 1827 | if(sourceToken->previous != NULL) { |
michael@0 | 1828 | sourceToken->previous->next = sourceToken->next; |
michael@0 | 1829 | } else { |
michael@0 | 1830 | sourceToken->listHeader->first = sourceToken->next; |
michael@0 | 1831 | } |
michael@0 | 1832 | sourceToken->next = NULL; |
michael@0 | 1833 | sourceToken->previous = NULL; |
michael@0 | 1834 | } |
michael@0 | 1835 | } |
michael@0 | 1836 | |
michael@0 | 1837 | sourceToken->strength = src->parsedToken.strength; |
michael@0 | 1838 | sourceToken->listHeader = lastToken->listHeader; |
michael@0 | 1839 | |
michael@0 | 1840 | /* |
michael@0 | 1841 | 1. Find the strongest strength in each list, and set strongestP and strongestN |
michael@0 | 1842 | accordingly in the headers. |
michael@0 | 1843 | */ |
michael@0 | 1844 | if(lastStrength == UCOL_TOK_RESET |
michael@0 | 1845 | || sourceToken->listHeader->first == 0) { |
michael@0 | 1846 | /* If LAST is a reset |
michael@0 | 1847 | insert sourceToken in the list. */ |
michael@0 | 1848 | if(sourceToken->listHeader->first == 0) { |
michael@0 | 1849 | sourceToken->listHeader->first = sourceToken; |
michael@0 | 1850 | sourceToken->listHeader->last = sourceToken; |
michael@0 | 1851 | } else { /* we need to find a place for us */ |
michael@0 | 1852 | /* and we'll get in front of the same strength */ |
michael@0 | 1853 | if(sourceToken->listHeader->first->strength <= sourceToken->strength) { |
michael@0 | 1854 | sourceToken->next = sourceToken->listHeader->first; |
michael@0 | 1855 | sourceToken->next->previous = sourceToken; |
michael@0 | 1856 | sourceToken->listHeader->first = sourceToken; |
michael@0 | 1857 | sourceToken->previous = NULL; |
michael@0 | 1858 | } else { |
michael@0 | 1859 | lastToken = sourceToken->listHeader->first; |
michael@0 | 1860 | while(lastToken->next != NULL && lastToken->next->strength > sourceToken->strength) { |
michael@0 | 1861 | lastToken = lastToken->next; |
michael@0 | 1862 | } |
michael@0 | 1863 | if(lastToken->next != NULL) { |
michael@0 | 1864 | lastToken->next->previous = sourceToken; |
michael@0 | 1865 | } else { |
michael@0 | 1866 | sourceToken->listHeader->last = sourceToken; |
michael@0 | 1867 | } |
michael@0 | 1868 | sourceToken->previous = lastToken; |
michael@0 | 1869 | sourceToken->next = lastToken->next; |
michael@0 | 1870 | lastToken->next = sourceToken; |
michael@0 | 1871 | } |
michael@0 | 1872 | } |
michael@0 | 1873 | } else { |
michael@0 | 1874 | /* Otherwise (when LAST is not a reset) |
michael@0 | 1875 | if polarity (LAST) == polarity(relation), insert sourceToken after LAST, |
michael@0 | 1876 | otherwise insert before. |
michael@0 | 1877 | when inserting after or before, search to the next position with the same |
michael@0 | 1878 | strength in that direction. (This is called postpone insertion). */ |
michael@0 | 1879 | if(sourceToken != lastToken) { |
michael@0 | 1880 | if(lastToken->polarity == sourceToken->polarity) { |
michael@0 | 1881 | while(lastToken->next != NULL && lastToken->next->strength > sourceToken->strength) { |
michael@0 | 1882 | lastToken = lastToken->next; |
michael@0 | 1883 | } |
michael@0 | 1884 | sourceToken->previous = lastToken; |
michael@0 | 1885 | if(lastToken->next != NULL) { |
michael@0 | 1886 | lastToken->next->previous = sourceToken; |
michael@0 | 1887 | } else { |
michael@0 | 1888 | sourceToken->listHeader->last = sourceToken; |
michael@0 | 1889 | } |
michael@0 | 1890 | |
michael@0 | 1891 | sourceToken->next = lastToken->next; |
michael@0 | 1892 | lastToken->next = sourceToken; |
michael@0 | 1893 | } else { |
michael@0 | 1894 | while(lastToken->previous != NULL && lastToken->previous->strength > sourceToken->strength) { |
michael@0 | 1895 | lastToken = lastToken->previous; |
michael@0 | 1896 | } |
michael@0 | 1897 | sourceToken->next = lastToken; |
michael@0 | 1898 | if(lastToken->previous != NULL) { |
michael@0 | 1899 | lastToken->previous->next = sourceToken; |
michael@0 | 1900 | } else { |
michael@0 | 1901 | sourceToken->listHeader->first = sourceToken; |
michael@0 | 1902 | } |
michael@0 | 1903 | sourceToken->previous = lastToken->previous; |
michael@0 | 1904 | lastToken->previous = sourceToken; |
michael@0 | 1905 | } |
michael@0 | 1906 | } else { /* repeated one thing twice in rules, stay with the stronger strength */ |
michael@0 | 1907 | if(lastStrength < sourceToken->strength) { |
michael@0 | 1908 | sourceToken->strength = lastStrength; |
michael@0 | 1909 | } |
michael@0 | 1910 | } |
michael@0 | 1911 | } |
michael@0 | 1912 | |
michael@0 | 1913 | /* if the token was a variable top, we're gonna put it in */ |
michael@0 | 1914 | if(variableTop == TRUE && src->varTop == NULL) { |
michael@0 | 1915 | variableTop = FALSE; |
michael@0 | 1916 | src->varTop = sourceToken; |
michael@0 | 1917 | } |
michael@0 | 1918 | |
michael@0 | 1919 | // Treat the expansions. |
michael@0 | 1920 | // There are two types of expansions: explicit (x / y) and reset based propagating expansions |
michael@0 | 1921 | // (&abc * d * e <=> &ab * d / c * e / c) |
michael@0 | 1922 | // if both of them are in effect for a token, they are combined. |
michael@0 | 1923 | |
michael@0 | 1924 | sourceToken->expansion = src->parsedToken.extensionLen << 24 | src->parsedToken.extensionOffset; |
michael@0 | 1925 | |
michael@0 | 1926 | if(expandNext != 0) { |
michael@0 | 1927 | if(sourceToken->strength == UCOL_PRIMARY) { /* primary strength kills off the implicit expansion */ |
michael@0 | 1928 | expandNext = 0; |
michael@0 | 1929 | } else if(sourceToken->expansion == 0) { /* if there is no expansion, implicit is just added to the token */ |
michael@0 | 1930 | sourceToken->expansion = expandNext; |
michael@0 | 1931 | } else { /* there is both explicit and implicit expansion. We need to make a combination */ |
michael@0 | 1932 | uprv_memcpy(src->extraCurrent, src->source + (expandNext & 0xFFFFFF), (expandNext >> 24)*sizeof(UChar)); |
michael@0 | 1933 | uprv_memcpy(src->extraCurrent+(expandNext >> 24), src->source + src->parsedToken.extensionOffset, src->parsedToken.extensionLen*sizeof(UChar)); |
michael@0 | 1934 | sourceToken->expansion = (uint32_t)(((expandNext >> 24) + src->parsedToken.extensionLen)<<24 | (uint32_t)(src->extraCurrent - src->source)); |
michael@0 | 1935 | src->extraCurrent += (expandNext >> 24) + src->parsedToken.extensionLen; |
michael@0 | 1936 | } |
michael@0 | 1937 | } |
michael@0 | 1938 | |
michael@0 | 1939 | // This is just for debugging purposes |
michael@0 | 1940 | if(sourceToken->expansion != 0) { |
michael@0 | 1941 | sourceToken->debugExpansion = *(src->source + src->parsedToken.extensionOffset); |
michael@0 | 1942 | } else { |
michael@0 | 1943 | sourceToken->debugExpansion = 0; |
michael@0 | 1944 | } |
michael@0 | 1945 | // if the previous token was a reset before, the strength of this |
michael@0 | 1946 | // token must match the strength of before. Otherwise we have an |
michael@0 | 1947 | // undefined situation. |
michael@0 | 1948 | // In other words, we currently have a cludge which we use to |
michael@0 | 1949 | // represent &a >> x. This is written as &[before 2]a << x. |
michael@0 | 1950 | if((lastToken->flags & UCOL_TOK_BEFORE) != 0) { |
michael@0 | 1951 | uint8_t beforeStrength = (lastToken->flags & UCOL_TOK_BEFORE) - 1; |
michael@0 | 1952 | if(beforeStrength != sourceToken->strength) { |
michael@0 | 1953 | *status = U_INVALID_FORMAT_ERROR; |
michael@0 | 1954 | syntaxError(src->source,0,(int32_t)(src->end-src->source),parseError); |
michael@0 | 1955 | DBG_FORMAT_ERROR |
michael@0 | 1956 | return 0; |
michael@0 | 1957 | } |
michael@0 | 1958 | } |
michael@0 | 1959 | } else { |
michael@0 | 1960 | if(lastToken != NULL && lastStrength == UCOL_TOK_RESET) { |
michael@0 | 1961 | /* if the previous token was also a reset, */ |
michael@0 | 1962 | /*this means that we have two consecutive resets */ |
michael@0 | 1963 | /* and we want to remove the previous one if empty*/ |
michael@0 | 1964 | if(src->resultLen > 0 && ListList[src->resultLen-1].first == NULL) { |
michael@0 | 1965 | src->resultLen--; |
michael@0 | 1966 | } |
michael@0 | 1967 | } |
michael@0 | 1968 | |
michael@0 | 1969 | if(sourceToken == NULL) { /* this is a reset, but it might still be somewhere in the tailoring, in shorter form */ |
michael@0 | 1970 | uint32_t searchCharsLen = src->parsedToken.charsLen; |
michael@0 | 1971 | while(searchCharsLen > 1 && sourceToken == NULL) { |
michael@0 | 1972 | searchCharsLen--; |
michael@0 | 1973 | //key = searchCharsLen << 24 | charsOffset; |
michael@0 | 1974 | UColToken key; |
michael@0 | 1975 | key.source = searchCharsLen << 24 | src->parsedToken.charsOffset; |
michael@0 | 1976 | key.rulesToParseHdl = &(src->source); |
michael@0 | 1977 | sourceToken = (UColToken *)uhash_get(src->tailored, &key); |
michael@0 | 1978 | } |
michael@0 | 1979 | if(sourceToken != NULL) { |
michael@0 | 1980 | expandNext = (src->parsedToken.charsLen - searchCharsLen) << 24 | (src->parsedToken.charsOffset + searchCharsLen); |
michael@0 | 1981 | } |
michael@0 | 1982 | } |
michael@0 | 1983 | |
michael@0 | 1984 | if((specs & UCOL_TOK_BEFORE) != 0) { /* we're doing before */ |
michael@0 | 1985 | if(top == FALSE) { /* there is no indirection */ |
michael@0 | 1986 | uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1; |
michael@0 | 1987 | if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) { |
michael@0 | 1988 | /* this is a before that is already ordered in the UCA - so we need to get the previous with good strength */ |
michael@0 | 1989 | while(sourceToken->strength > strength && sourceToken->previous != NULL) { |
michael@0 | 1990 | sourceToken = sourceToken->previous; |
michael@0 | 1991 | } |
michael@0 | 1992 | /* here, either we hit the strength or NULL */ |
michael@0 | 1993 | if(sourceToken->strength == strength) { |
michael@0 | 1994 | if(sourceToken->previous != NULL) { |
michael@0 | 1995 | sourceToken = sourceToken->previous; |
michael@0 | 1996 | } else { /* start of list */ |
michael@0 | 1997 | sourceToken = sourceToken->listHeader->reset; |
michael@0 | 1998 | } |
michael@0 | 1999 | } else { /* we hit NULL */ |
michael@0 | 2000 | /* we should be doing the else part */ |
michael@0 | 2001 | sourceToken = sourceToken->listHeader->reset; |
michael@0 | 2002 | sourceToken = getVirginBefore(src, sourceToken, strength, parseError, status); |
michael@0 | 2003 | } |
michael@0 | 2004 | } else { |
michael@0 | 2005 | sourceToken = getVirginBefore(src, sourceToken, strength, parseError, status); |
michael@0 | 2006 | } |
michael@0 | 2007 | } else { /* this is both before and indirection */ |
michael@0 | 2008 | top = FALSE; |
michael@0 | 2009 | ListList[src->resultLen].previousCE = 0; |
michael@0 | 2010 | ListList[src->resultLen].previousContCE = 0; |
michael@0 | 2011 | ListList[src->resultLen].indirect = TRUE; |
michael@0 | 2012 | /* we need to do slightly more work. we need to get the baseCE using the */ |
michael@0 | 2013 | /* inverse UCA & getPrevious. The next bound is not set, and will be decided */ |
michael@0 | 2014 | /* in ucol_bld */ |
michael@0 | 2015 | uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1; |
michael@0 | 2016 | uint32_t baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE; |
michael@0 | 2017 | uint32_t baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE;//&0xFFFFFF3F; |
michael@0 | 2018 | uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND; |
michael@0 | 2019 | |
michael@0 | 2020 | UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts); |
michael@0 | 2021 | if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && |
michael@0 | 2022 | (baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */ |
michael@0 | 2023 | uint32_t primary = (baseCE & UCOL_PRIMARYMASK) | ((baseContCE & UCOL_PRIMARYMASK) >> 16); |
michael@0 | 2024 | uint32_t raw = uprv_uca_getRawFromImplicit(primary); |
michael@0 | 2025 | uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw-1); |
michael@0 | 2026 | CE = (primaryCE & UCOL_PRIMARYMASK) | 0x0505; |
michael@0 | 2027 | SecondCE = ((primaryCE << 16) & UCOL_PRIMARYMASK) | UCOL_CONTINUATION_MARKER; |
michael@0 | 2028 | } else { |
michael@0 | 2029 | /*int32_t invPos = ucol_inv_getPrevCE(baseCE, baseContCE, &CE, &SecondCE, strength);*/ |
michael@0 | 2030 | ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &SecondCE, strength); |
michael@0 | 2031 | } |
michael@0 | 2032 | |
michael@0 | 2033 | ListList[src->resultLen].baseCE = CE; |
michael@0 | 2034 | ListList[src->resultLen].baseContCE = SecondCE; |
michael@0 | 2035 | ListList[src->resultLen].nextCE = 0; |
michael@0 | 2036 | ListList[src->resultLen].nextContCE = 0; |
michael@0 | 2037 | |
michael@0 | 2038 | sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status); |
michael@0 | 2039 | } |
michael@0 | 2040 | } |
michael@0 | 2041 | |
michael@0 | 2042 | |
michael@0 | 2043 | /* 5 If the relation is a reset: |
michael@0 | 2044 | If sourceToken is null |
michael@0 | 2045 | Create new list, create new sourceToken, make the baseCE from source, put |
michael@0 | 2046 | the sourceToken in ListHeader of the new list */ |
michael@0 | 2047 | if(sourceToken == NULL) { |
michael@0 | 2048 | /* |
michael@0 | 2049 | 3 Consider each item: relation, source, and expansion: e.g. ...< x / y ... |
michael@0 | 2050 | First convert all expansions into normal form. Examples: |
michael@0 | 2051 | If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c * |
michael@0 | 2052 | d * ... into &x * c/y * d * ... |
michael@0 | 2053 | Note: reset values can never have expansions, although they can cause the |
michael@0 | 2054 | very next item to have one. They may be contractions, if they are found |
michael@0 | 2055 | earlier in the list. |
michael@0 | 2056 | */ |
michael@0 | 2057 | if(top == FALSE) { |
michael@0 | 2058 | collIterate s; |
michael@0 | 2059 | uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND; |
michael@0 | 2060 | |
michael@0 | 2061 | uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset, src->parsedToken.charsLen, &s, status); |
michael@0 | 2062 | |
michael@0 | 2063 | CE = ucol_getNextCE(src->UCA, &s, status); |
michael@0 | 2064 | const UChar *expand = s.pos; |
michael@0 | 2065 | SecondCE = ucol_getNextCE(src->UCA, &s, status); |
michael@0 | 2066 | |
michael@0 | 2067 | ListList[src->resultLen].baseCE = CE & 0xFFFFFF3F; |
michael@0 | 2068 | if(isContinuation(SecondCE)) { |
michael@0 | 2069 | ListList[src->resultLen].baseContCE = SecondCE; |
michael@0 | 2070 | } else { |
michael@0 | 2071 | ListList[src->resultLen].baseContCE = 0; |
michael@0 | 2072 | } |
michael@0 | 2073 | ListList[src->resultLen].nextCE = 0; |
michael@0 | 2074 | ListList[src->resultLen].nextContCE = 0; |
michael@0 | 2075 | ListList[src->resultLen].previousCE = 0; |
michael@0 | 2076 | ListList[src->resultLen].previousContCE = 0; |
michael@0 | 2077 | ListList[src->resultLen].indirect = FALSE; |
michael@0 | 2078 | sourceToken = ucol_tok_initAReset(src, expand, &expandNext, parseError, status); |
michael@0 | 2079 | } else { /* top == TRUE */ |
michael@0 | 2080 | /* just use the supplied values */ |
michael@0 | 2081 | top = FALSE; |
michael@0 | 2082 | ListList[src->resultLen].previousCE = 0; |
michael@0 | 2083 | ListList[src->resultLen].previousContCE = 0; |
michael@0 | 2084 | ListList[src->resultLen].indirect = TRUE; |
michael@0 | 2085 | ListList[src->resultLen].baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE; |
michael@0 | 2086 | ListList[src->resultLen].baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE; |
michael@0 | 2087 | ListList[src->resultLen].nextCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitCE; |
michael@0 | 2088 | ListList[src->resultLen].nextContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitContCE; |
michael@0 | 2089 | |
michael@0 | 2090 | sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status); |
michael@0 | 2091 | |
michael@0 | 2092 | } |
michael@0 | 2093 | } else { /* reset to something already in rules */ |
michael@0 | 2094 | top = FALSE; |
michael@0 | 2095 | } |
michael@0 | 2096 | } |
michael@0 | 2097 | /* 7 After all this, set LAST to point to sourceToken, and goto step 3. */ |
michael@0 | 2098 | lastToken = sourceToken; |
michael@0 | 2099 | } else { |
michael@0 | 2100 | if(U_FAILURE(*status)) { |
michael@0 | 2101 | return 0; |
michael@0 | 2102 | } |
michael@0 | 2103 | } |
michael@0 | 2104 | } |
michael@0 | 2105 | #ifdef DEBUG_FOR_CODE_POINTS |
michael@0 | 2106 | fclose(dfcp_fp); |
michael@0 | 2107 | #endif |
michael@0 | 2108 | |
michael@0 | 2109 | |
michael@0 | 2110 | if(src->resultLen > 0 && ListList[src->resultLen-1].first == NULL) { |
michael@0 | 2111 | src->resultLen--; |
michael@0 | 2112 | } |
michael@0 | 2113 | return src->resultLen; |
michael@0 | 2114 | } |
michael@0 | 2115 | |
michael@0 | 2116 | const UChar* ucol_tok_getRulesFromBundle( |
michael@0 | 2117 | void* /*context*/, |
michael@0 | 2118 | const char* locale, |
michael@0 | 2119 | const char* type, |
michael@0 | 2120 | int32_t* pLength, |
michael@0 | 2121 | UErrorCode* status) |
michael@0 | 2122 | { |
michael@0 | 2123 | const UChar* rules = NULL; |
michael@0 | 2124 | UResourceBundle* bundle; |
michael@0 | 2125 | UResourceBundle* collations; |
michael@0 | 2126 | UResourceBundle* collation; |
michael@0 | 2127 | |
michael@0 | 2128 | *pLength = 0; |
michael@0 | 2129 | |
michael@0 | 2130 | bundle = ures_open(U_ICUDATA_COLL, locale, status); |
michael@0 | 2131 | if(U_SUCCESS(*status)){ |
michael@0 | 2132 | collations = ures_getByKey(bundle, "collations", NULL, status); |
michael@0 | 2133 | if(U_SUCCESS(*status)){ |
michael@0 | 2134 | collation = ures_getByKey(collations, type, NULL, status); |
michael@0 | 2135 | if(U_SUCCESS(*status)){ |
michael@0 | 2136 | rules = ures_getStringByKey(collation, "Sequence", pLength, status); |
michael@0 | 2137 | if(U_FAILURE(*status)){ |
michael@0 | 2138 | *pLength = 0; |
michael@0 | 2139 | rules = NULL; |
michael@0 | 2140 | } |
michael@0 | 2141 | ures_close(collation); |
michael@0 | 2142 | } |
michael@0 | 2143 | ures_close(collations); |
michael@0 | 2144 | } |
michael@0 | 2145 | } |
michael@0 | 2146 | |
michael@0 | 2147 | ures_close(bundle); |
michael@0 | 2148 | |
michael@0 | 2149 | return rules; |
michael@0 | 2150 | } |
michael@0 | 2151 | |
michael@0 | 2152 | void ucol_tok_initTokenList( |
michael@0 | 2153 | UColTokenParser *src, |
michael@0 | 2154 | const UChar *rules, |
michael@0 | 2155 | uint32_t rulesLength, |
michael@0 | 2156 | const UCollator *UCA, |
michael@0 | 2157 | GetCollationRulesFunction importFunc, |
michael@0 | 2158 | void* context, |
michael@0 | 2159 | UErrorCode *status) { |
michael@0 | 2160 | U_NAMESPACE_USE |
michael@0 | 2161 | |
michael@0 | 2162 | uint32_t nSize = 0; |
michael@0 | 2163 | uint32_t estimatedSize = (2*rulesLength+UCOL_TOK_EXTRA_RULE_SPACE_SIZE); |
michael@0 | 2164 | |
michael@0 | 2165 | bool needToDeallocRules = false; |
michael@0 | 2166 | |
michael@0 | 2167 | if(U_FAILURE(*status)) { |
michael@0 | 2168 | return; |
michael@0 | 2169 | } |
michael@0 | 2170 | |
michael@0 | 2171 | // set everything to zero, so that we can clean up gracefully |
michael@0 | 2172 | uprv_memset(src, 0, sizeof(UColTokenParser)); |
michael@0 | 2173 | |
michael@0 | 2174 | // first we need to find options that don't like to be normalized, |
michael@0 | 2175 | // like copy and remove... |
michael@0 | 2176 | //const UChar *openBrace = rules; |
michael@0 | 2177 | int32_t optionNumber = -1; |
michael@0 | 2178 | const UChar *setStart = NULL; |
michael@0 | 2179 | uint32_t i = 0; |
michael@0 | 2180 | while(i < rulesLength) { |
michael@0 | 2181 | if(rules[i] == 0x005B) { // '[': start of an option |
michael@0 | 2182 | /* Gets the following: |
michael@0 | 2183 | optionNumber: The index of the option. |
michael@0 | 2184 | setStart: The pointer at which the option arguments start. |
michael@0 | 2185 | */ |
michael@0 | 2186 | optionNumber = ucol_uprv_tok_readOption(rules+i+1, rules+rulesLength, &setStart); |
michael@0 | 2187 | |
michael@0 | 2188 | if(optionNumber == OPTION_OPTIMIZE) { /* copy - parts of UCA to tailoring */ |
michael@0 | 2189 | // [optimize] |
michael@0 | 2190 | USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rules+rulesLength, status); |
michael@0 | 2191 | if(U_SUCCESS(*status)) { |
michael@0 | 2192 | if(src->copySet == NULL) { |
michael@0 | 2193 | src->copySet = newSet; |
michael@0 | 2194 | } else { |
michael@0 | 2195 | uset_addAll(src->copySet, newSet); |
michael@0 | 2196 | uset_close(newSet); |
michael@0 | 2197 | } |
michael@0 | 2198 | } else { |
michael@0 | 2199 | return; |
michael@0 | 2200 | } |
michael@0 | 2201 | } else if(optionNumber == OPTION_SUPPRESS_CONTRACTIONS) { |
michael@0 | 2202 | USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rules+rulesLength, status); |
michael@0 | 2203 | if(U_SUCCESS(*status)) { |
michael@0 | 2204 | if(src->removeSet == NULL) { |
michael@0 | 2205 | src->removeSet = newSet; |
michael@0 | 2206 | } else { |
michael@0 | 2207 | uset_addAll(src->removeSet, newSet); |
michael@0 | 2208 | uset_close(newSet); |
michael@0 | 2209 | } |
michael@0 | 2210 | } else { |
michael@0 | 2211 | return; |
michael@0 | 2212 | } |
michael@0 | 2213 | } else if(optionNumber == OPTION_IMPORT){ |
michael@0 | 2214 | // [import <collation-name>] |
michael@0 | 2215 | |
michael@0 | 2216 | // Find the address of the closing ]. |
michael@0 | 2217 | UChar* import_end = u_strchr(setStart, 0x005D); |
michael@0 | 2218 | int32_t optionEndOffset = (int32_t)(import_end + 1 - rules); |
michael@0 | 2219 | // Ignore trailing whitespace. |
michael@0 | 2220 | while(PatternProps::isWhiteSpace(*(import_end-1))) { |
michael@0 | 2221 | --import_end; |
michael@0 | 2222 | } |
michael@0 | 2223 | |
michael@0 | 2224 | int32_t optionLength = (int32_t)(import_end - setStart); |
michael@0 | 2225 | char option[50]; |
michael@0 | 2226 | if(optionLength >= (int32_t)sizeof(option)) { |
michael@0 | 2227 | *status = U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 2228 | return; |
michael@0 | 2229 | } |
michael@0 | 2230 | u_UCharsToChars(setStart, option, optionLength); |
michael@0 | 2231 | option[optionLength] = 0; |
michael@0 | 2232 | |
michael@0 | 2233 | *status = U_ZERO_ERROR; |
michael@0 | 2234 | char locale[50]; |
michael@0 | 2235 | int32_t templ; |
michael@0 | 2236 | uloc_forLanguageTag(option, locale, (int32_t)sizeof(locale), &templ, status); |
michael@0 | 2237 | if(U_FAILURE(*status)) { |
michael@0 | 2238 | *status = U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 2239 | return; |
michael@0 | 2240 | } |
michael@0 | 2241 | |
michael@0 | 2242 | char type[50]; |
michael@0 | 2243 | if (uloc_getKeywordValue(locale, "collation", type, (int32_t)sizeof(type), status) <= 0 || |
michael@0 | 2244 | U_FAILURE(*status) |
michael@0 | 2245 | ) { |
michael@0 | 2246 | *status = U_ZERO_ERROR; |
michael@0 | 2247 | uprv_strcpy(type, "standard"); |
michael@0 | 2248 | } |
michael@0 | 2249 | |
michael@0 | 2250 | // TODO: Use public functions when available, see ticket #8134. |
michael@0 | 2251 | char *keywords = (char *)locale_getKeywordsStart(locale); |
michael@0 | 2252 | if(keywords != NULL) { |
michael@0 | 2253 | *keywords = 0; |
michael@0 | 2254 | } |
michael@0 | 2255 | |
michael@0 | 2256 | int32_t importRulesLength = 0; |
michael@0 | 2257 | const UChar* importRules = importFunc(context, locale, type, &importRulesLength, status); |
michael@0 | 2258 | |
michael@0 | 2259 | #ifdef DEBUG_FOR_COLL_RULES |
michael@0 | 2260 | std::string s; |
michael@0 | 2261 | UnicodeString(importRules).toUTF8String(s); |
michael@0 | 2262 | std::cout << "Import rules = " << s << std::endl; |
michael@0 | 2263 | #endif |
michael@0 | 2264 | |
michael@0 | 2265 | // Add the length of the imported rules to length of the original rules, |
michael@0 | 2266 | // and subtract the length of the import option. |
michael@0 | 2267 | uint32_t newRulesLength = rulesLength + importRulesLength - (optionEndOffset - i); |
michael@0 | 2268 | |
michael@0 | 2269 | UChar* newRules = (UChar*)uprv_malloc(newRulesLength*sizeof(UChar)); |
michael@0 | 2270 | |
michael@0 | 2271 | #ifdef DEBUG_FOR_COLL_RULES |
michael@0 | 2272 | std::string s1; |
michael@0 | 2273 | UnicodeString(rules).toUTF8String(s1); |
michael@0 | 2274 | std::cout << "Original rules = " << s1 << std::endl; |
michael@0 | 2275 | #endif |
michael@0 | 2276 | |
michael@0 | 2277 | |
michael@0 | 2278 | // Copy the section of the original rules leading up to the import |
michael@0 | 2279 | uprv_memcpy(newRules, rules, i*sizeof(UChar)); |
michael@0 | 2280 | // Copy the imported rules |
michael@0 | 2281 | uprv_memcpy(newRules+i, importRules, importRulesLength*sizeof(UChar)); |
michael@0 | 2282 | // Copy the rest of the original rules (minus the import option itself) |
michael@0 | 2283 | uprv_memcpy(newRules+i+importRulesLength, |
michael@0 | 2284 | rules+optionEndOffset, |
michael@0 | 2285 | (rulesLength-optionEndOffset)*sizeof(UChar)); |
michael@0 | 2286 | |
michael@0 | 2287 | #ifdef DEBUG_FOR_COLL_RULES |
michael@0 | 2288 | std::string s2; |
michael@0 | 2289 | UnicodeString(newRules).toUTF8String(s2); |
michael@0 | 2290 | std::cout << "Resulting rules = " << s2 << std::endl; |
michael@0 | 2291 | #endif |
michael@0 | 2292 | |
michael@0 | 2293 | if(needToDeallocRules){ |
michael@0 | 2294 | // if needToDeallocRules is set, then we allocated rules, so it's safe to cast and free |
michael@0 | 2295 | uprv_free((void*)rules); |
michael@0 | 2296 | } |
michael@0 | 2297 | needToDeallocRules = true; |
michael@0 | 2298 | rules = newRules; |
michael@0 | 2299 | rulesLength = newRulesLength; |
michael@0 | 2300 | |
michael@0 | 2301 | estimatedSize += importRulesLength*2; |
michael@0 | 2302 | |
michael@0 | 2303 | // First character of the new rules needs to be processed |
michael@0 | 2304 | i--; |
michael@0 | 2305 | } |
michael@0 | 2306 | } |
michael@0 | 2307 | //openBrace++; |
michael@0 | 2308 | i++; |
michael@0 | 2309 | } |
michael@0 | 2310 | |
michael@0 | 2311 | src->source = (UChar *)uprv_malloc(estimatedSize*sizeof(UChar)); |
michael@0 | 2312 | /* test for NULL */ |
michael@0 | 2313 | if (src->source == NULL) { |
michael@0 | 2314 | *status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 2315 | return; |
michael@0 | 2316 | } |
michael@0 | 2317 | uprv_memset(src->source, 0, estimatedSize*sizeof(UChar)); |
michael@0 | 2318 | nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, estimatedSize, status); |
michael@0 | 2319 | if(nSize > estimatedSize || *status == U_BUFFER_OVERFLOW_ERROR) { |
michael@0 | 2320 | *status = U_ZERO_ERROR; |
michael@0 | 2321 | src->source = (UChar *)uprv_realloc(src->source, (nSize+UCOL_TOK_EXTRA_RULE_SPACE_SIZE)*sizeof(UChar)); |
michael@0 | 2322 | /* test for NULL */ |
michael@0 | 2323 | if (src->source == NULL) { |
michael@0 | 2324 | *status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 2325 | return; |
michael@0 | 2326 | } |
michael@0 | 2327 | nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, nSize+UCOL_TOK_EXTRA_RULE_SPACE_SIZE, status); |
michael@0 | 2328 | } |
michael@0 | 2329 | if(needToDeallocRules){ |
michael@0 | 2330 | // if needToDeallocRules is set, then we allocated rules, so it's safe to cast and free |
michael@0 | 2331 | uprv_free((void*)rules); |
michael@0 | 2332 | } |
michael@0 | 2333 | |
michael@0 | 2334 | |
michael@0 | 2335 | src->current = src->source; |
michael@0 | 2336 | src->end = src->source+nSize; |
michael@0 | 2337 | src->sourceCurrent = src->source; |
michael@0 | 2338 | src->extraCurrent = src->end+1; // Preserve terminating zero in the rule string so that option scanning works correctly |
michael@0 | 2339 | src->extraEnd = src->source+estimatedSize; //src->end+UCOL_TOK_EXTRA_RULE_SPACE_SIZE; |
michael@0 | 2340 | src->varTop = NULL; |
michael@0 | 2341 | src->UCA = UCA; |
michael@0 | 2342 | src->invUCA = ucol_initInverseUCA(status); |
michael@0 | 2343 | src->parsedToken.charsLen = 0; |
michael@0 | 2344 | src->parsedToken.charsOffset = 0; |
michael@0 | 2345 | src->parsedToken.extensionLen = 0; |
michael@0 | 2346 | src->parsedToken.extensionOffset = 0; |
michael@0 | 2347 | src->parsedToken.prefixLen = 0; |
michael@0 | 2348 | src->parsedToken.prefixOffset = 0; |
michael@0 | 2349 | src->parsedToken.flags = 0; |
michael@0 | 2350 | src->parsedToken.strength = UCOL_TOK_UNSET; |
michael@0 | 2351 | src->buildCCTabFlag = FALSE; |
michael@0 | 2352 | src->isStarred = FALSE; |
michael@0 | 2353 | src->inRange = FALSE; |
michael@0 | 2354 | src->lastRangeCp = 0; |
michael@0 | 2355 | src->previousCp = 0; |
michael@0 | 2356 | |
michael@0 | 2357 | if(U_FAILURE(*status)) { |
michael@0 | 2358 | return; |
michael@0 | 2359 | } |
michael@0 | 2360 | src->tailored = uhash_open(uhash_hashTokens, uhash_compareTokens, NULL, status); |
michael@0 | 2361 | if(U_FAILURE(*status)) { |
michael@0 | 2362 | return; |
michael@0 | 2363 | } |
michael@0 | 2364 | uhash_setValueDeleter(src->tailored, uprv_free); |
michael@0 | 2365 | |
michael@0 | 2366 | src->opts = (UColOptionSet *)uprv_malloc(sizeof(UColOptionSet)); |
michael@0 | 2367 | /* test for NULL */ |
michael@0 | 2368 | if (src->opts == NULL) { |
michael@0 | 2369 | *status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 2370 | return; |
michael@0 | 2371 | } |
michael@0 | 2372 | |
michael@0 | 2373 | uprv_memcpy(src->opts, UCA->options, sizeof(UColOptionSet)); |
michael@0 | 2374 | |
michael@0 | 2375 | src->lh = 0; |
michael@0 | 2376 | src->listCapacity = 1024; |
michael@0 | 2377 | src->lh = (UColTokListHeader *)uprv_malloc(src->listCapacity*sizeof(UColTokListHeader)); |
michael@0 | 2378 | //Test for NULL |
michael@0 | 2379 | if (src->lh == NULL) { |
michael@0 | 2380 | *status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 2381 | return; |
michael@0 | 2382 | } |
michael@0 | 2383 | uprv_memset(src->lh, 0, src->listCapacity*sizeof(UColTokListHeader)); |
michael@0 | 2384 | src->resultLen = 0; |
michael@0 | 2385 | |
michael@0 | 2386 | UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts); |
michael@0 | 2387 | |
michael@0 | 2388 | // UCOL_RESET_TOP_VALUE |
michael@0 | 2389 | setIndirectBoundaries(0, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IMPLICIT); |
michael@0 | 2390 | // UCOL_FIRST_PRIMARY_IGNORABLE |
michael@0 | 2391 | setIndirectBoundaries(1, consts->UCA_FIRST_PRIMARY_IGNORABLE, 0); |
michael@0 | 2392 | // UCOL_LAST_PRIMARY_IGNORABLE |
michael@0 | 2393 | setIndirectBoundaries(2, consts->UCA_LAST_PRIMARY_IGNORABLE, 0); |
michael@0 | 2394 | // UCOL_FIRST_SECONDARY_IGNORABLE |
michael@0 | 2395 | setIndirectBoundaries(3, consts->UCA_FIRST_SECONDARY_IGNORABLE, 0); |
michael@0 | 2396 | // UCOL_LAST_SECONDARY_IGNORABLE |
michael@0 | 2397 | setIndirectBoundaries(4, consts->UCA_LAST_SECONDARY_IGNORABLE, 0); |
michael@0 | 2398 | // UCOL_FIRST_TERTIARY_IGNORABLE |
michael@0 | 2399 | setIndirectBoundaries(5, consts->UCA_FIRST_TERTIARY_IGNORABLE, 0); |
michael@0 | 2400 | // UCOL_LAST_TERTIARY_IGNORABLE |
michael@0 | 2401 | setIndirectBoundaries(6, consts->UCA_LAST_TERTIARY_IGNORABLE, 0); |
michael@0 | 2402 | // UCOL_FIRST_VARIABLE |
michael@0 | 2403 | setIndirectBoundaries(7, consts->UCA_FIRST_VARIABLE, 0); |
michael@0 | 2404 | // UCOL_LAST_VARIABLE |
michael@0 | 2405 | setIndirectBoundaries(8, consts->UCA_LAST_VARIABLE, 0); |
michael@0 | 2406 | // UCOL_FIRST_NON_VARIABLE |
michael@0 | 2407 | setIndirectBoundaries(9, consts->UCA_FIRST_NON_VARIABLE, 0); |
michael@0 | 2408 | // UCOL_LAST_NON_VARIABLE |
michael@0 | 2409 | setIndirectBoundaries(10, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IMPLICIT); |
michael@0 | 2410 | // UCOL_FIRST_IMPLICIT |
michael@0 | 2411 | setIndirectBoundaries(11, consts->UCA_FIRST_IMPLICIT, 0); |
michael@0 | 2412 | // UCOL_LAST_IMPLICIT |
michael@0 | 2413 | setIndirectBoundaries(12, consts->UCA_LAST_IMPLICIT, consts->UCA_FIRST_TRAILING); |
michael@0 | 2414 | // UCOL_FIRST_TRAILING |
michael@0 | 2415 | setIndirectBoundaries(13, consts->UCA_FIRST_TRAILING, 0); |
michael@0 | 2416 | // UCOL_LAST_TRAILING |
michael@0 | 2417 | setIndirectBoundaries(14, consts->UCA_LAST_TRAILING, 0); |
michael@0 | 2418 | ucolIndirectBoundaries[14].limitCE = (consts->UCA_PRIMARY_SPECIAL_MIN<<24); |
michael@0 | 2419 | } |
michael@0 | 2420 | |
michael@0 | 2421 | |
michael@0 | 2422 | void ucol_tok_closeTokenList(UColTokenParser *src) { |
michael@0 | 2423 | if(src->copySet != NULL) { |
michael@0 | 2424 | uset_close(src->copySet); |
michael@0 | 2425 | } |
michael@0 | 2426 | if(src->removeSet != NULL) { |
michael@0 | 2427 | uset_close(src->removeSet); |
michael@0 | 2428 | } |
michael@0 | 2429 | if(src->tailored != NULL) { |
michael@0 | 2430 | uhash_close(src->tailored); |
michael@0 | 2431 | } |
michael@0 | 2432 | if(src->lh != NULL) { |
michael@0 | 2433 | uprv_free(src->lh); |
michael@0 | 2434 | } |
michael@0 | 2435 | if(src->source != NULL) { |
michael@0 | 2436 | uprv_free(src->source); |
michael@0 | 2437 | } |
michael@0 | 2438 | if(src->opts != NULL) { |
michael@0 | 2439 | uprv_free(src->opts); |
michael@0 | 2440 | } |
michael@0 | 2441 | if (src->reorderCodes != NULL) { |
michael@0 | 2442 | uprv_free(src->reorderCodes); |
michael@0 | 2443 | } |
michael@0 | 2444 | } |
michael@0 | 2445 | |
michael@0 | 2446 | #endif /* #if !UCONFIG_NO_COLLATION */ |