|
1 /* |
|
2 ******************************************************************************* |
|
3 * |
|
4 * Copyright (C) 2001-2012, International Business Machines |
|
5 * Corporation and others. All Rights Reserved. |
|
6 * |
|
7 ******************************************************************************* |
|
8 * file name: ucol_tok.cpp |
|
9 * encoding: US-ASCII |
|
10 * tab size: 8 (not used) |
|
11 * indentation:4 |
|
12 * |
|
13 * created 02/22/2001 |
|
14 * created by: Vladimir Weinstein |
|
15 * |
|
16 * This module reads a tailoring rule string and produces a list of |
|
17 * tokens that will be turned into collation elements |
|
18 * |
|
19 */ |
|
20 |
|
21 #include "unicode/utypes.h" |
|
22 |
|
23 #if !UCONFIG_NO_COLLATION |
|
24 |
|
25 #include "unicode/uscript.h" |
|
26 #include "unicode/ustring.h" |
|
27 #include "unicode/uchar.h" |
|
28 #include "unicode/uniset.h" |
|
29 |
|
30 #include "cmemory.h" |
|
31 #include "cstring.h" |
|
32 #include "patternprops.h" |
|
33 #include "ucol_bld.h" |
|
34 #include "ucol_tok.h" |
|
35 #include "ulocimp.h" |
|
36 #include "uresimp.h" |
|
37 |
|
38 // Define this only for debugging. |
|
39 // #define DEBUG_FOR_COLL_RULES 1 |
|
40 |
|
41 #ifdef DEBUG_FOR_COLL_RULES |
|
42 #include <iostream> |
|
43 #endif |
|
44 |
|
45 U_NAMESPACE_USE |
|
46 |
|
47 U_CDECL_BEGIN |
|
48 static int32_t U_CALLCONV |
|
49 uhash_hashTokens(const UHashTok k) |
|
50 { |
|
51 int32_t hash = 0; |
|
52 //uint32_t key = (uint32_t)k.integer; |
|
53 UColToken *key = (UColToken *)k.pointer; |
|
54 if (key != 0) { |
|
55 int32_t len = (key->source & 0xFF000000)>>24; |
|
56 int32_t inc = ((len - 32) / 32) + 1; |
|
57 |
|
58 const UChar *p = (key->source & 0x00FFFFFF) + *(key->rulesToParseHdl); |
|
59 const UChar *limit = p + len; |
|
60 |
|
61 while (p<limit) { |
|
62 hash = (hash * 37) + *p; |
|
63 p += inc; |
|
64 } |
|
65 } |
|
66 return hash; |
|
67 } |
|
68 |
|
69 static UBool U_CALLCONV |
|
70 uhash_compareTokens(const UHashTok key1, const UHashTok key2) |
|
71 { |
|
72 //uint32_t p1 = (uint32_t) key1.integer; |
|
73 //uint32_t p2 = (uint32_t) key2.integer; |
|
74 UColToken *p1 = (UColToken *)key1.pointer; |
|
75 UColToken *p2 = (UColToken *)key2.pointer; |
|
76 const UChar *s1 = (p1->source & 0x00FFFFFF) + *(p1->rulesToParseHdl); |
|
77 const UChar *s2 = (p2->source & 0x00FFFFFF) + *(p2->rulesToParseHdl); |
|
78 uint32_t s1L = ((p1->source & 0xFF000000) >> 24); |
|
79 uint32_t s2L = ((p2->source & 0xFF000000) >> 24); |
|
80 const UChar *end = s1+s1L-1; |
|
81 |
|
82 if (p1 == p2) { |
|
83 return TRUE; |
|
84 } |
|
85 if (p1->source == 0 || p2->source == 0) { |
|
86 return FALSE; |
|
87 } |
|
88 if(s1L != s2L) { |
|
89 return FALSE; |
|
90 } |
|
91 if(p1->source == p2->source) { |
|
92 return TRUE; |
|
93 } |
|
94 while((s1 < end) && *s1 == *s2) { |
|
95 ++s1; |
|
96 ++s2; |
|
97 } |
|
98 if(*s1 == *s2) { |
|
99 return TRUE; |
|
100 } else { |
|
101 return FALSE; |
|
102 } |
|
103 } |
|
104 U_CDECL_END |
|
105 |
|
106 /* |
|
107 * Debug messages used to pinpoint where a format error occurred. |
|
108 * A better way is to include context-sensitive information in syntaxError() function. |
|
109 * |
|
110 * To turn this debugging on, either uncomment the following line, or pass -DDEBUG_FOR_FORMAT_ERROR |
|
111 * in the compile line. |
|
112 */ |
|
113 /* #define DEBUG_FOR_FORMAT_ERROR 1 */ |
|
114 |
|
115 #ifdef DEBUG_FOR_FORMAT_ERROR |
|
116 #define DBG_FORMAT_ERROR { printf("U_INVALID_FORMAT_ERROR at line %d", __LINE__);} |
|
117 #else |
|
118 #define DBG_FORMAT_ERROR |
|
119 #endif |
|
120 |
|
121 |
|
122 /* |
|
123 * Controls debug messages so that the output can be compared before and after a |
|
124 * big change. Prints the information of every code point that comes out of the |
|
125 * collation parser and its strength into a file. When a big change in format |
|
126 * happens, the files before and after the change should be identical. |
|
127 * |
|
128 * To turn this debugging on, either uncomment the following line, or pass -DDEBUG_FOR_CODE_POINTS |
|
129 * in the compile line. |
|
130 */ |
|
131 // #define DEBUG_FOR_CODE_POINTS 1 |
|
132 |
|
133 #ifdef DEBUG_FOR_CODE_POINTS |
|
134 FILE* dfcp_fp = NULL; |
|
135 #endif |
|
136 |
|
137 |
|
/* One entry of the indirect-positioning table: a start CE and a limit CE, */
/* each stored together with its continuation CE.                          */
typedef struct {
    uint32_t startCE;      /* first word of the start value */
    uint32_t startContCE;  /* second word of the start value; 0 if unused */
    uint32_t limitCE;      /* first word of the limit value; 0 if no limit */
    uint32_t limitContCE;  /* second word of the limit value; 0 if no limit */
} indirectBoundaries;

/* These values are used for finding CE values for indirect positioning.   */
/* Indirect positioning is a mechanism for allowing resets on symbolic     */
/* values. It only works for resets and you cannot tailor indirect names.  */
/* An indirect name can define either an anchor point or a range. An       */
/* anchor point behaves in exactly the same way as a code point in reset   */
/* would, except that it cannot be tailored. A range (we currently only    */
/* know of the [top] range) will explicitly set the upper bound for        */
/* generated CEs, thus allowing for better control over how many CEs can   */
/* be squeezed into the range without performance penalty.                 */
/* In that respect, we use [top] for tailoring of locales that use CJK     */
/* characters. Other indirect values are currently a pure convenience;     */
/* they can be used to ensure that the CEs will always be positioned in    */
/* the same place relative to a point with known properties (e.g. first    */
/* primary ignorable).                                                     */
static indirectBoundaries ucolIndirectBoundaries[15];
|
160 /* |
|
161 static indirectBoundaries ucolIndirectBoundaries[11] = { |
|
162 { UCOL_RESET_TOP_VALUE, 0, |
|
163 UCOL_NEXT_TOP_VALUE, 0 }, |
|
164 { UCOL_FIRST_PRIMARY_IGNORABLE, 0, |
|
165 0, 0 }, |
|
166 { UCOL_LAST_PRIMARY_IGNORABLE, UCOL_LAST_PRIMARY_IGNORABLE_CONT, |
|
167 0, 0 }, |
|
168 { UCOL_FIRST_SECONDARY_IGNORABLE, 0, |
|
169 0, 0 }, |
|
170 { UCOL_LAST_SECONDARY_IGNORABLE, 0, |
|
171 0, 0 }, |
|
172 { UCOL_FIRST_TERTIARY_IGNORABLE, 0, |
|
173 0, 0 }, |
|
174 { UCOL_LAST_TERTIARY_IGNORABLE, 0, |
|
175 0, 0 }, |
|
176 { UCOL_FIRST_VARIABLE, 0, |
|
177 0, 0 }, |
|
178 { UCOL_LAST_VARIABLE, 0, |
|
179 0, 0 }, |
|
180 { UCOL_FIRST_NON_VARIABLE, 0, |
|
181 0, 0 }, |
|
182 { UCOL_LAST_NON_VARIABLE, 0, |
|
183 0, 0 }, |
|
184 }; |
|
185 */ |
|
186 |
|
187 static void setIndirectBoundaries(uint32_t indexR, uint32_t *start, uint32_t *end) { |
|
188 |
|
189 // Set values for the top - TODO: once we have values for all the indirects, we are going |
|
190 // to initalize here. |
|
191 ucolIndirectBoundaries[indexR].startCE = start[0]; |
|
192 ucolIndirectBoundaries[indexR].startContCE = start[1]; |
|
193 if(end) { |
|
194 ucolIndirectBoundaries[indexR].limitCE = end[0]; |
|
195 ucolIndirectBoundaries[indexR].limitContCE = end[1]; |
|
196 } else { |
|
197 ucolIndirectBoundaries[indexR].limitCE = 0; |
|
198 ucolIndirectBoundaries[indexR].limitContCE = 0; |
|
199 } |
|
200 } |
|
201 |
|
202 |
|
203 static inline |
|
204 void syntaxError(const UChar* rules, |
|
205 int32_t pos, |
|
206 int32_t rulesLen, |
|
207 UParseError* parseError) |
|
208 { |
|
209 parseError->offset = pos; |
|
210 parseError->line = 0 ; /* we are not using line numbers */ |
|
211 |
|
212 // for pre-context |
|
213 int32_t start = (pos < U_PARSE_CONTEXT_LEN)? 0 : (pos - (U_PARSE_CONTEXT_LEN-1)); |
|
214 int32_t stop = pos; |
|
215 |
|
216 u_memcpy(parseError->preContext,rules+start,stop-start); |
|
217 //null terminate the buffer |
|
218 parseError->preContext[stop-start] = 0; |
|
219 |
|
220 //for post-context |
|
221 start = pos+1; |
|
222 stop = ((pos+U_PARSE_CONTEXT_LEN)<= rulesLen )? (pos+(U_PARSE_CONTEXT_LEN-1)) : |
|
223 rulesLen; |
|
224 |
|
225 if(start < stop) { |
|
226 u_memcpy(parseError->postContext,rules+start,stop-start); |
|
227 //null terminate the buffer |
|
228 parseError->postContext[stop-start]= 0; |
|
229 } else { |
|
230 parseError->postContext[0] = 0; |
|
231 } |
|
232 } |
|
233 |
|
234 static |
|
235 void ucol_uprv_tok_setOptionInImage(UColOptionSet *opts, UColAttribute attrib, UColAttributeValue value) { |
|
236 switch(attrib) { |
|
237 case UCOL_HIRAGANA_QUATERNARY_MODE: |
|
238 opts->hiraganaQ = value; |
|
239 break; |
|
240 case UCOL_FRENCH_COLLATION: |
|
241 opts->frenchCollation = value; |
|
242 break; |
|
243 case UCOL_ALTERNATE_HANDLING: |
|
244 opts->alternateHandling = value; |
|
245 break; |
|
246 case UCOL_CASE_FIRST: |
|
247 opts->caseFirst = value; |
|
248 break; |
|
249 case UCOL_CASE_LEVEL: |
|
250 opts->caseLevel = value; |
|
251 break; |
|
252 case UCOL_NORMALIZATION_MODE: |
|
253 opts->normalizationMode = value; |
|
254 break; |
|
255 case UCOL_STRENGTH: |
|
256 opts->strength = value; |
|
257 break; |
|
258 case UCOL_NUMERIC_COLLATION: |
|
259 opts->numericCollation = value; |
|
260 break; |
|
261 case UCOL_ATTRIBUTE_COUNT: |
|
262 default: |
|
263 break; |
|
264 } |
|
265 } |
|
266 |
|
267 #define UTOK_OPTION_COUNT 22 |
|
268 |
|
269 static UBool didInit = FALSE; |
|
270 /* we can be strict, or we can be lenient */ |
|
271 /* I'd surely be lenient with the option arguments */ |
|
272 /* maybe even with options */ |
|
273 U_STRING_DECL(suboption_00, "non-ignorable", 13); |
|
274 U_STRING_DECL(suboption_01, "shifted", 7); |
|
275 |
|
276 U_STRING_DECL(suboption_02, "lower", 5); |
|
277 U_STRING_DECL(suboption_03, "upper", 5); |
|
278 U_STRING_DECL(suboption_04, "off", 3); |
|
279 U_STRING_DECL(suboption_05, "on", 2); |
|
280 U_STRING_DECL(suboption_06, "1", 1); |
|
281 U_STRING_DECL(suboption_07, "2", 1); |
|
282 U_STRING_DECL(suboption_08, "3", 1); |
|
283 U_STRING_DECL(suboption_09, "4", 1); |
|
284 U_STRING_DECL(suboption_10, "I", 1); |
|
285 |
|
286 U_STRING_DECL(suboption_11, "primary", 7); |
|
287 U_STRING_DECL(suboption_12, "secondary", 9); |
|
288 U_STRING_DECL(suboption_13, "tertiary", 8); |
|
289 U_STRING_DECL(suboption_14, "variable", 8); |
|
290 U_STRING_DECL(suboption_15, "regular", 7); |
|
291 U_STRING_DECL(suboption_16, "implicit", 8); |
|
292 U_STRING_DECL(suboption_17, "trailing", 8); |
|
293 |
|
294 |
|
295 U_STRING_DECL(option_00, "undefined", 9); |
|
296 U_STRING_DECL(option_01, "rearrange", 9); |
|
297 U_STRING_DECL(option_02, "alternate", 9); |
|
298 U_STRING_DECL(option_03, "backwards", 9); |
|
299 U_STRING_DECL(option_04, "variable top", 12); |
|
300 U_STRING_DECL(option_05, "top", 3); |
|
301 U_STRING_DECL(option_06, "normalization", 13); |
|
302 U_STRING_DECL(option_07, "caseLevel", 9); |
|
303 U_STRING_DECL(option_08, "caseFirst", 9); |
|
304 U_STRING_DECL(option_09, "scriptOrder", 11); |
|
305 U_STRING_DECL(option_10, "charsetname", 11); |
|
306 U_STRING_DECL(option_11, "charset", 7); |
|
307 U_STRING_DECL(option_12, "before", 6); |
|
308 U_STRING_DECL(option_13, "hiraganaQ", 9); |
|
309 U_STRING_DECL(option_14, "strength", 8); |
|
310 U_STRING_DECL(option_15, "first", 5); |
|
311 U_STRING_DECL(option_16, "last", 4); |
|
312 U_STRING_DECL(option_17, "optimize", 8); |
|
313 U_STRING_DECL(option_18, "suppressContractions", 20); |
|
314 U_STRING_DECL(option_19, "numericOrdering", 15); |
|
315 U_STRING_DECL(option_20, "import", 6); |
|
316 U_STRING_DECL(option_21, "reorder", 7); |
|
317 |
|
318 /* |
|
319 [last variable] last variable value |
|
320 [last primary ignorable] largest CE for primary ignorable |
|
321 [last secondary ignorable] largest CE for secondary ignorable |
|
322 [last tertiary ignorable] largest CE for tertiary ignorable |
|
323 [top] guaranteed to be above all implicit CEs, for now and in the future (in 1.8) |
|
324 */ |
|
325 |
|
326 |
|
327 static const ucolTokSuboption alternateSub[2] = { |
|
328 {suboption_00, 13, UCOL_NON_IGNORABLE}, |
|
329 {suboption_01, 7, UCOL_SHIFTED} |
|
330 }; |
|
331 |
|
332 static const ucolTokSuboption caseFirstSub[3] = { |
|
333 {suboption_02, 5, UCOL_LOWER_FIRST}, |
|
334 {suboption_03, 5, UCOL_UPPER_FIRST}, |
|
335 {suboption_04, 3, UCOL_OFF}, |
|
336 }; |
|
337 |
|
338 static const ucolTokSuboption onOffSub[2] = { |
|
339 {suboption_04, 3, UCOL_OFF}, |
|
340 {suboption_05, 2, UCOL_ON} |
|
341 }; |
|
342 |
|
343 static const ucolTokSuboption frenchSub[1] = { |
|
344 {suboption_07, 1, UCOL_ON} |
|
345 }; |
|
346 |
|
347 static const ucolTokSuboption beforeSub[3] = { |
|
348 {suboption_06, 1, UCOL_PRIMARY}, |
|
349 {suboption_07, 1, UCOL_SECONDARY}, |
|
350 {suboption_08, 1, UCOL_TERTIARY} |
|
351 }; |
|
352 |
|
353 static const ucolTokSuboption strengthSub[5] = { |
|
354 {suboption_06, 1, UCOL_PRIMARY}, |
|
355 {suboption_07, 1, UCOL_SECONDARY}, |
|
356 {suboption_08, 1, UCOL_TERTIARY}, |
|
357 {suboption_09, 1, UCOL_QUATERNARY}, |
|
358 {suboption_10, 1, UCOL_IDENTICAL}, |
|
359 }; |
|
360 |
|
361 static const ucolTokSuboption firstLastSub[7] = { |
|
362 {suboption_11, 7, UCOL_PRIMARY}, |
|
363 {suboption_12, 9, UCOL_PRIMARY}, |
|
364 {suboption_13, 8, UCOL_PRIMARY}, |
|
365 {suboption_14, 8, UCOL_PRIMARY}, |
|
366 {suboption_15, 7, UCOL_PRIMARY}, |
|
367 {suboption_16, 8, UCOL_PRIMARY}, |
|
368 {suboption_17, 8, UCOL_PRIMARY}, |
|
369 }; |
|
370 |
|
/*
 * Indices into the rulesOptions table.  The enumerators must stay in the same
 * order as the table entries.
 */
enum OptionNumber {
    /* Options that map directly onto a collator attribute. */
    OPTION_ALTERNATE_HANDLING = 0,
    OPTION_FRENCH_COLLATION,
    OPTION_CASE_LEVEL,
    OPTION_CASE_FIRST,
    OPTION_NORMALIZATION_MODE,
    OPTION_HIRAGANA_QUATERNARY,
    OPTION_STRENGTH,
    OPTION_NUMERIC_COLLATION,
    OPTION_NORMAL_OPTIONS_LIMIT = OPTION_NUMERIC_COLLATION,

    /* Remaining options. */
    OPTION_VARIABLE_TOP,
    OPTION_REARRANGE,
    OPTION_BEFORE,
    OPTION_TOP,
    OPTION_FIRST,
    OPTION_LAST,
    OPTION_OPTIMIZE,
    OPTION_SUPPRESS_CONTRACTIONS,
    OPTION_UNDEFINED,
    OPTION_SCRIPT_ORDER,
    OPTION_CHARSET_NAME,
    OPTION_CHARSET,
    OPTION_IMPORT,
    OPTION_SCRIPTREORDER
} ;
|
396 |
|
397 static const ucolTokOption rulesOptions[UTOK_OPTION_COUNT] = { |
|
398 /*00*/ {option_02, 9, alternateSub, 2, UCOL_ALTERNATE_HANDLING}, /*"alternate" */ |
|
399 /*01*/ {option_03, 9, frenchSub, 1, UCOL_FRENCH_COLLATION}, /*"backwards" */ |
|
400 /*02*/ {option_07, 9, onOffSub, 2, UCOL_CASE_LEVEL}, /*"caseLevel" */ |
|
401 /*03*/ {option_08, 9, caseFirstSub, 3, UCOL_CASE_FIRST}, /*"caseFirst" */ |
|
402 /*04*/ {option_06, 13, onOffSub, 2, UCOL_NORMALIZATION_MODE}, /*"normalization" */ |
|
403 /*05*/ {option_13, 9, onOffSub, 2, UCOL_HIRAGANA_QUATERNARY_MODE}, /*"hiraganaQ" */ |
|
404 /*06*/ {option_14, 8, strengthSub, 5, UCOL_STRENGTH}, /*"strength" */ |
|
405 /*07*/ {option_19, 15, onOffSub, 2, UCOL_NUMERIC_COLLATION}, /*"numericOrdering"*/ |
|
406 /*08*/ {option_04, 12, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"variable top" */ |
|
407 /*09*/ {option_01, 9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"rearrange" */ |
|
408 /*10*/ {option_12, 6, beforeSub, 3, UCOL_ATTRIBUTE_COUNT}, /*"before" */ |
|
409 /*11*/ {option_05, 3, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"top" */ |
|
410 /*12*/ {option_15, 5, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"first" */ |
|
411 /*13*/ {option_16, 4, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"last" */ |
|
412 /*14*/ {option_17, 8, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"optimize" */ |
|
413 /*15*/ {option_18, 20, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"suppressContractions" */ |
|
414 /*16*/ {option_00, 9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"undefined" */ |
|
415 /*17*/ {option_09, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"scriptOrder" */ |
|
416 /*18*/ {option_10, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"charsetname" */ |
|
417 /*19*/ {option_11, 7, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"charset" */ |
|
418 /*20*/ {option_20, 6, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"import" */ |
|
419 /*21*/ {option_21, 7, NULL, 0, UCOL_ATTRIBUTE_COUNT} /*"reorder" */ |
|
420 }; |
|
421 |
|
422 static |
|
423 int32_t u_strncmpNoCase(const UChar *s1, |
|
424 const UChar *s2, |
|
425 int32_t n) |
|
426 { |
|
427 if(n > 0) { |
|
428 int32_t rc; |
|
429 for(;;) { |
|
430 rc = (int32_t)u_tolower(*s1) - (int32_t)u_tolower(*s2); |
|
431 if(rc != 0 || *s1 == 0 || --n == 0) { |
|
432 return rc; |
|
433 } |
|
434 ++s1; |
|
435 ++s2; |
|
436 } |
|
437 } |
|
438 return 0; |
|
439 } |
|
440 |
|
441 static |
|
442 void ucol_uprv_tok_initData() { |
|
443 if(!didInit) { |
|
444 U_STRING_INIT(suboption_00, "non-ignorable", 13); |
|
445 U_STRING_INIT(suboption_01, "shifted", 7); |
|
446 |
|
447 U_STRING_INIT(suboption_02, "lower", 5); |
|
448 U_STRING_INIT(suboption_03, "upper", 5); |
|
449 U_STRING_INIT(suboption_04, "off", 3); |
|
450 U_STRING_INIT(suboption_05, "on", 2); |
|
451 |
|
452 U_STRING_INIT(suboption_06, "1", 1); |
|
453 U_STRING_INIT(suboption_07, "2", 1); |
|
454 U_STRING_INIT(suboption_08, "3", 1); |
|
455 U_STRING_INIT(suboption_09, "4", 1); |
|
456 U_STRING_INIT(suboption_10, "I", 1); |
|
457 |
|
458 U_STRING_INIT(suboption_11, "primary", 7); |
|
459 U_STRING_INIT(suboption_12, "secondary", 9); |
|
460 U_STRING_INIT(suboption_13, "tertiary", 8); |
|
461 U_STRING_INIT(suboption_14, "variable", 8); |
|
462 U_STRING_INIT(suboption_15, "regular", 7); |
|
463 U_STRING_INIT(suboption_16, "implicit", 8); |
|
464 U_STRING_INIT(suboption_17, "trailing", 8); |
|
465 |
|
466 |
|
467 U_STRING_INIT(option_00, "undefined", 9); |
|
468 U_STRING_INIT(option_01, "rearrange", 9); |
|
469 U_STRING_INIT(option_02, "alternate", 9); |
|
470 U_STRING_INIT(option_03, "backwards", 9); |
|
471 U_STRING_INIT(option_04, "variable top", 12); |
|
472 U_STRING_INIT(option_05, "top", 3); |
|
473 U_STRING_INIT(option_06, "normalization", 13); |
|
474 U_STRING_INIT(option_07, "caseLevel", 9); |
|
475 U_STRING_INIT(option_08, "caseFirst", 9); |
|
476 U_STRING_INIT(option_09, "scriptOrder", 11); |
|
477 U_STRING_INIT(option_10, "charsetname", 11); |
|
478 U_STRING_INIT(option_11, "charset", 7); |
|
479 U_STRING_INIT(option_12, "before", 6); |
|
480 U_STRING_INIT(option_13, "hiraganaQ", 9); |
|
481 U_STRING_INIT(option_14, "strength", 8); |
|
482 U_STRING_INIT(option_15, "first", 5); |
|
483 U_STRING_INIT(option_16, "last", 4); |
|
484 U_STRING_INIT(option_17, "optimize", 8); |
|
485 U_STRING_INIT(option_18, "suppressContractions", 20); |
|
486 U_STRING_INIT(option_19, "numericOrdering", 15); |
|
487 U_STRING_INIT(option_20, "import ", 6); |
|
488 U_STRING_INIT(option_21, "reorder", 7); |
|
489 didInit = TRUE; |
|
490 } |
|
491 } |
|
492 |
|
493 |
|
494 // This function reads basic options to set in the runtime collator |
|
495 // used by data driven tests. Should not support build time options |
|
496 U_CAPI const UChar * U_EXPORT2 |
|
497 ucol_tok_getNextArgument(const UChar *start, const UChar *end, |
|
498 UColAttribute *attrib, UColAttributeValue *value, |
|
499 UErrorCode *status) |
|
500 { |
|
501 uint32_t i = 0; |
|
502 int32_t j=0; |
|
503 UBool foundOption = FALSE; |
|
504 const UChar *optionArg = NULL; |
|
505 |
|
506 ucol_uprv_tok_initData(); |
|
507 |
|
508 while(start < end && PatternProps::isWhiteSpace(*start)) { /* eat whitespace */ |
|
509 start++; |
|
510 } |
|
511 if(start >= end) { |
|
512 return NULL; |
|
513 } |
|
514 /* skip opening '[' */ |
|
515 if(*start == 0x005b) { |
|
516 start++; |
|
517 } else { |
|
518 *status = U_ILLEGAL_ARGUMENT_ERROR; // no opening '[' |
|
519 return NULL; |
|
520 } |
|
521 |
|
522 while(i < UTOK_OPTION_COUNT) { |
|
523 if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].optionLen) == 0) { |
|
524 foundOption = TRUE; |
|
525 if(end - start > rulesOptions[i].optionLen) { |
|
526 optionArg = start+rulesOptions[i].optionLen+1; /* start of the options, skip space */ |
|
527 while(PatternProps::isWhiteSpace(*optionArg)) { /* eat whitespace */ |
|
528 optionArg++; |
|
529 } |
|
530 } |
|
531 break; |
|
532 } |
|
533 i++; |
|
534 } |
|
535 |
|
536 if(!foundOption) { |
|
537 *status = U_ILLEGAL_ARGUMENT_ERROR; |
|
538 return NULL; |
|
539 } |
|
540 |
|
541 if(optionArg) { |
|
542 for(j = 0; j<rulesOptions[i].subSize; j++) { |
|
543 if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) { |
|
544 //ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr, rulesOptions[i].subopts[j].attrVal); |
|
545 *attrib = rulesOptions[i].attr; |
|
546 *value = rulesOptions[i].subopts[j].attrVal; |
|
547 optionArg += rulesOptions[i].subopts[j].subLen; |
|
548 while(PatternProps::isWhiteSpace(*optionArg)) { /* eat whitespace */ |
|
549 optionArg++; |
|
550 } |
|
551 if(*optionArg == 0x005d) { |
|
552 optionArg++; |
|
553 return optionArg; |
|
554 } else { |
|
555 *status = U_ILLEGAL_ARGUMENT_ERROR; |
|
556 return NULL; |
|
557 } |
|
558 } |
|
559 } |
|
560 } |
|
561 *status = U_ILLEGAL_ARGUMENT_ERROR; |
|
562 return NULL; |
|
563 } |
|
564 |
|
565 static |
|
566 USet *ucol_uprv_tok_readAndSetUnicodeSet(const UChar *start, const UChar *end, UErrorCode *status) { |
|
567 while(*start != 0x005b) { /* advance while we find the first '[' */ |
|
568 start++; |
|
569 } |
|
570 // now we need to get a balanced set of '[]'. The problem is that a set can have |
|
571 // many, and *end point to the first closing '[' |
|
572 int32_t noOpenBraces = 1; |
|
573 int32_t current = 1; // skip the opening brace |
|
574 while(start+current < end && noOpenBraces != 0) { |
|
575 if(start[current] == 0x005b) { |
|
576 noOpenBraces++; |
|
577 } else if(start[current] == 0x005D) { // closing brace |
|
578 noOpenBraces--; |
|
579 } |
|
580 current++; |
|
581 } |
|
582 |
|
583 if(noOpenBraces != 0 || u_strchr(start+current, 0x005d /*']'*/) == NULL) { |
|
584 *status = U_ILLEGAL_ARGUMENT_ERROR; |
|
585 return NULL; |
|
586 } |
|
587 return uset_openPattern(start, current, status); |
|
588 } |
|
589 |
|
590 /** |
|
591 * Reads an option and matches the option name with the predefined options. (Case-insensitive.) |
|
592 * @param start Pointer to the start UChar. |
|
593 * @param end Pointer to the last valid pointer beyond which the option will not extend. |
|
594 * @param optionArg Address of the pointer at which the options start (after the option name) |
|
595 * @return The index of the option, or -1 if the option is not valid. |
|
596 */ |
|
597 static |
|
598 int32_t ucol_uprv_tok_readOption(const UChar *start, const UChar *end, const UChar **optionArg) { |
|
599 int32_t i = 0; |
|
600 ucol_uprv_tok_initData(); |
|
601 |
|
602 while(PatternProps::isWhiteSpace(*start)) { /* eat whitespace */ |
|
603 start++; |
|
604 } |
|
605 while(i < UTOK_OPTION_COUNT) { |
|
606 if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].optionLen) == 0) { |
|
607 if(end - start > rulesOptions[i].optionLen) { |
|
608 *optionArg = start+rulesOptions[i].optionLen; /* End of option name; start of the options */ |
|
609 while(PatternProps::isWhiteSpace(**optionArg)) { /* eat whitespace */ |
|
610 (*optionArg)++; |
|
611 } |
|
612 } |
|
613 break; |
|
614 } |
|
615 i++; |
|
616 } |
|
617 if(i == UTOK_OPTION_COUNT) { |
|
618 i = -1; // didn't find an option |
|
619 } |
|
620 return i; |
|
621 } |
|
622 |
|
623 |
|
624 static |
|
625 void ucol_tok_parseScriptReorder(UColTokenParser *src, UErrorCode *status) { |
|
626 int32_t codeCount = 0; |
|
627 int32_t codeIndex = 0; |
|
628 char conversion[64]; |
|
629 int32_t tokenLength = 0; |
|
630 const UChar* space; |
|
631 |
|
632 const UChar* current = src->current; |
|
633 const UChar* end = u_memchr(src->current, 0x005d, src->end - src->current); |
|
634 |
|
635 // eat leading whitespace |
|
636 while(current < end && u_isWhitespace(*current)) { |
|
637 current++; |
|
638 } |
|
639 |
|
640 while(current < end) { |
|
641 space = u_memchr(current, 0x0020, end - current); |
|
642 space = space == 0 ? end : space; |
|
643 tokenLength = space - current; |
|
644 if (tokenLength < 4) { |
|
645 *status = U_INVALID_FORMAT_ERROR; |
|
646 return; |
|
647 } |
|
648 codeCount++; |
|
649 current += tokenLength; |
|
650 while(current < end && u_isWhitespace(*current)) { /* eat whitespace */ |
|
651 ++current; |
|
652 } |
|
653 } |
|
654 |
|
655 if (codeCount == 0) { |
|
656 *status = U_INVALID_FORMAT_ERROR; |
|
657 } |
|
658 |
|
659 src->reorderCodesLength = codeCount; |
|
660 src->reorderCodes = (int32_t*)uprv_malloc(codeCount * sizeof(int32_t)); |
|
661 current = src->current; |
|
662 |
|
663 // eat leading whitespace |
|
664 while(current < end && u_isWhitespace(*current)) { |
|
665 current++; |
|
666 } |
|
667 |
|
668 while(current < end) { |
|
669 space = u_memchr(current, 0x0020, end - current); |
|
670 space = space == 0 ? end : space; |
|
671 tokenLength = space - current; |
|
672 if (tokenLength < 4) { |
|
673 *status = U_ILLEGAL_ARGUMENT_ERROR; |
|
674 return; |
|
675 } else { |
|
676 u_UCharsToChars(current, conversion, tokenLength); |
|
677 conversion[tokenLength] = '\0'; |
|
678 src->reorderCodes[codeIndex] = ucol_findReorderingEntry(conversion); |
|
679 if (src->reorderCodes[codeIndex] == USCRIPT_INVALID_CODE) { |
|
680 src->reorderCodes[codeIndex] = u_getPropertyValueEnum(UCHAR_SCRIPT, conversion); |
|
681 } |
|
682 if (src->reorderCodes[codeIndex] == USCRIPT_INVALID_CODE) { |
|
683 *status = U_ILLEGAL_ARGUMENT_ERROR; |
|
684 } |
|
685 } |
|
686 codeIndex++; |
|
687 current += tokenLength; |
|
688 while(current < end && u_isWhitespace(*current)) { /* eat whitespace */ |
|
689 ++current; |
|
690 } |
|
691 } |
|
692 } |
|
693 |
|
694 // reads and conforms to various options in rules |
|
695 // end is the position of the first closing ']' |
|
696 // However, some of the options take an UnicodeSet definition |
|
697 // which needs to duplicate the closing ']' |
|
698 // for example: '[copy [\uAC00-\uD7FF]]' |
|
699 // These options will move end to the second ']' and the |
|
700 // caller will set the current to it. |
|
701 static |
|
702 uint8_t ucol_uprv_tok_readAndSetOption(UColTokenParser *src, UErrorCode *status) { |
|
703 const UChar* start = src->current; |
|
704 int32_t i = 0; |
|
705 int32_t j=0; |
|
706 const UChar *optionArg = NULL; |
|
707 |
|
708 uint8_t result = 0; |
|
709 |
|
710 start++; /*skip opening '['*/ |
|
711 i = ucol_uprv_tok_readOption(start, src->end, &optionArg); |
|
712 if(optionArg) { |
|
713 src->current = optionArg; |
|
714 } |
|
715 |
|
716 if(i < 0) { |
|
717 *status = U_ILLEGAL_ARGUMENT_ERROR; |
|
718 } else { |
|
719 int32_t noOpenBraces = 1; |
|
720 switch(i) { |
|
721 case OPTION_ALTERNATE_HANDLING: |
|
722 case OPTION_FRENCH_COLLATION: |
|
723 case OPTION_CASE_LEVEL: |
|
724 case OPTION_CASE_FIRST: |
|
725 case OPTION_NORMALIZATION_MODE: |
|
726 case OPTION_HIRAGANA_QUATERNARY: |
|
727 case OPTION_STRENGTH: |
|
728 case OPTION_NUMERIC_COLLATION: |
|
729 if(optionArg) { |
|
730 for(j = 0; j<rulesOptions[i].subSize; j++) { |
|
731 if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) { |
|
732 ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr, rulesOptions[i].subopts[j].attrVal); |
|
733 result = UCOL_TOK_SUCCESS; |
|
734 } |
|
735 } |
|
736 } |
|
737 if(result == 0) { |
|
738 *status = U_ILLEGAL_ARGUMENT_ERROR; |
|
739 } |
|
740 break; |
|
741 case OPTION_VARIABLE_TOP: |
|
742 result = UCOL_TOK_SUCCESS | UCOL_TOK_VARIABLE_TOP; |
|
743 break; |
|
744 case OPTION_REARRANGE: |
|
745 result = UCOL_TOK_SUCCESS; |
|
746 break; |
|
747 case OPTION_BEFORE: |
|
748 if(optionArg) { |
|
749 for(j = 0; j<rulesOptions[i].subSize; j++) { |
|
750 if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) { |
|
751 result = UCOL_TOK_SUCCESS | (rulesOptions[i].subopts[j].attrVal + 1); |
|
752 } |
|
753 } |
|
754 } |
|
755 if(result == 0) { |
|
756 *status = U_ILLEGAL_ARGUMENT_ERROR; |
|
757 } |
|
758 break; |
|
759 case OPTION_TOP: /* we are going to have an array with structures of limit CEs */ |
|
760 /* index to this array will be src->parsedToken.indirectIndex*/ |
|
761 src->parsedToken.indirectIndex = 0; |
|
762 result = UCOL_TOK_SUCCESS | UCOL_TOK_TOP; |
|
763 break; |
|
764 case OPTION_FIRST: |
|
765 case OPTION_LAST: /* first, last */ |
|
766 for(j = 0; j<rulesOptions[i].subSize; j++) { |
|
767 if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) { |
|
768 // the calculation below assumes that OPTION_FIRST and OPTION_LAST are at i and i+1 and that the first |
|
769 // element of indirect boundaries is reserved for top. |
|
770 src->parsedToken.indirectIndex = (uint16_t)(i-OPTION_FIRST+1+j*2); |
|
771 result = UCOL_TOK_SUCCESS | UCOL_TOK_TOP;; |
|
772 } |
|
773 } |
|
774 if(result == 0) { |
|
775 *status = U_ILLEGAL_ARGUMENT_ERROR; |
|
776 } |
|
777 break; |
|
778 case OPTION_OPTIMIZE: |
|
779 case OPTION_SUPPRESS_CONTRACTIONS: // copy and remove are handled before normalization |
|
780 // we need to move end here |
|
781 src->current++; // skip opening brace |
|
782 while(src->current < src->end && noOpenBraces != 0) { |
|
783 if(*src->current == 0x005b) { |
|
784 noOpenBraces++; |
|
785 } else if(*src->current == 0x005D) { // closing brace |
|
786 noOpenBraces--; |
|
787 } |
|
788 src->current++; |
|
789 } |
|
790 result = UCOL_TOK_SUCCESS; |
|
791 break; |
|
792 case OPTION_SCRIPTREORDER: |
|
793 ucol_tok_parseScriptReorder(src, status); |
|
794 break; |
|
795 default: |
|
796 *status = U_UNSUPPORTED_ERROR; |
|
797 break; |
|
798 } |
|
799 } |
|
800 src->current = u_memchr(src->current, 0x005d, (int32_t)(src->end-src->current)); |
|
801 return result; |
|
802 } |
|
803 |
|
804 |
|
805 inline void ucol_tok_addToExtraCurrent(UColTokenParser *src, const UChar *stuff, int32_t len, UErrorCode *status) { |
|
806 if (stuff == NULL || len <= 0) { |
|
807 return; |
|
808 } |
|
809 UnicodeString tempStuff(FALSE, stuff, len); |
|
810 if(src->extraCurrent+len >= src->extraEnd) { |
|
811 /* reallocate */ |
|
812 if (stuff >= src->source && stuff <= src->end) { |
|
813 // Copy the "stuff" contents into tempStuff's own buffer. |
|
814 // UnicodeString is copy-on-write. |
|
815 if (len > 0) { |
|
816 tempStuff.setCharAt(0, tempStuff[0]); |
|
817 } else { |
|
818 tempStuff.remove(); |
|
819 } |
|
820 } |
|
821 UChar *newSrc = (UChar *)uprv_realloc(src->source, (src->extraEnd-src->source)*2*sizeof(UChar)); |
|
822 if(newSrc != NULL) { |
|
823 src->current = newSrc + (src->current - src->source); |
|
824 src->extraCurrent = newSrc + (src->extraCurrent - src->source); |
|
825 src->end = newSrc + (src->end - src->source); |
|
826 src->extraEnd = newSrc + (src->extraEnd-src->source)*2; |
|
827 src->sourceCurrent = newSrc + (src->sourceCurrent-src->source); |
|
828 src->source = newSrc; |
|
829 } else { |
|
830 *status = U_MEMORY_ALLOCATION_ERROR; |
|
831 return; |
|
832 } |
|
833 } |
|
834 if(len == 1) { |
|
835 *src->extraCurrent++ = tempStuff[0]; |
|
836 } else { |
|
837 u_memcpy(src->extraCurrent, tempStuff.getBuffer(), len); |
|
838 src->extraCurrent += len; |
|
839 } |
|
840 } |
|
841 |
|
842 inline UBool ucol_tok_doSetTop(UColTokenParser *src, UErrorCode *status) { |
|
843 /* |
|
844 top = TRUE; |
|
845 */ |
|
846 UChar buff[5]; |
|
847 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source); |
|
848 buff[0] = 0xFFFE; |
|
849 buff[1] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE >> 16); |
|
850 buff[2] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE & 0xFFFF); |
|
851 if(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE == 0) { |
|
852 src->parsedToken.charsLen = 3; |
|
853 ucol_tok_addToExtraCurrent(src, buff, 3, status); |
|
854 } else { |
|
855 buff[3] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE >> 16); |
|
856 buff[4] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE & 0xFFFF); |
|
857 src->parsedToken.charsLen = 5; |
|
858 ucol_tok_addToExtraCurrent(src, buff, 5, status); |
|
859 } |
|
860 return TRUE; |
|
861 } |
|
862 |
|
863 static UBool isCharNewLine(UChar c){ |
|
864 switch(c){ |
|
865 case 0x000A: /* LF */ |
|
866 case 0x000D: /* CR */ |
|
867 case 0x000C: /* FF */ |
|
868 case 0x0085: /* NEL */ |
|
869 case 0x2028: /* LS */ |
|
870 case 0x2029: /* PS */ |
|
871 return TRUE; |
|
872 default: |
|
873 return FALSE; |
|
874 } |
|
875 } |
|
876 |
|
877 /* |
|
878 * This function is called several times when a range is processed. Each time, the next code point |
|
879 * is processed. |
|
880 * The following variables must be set before calling this function: |
|
881 * src->currentRangeCp: The current code point to process. |
|
882 * src->lastRangeCp: The last code point in the range. |
|
883 * Pre-requisite: src->currentRangeCp <= src->lastRangeCp. |
|
884 */ |
|
885 static const UChar* |
|
886 ucol_tok_processNextCodePointInRange(UColTokenParser *src, |
|
887 UErrorCode *status) |
|
888 { |
|
889 // Append current code point to source |
|
890 UChar buff[U16_MAX_LENGTH]; |
|
891 uint32_t i = 0; |
|
892 |
|
893 uint32_t nChars = U16_LENGTH(src->currentRangeCp); |
|
894 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source); |
|
895 src->parsedToken.charsLen = nChars; |
|
896 |
|
897 U16_APPEND_UNSAFE(buff, i, src->currentRangeCp); |
|
898 ucol_tok_addToExtraCurrent(src, buff, nChars, status); |
|
899 |
|
900 ++src->currentRangeCp; |
|
901 if (src->currentRangeCp > src->lastRangeCp) { |
|
902 src->inRange = FALSE; |
|
903 |
|
904 if (src->currentStarredCharIndex > src->lastStarredCharIndex) { |
|
905 src->isStarred = FALSE; |
|
906 } |
|
907 } else { |
|
908 src->previousCp = src->currentRangeCp; |
|
909 } |
|
910 return src->current; |
|
911 } |
|
912 |
|
913 /* |
|
914 * This function is called several times when a starred list is processed. Each time, the next code point |
|
915 * in the list is processed. |
|
916 * The following variables must be set before calling this function: |
|
917 * src->currentStarredCharIndex: Index (in src->source) of the first char of the current code point. |
|
918 * src->lastStarredCharIndex: Index to the last character in the list. |
|
919 * Pre-requisite: src->currentStarredCharIndex <= src->lastStarredCharIndex. |
|
920 */ |
|
921 static const UChar* |
|
922 ucol_tok_processNextTokenInStarredList(UColTokenParser *src) |
|
923 { |
|
924 // Extract the characters corresponding to the next code point. |
|
925 UChar32 cp; |
|
926 src->parsedToken.charsOffset = src->currentStarredCharIndex; |
|
927 int32_t prev = src->currentStarredCharIndex; |
|
928 U16_NEXT(src->source, src->currentStarredCharIndex, (uint32_t)(src->end - src->source), cp); |
|
929 src->parsedToken.charsLen = src->currentStarredCharIndex - prev; |
|
930 |
|
931 // When we are done parsing the starred string, turn the flag off so that |
|
932 // the normal processing is restored. |
|
933 if (src->currentStarredCharIndex > src->lastStarredCharIndex) { |
|
934 src->isStarred = FALSE; |
|
935 } |
|
936 src->previousCp = cp; |
|
937 return src->current; |
|
938 } |
|
939 |
|
940 /* |
|
941 * Partially parses the next token, keeps the indices in src->parsedToken, and updates the counters. |
|
942 * |
|
943 * This routine parses and separates almost all tokens. The following are the syntax characters recognized. |
|
944 * # : Comment character |
|
945 * & : Reset operator |
|
946 * = : Equality |
|
947 * < : Primary collation |
|
948 * << : Secondary collation |
|
949 * <<< : Tertiary collation |
|
950 * ; : Secondary collation |
|
951 * , : Tertiary collation |
|
952 * / : Expansions |
|
953 * | : Prefix |
|
954 * - : Range |
|
955 |
|
956 * ! : Java Thai modifier, ignored |
|
957 * @ : French only |
|
958 |
|
959 * [] : Options |
|
960 * '' : Quotes |
|
961 * |
|
962 * Along with operators =, <, <<, <<<, the operator * is supported to indicate a list. For example, &a<*bcdexyz |
|
963 * is equivalent to &a<b<c<d<e<x<y<z. In lists, ranges also can be given, so &a*b-ex-z is equivalent to the above. |
|
964 * This function do not separate the tokens in a list. Instead, &a<*b-ex-z is parsed as three tokens - "&a", |
|
965 * "<*b", "-ex", "-z". The strength (< in this case), whether in a list, whether in a range and the previous |
|
966 * character returned as cached so that the calling program can do further splitting. |
|
967 */ |
|
968 static const UChar* |
|
969 ucol_tok_parseNextTokenInternal(UColTokenParser *src, |
|
970 UBool startOfRules, |
|
971 UParseError *parseError, |
|
972 UErrorCode *status) |
|
973 { |
|
974 UBool variableTop = FALSE; |
|
975 UBool top = FALSE; |
|
976 UBool inChars = TRUE; |
|
977 UBool inQuote = FALSE; |
|
978 UBool wasInQuote = FALSE; |
|
979 uint8_t before = 0; |
|
980 UBool isEscaped = FALSE; |
|
981 |
|
982 // TODO: replace these variables with src->parsedToken counterparts |
|
983 // no need to use them anymore since we have src->parsedToken. |
|
984 // Ideally, token parser would be a nice class... Once, when I have |
|
985 // more time (around 2020 probably). |
|
986 uint32_t newExtensionLen = 0; |
|
987 uint32_t extensionOffset = 0; |
|
988 uint32_t newStrength = UCOL_TOK_UNSET; |
|
989 UChar buff[10]; |
|
990 |
|
991 src->parsedToken.charsOffset = 0; src->parsedToken.charsLen = 0; |
|
992 src->parsedToken.prefixOffset = 0; src->parsedToken.prefixLen = 0; |
|
993 src->parsedToken.indirectIndex = 0; |
|
994 |
|
995 while (src->current < src->end) { |
|
996 UChar ch = *(src->current); |
|
997 |
|
998 if (inQuote) { |
|
999 if (ch == 0x0027/*'\''*/) { |
|
1000 inQuote = FALSE; |
|
1001 } else { |
|
1002 if ((src->parsedToken.charsLen == 0) || inChars) { |
|
1003 if(src->parsedToken.charsLen == 0) { |
|
1004 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source); |
|
1005 } |
|
1006 src->parsedToken.charsLen++; |
|
1007 } else { |
|
1008 if(newExtensionLen == 0) { |
|
1009 extensionOffset = (uint32_t)(src->extraCurrent - src->source); |
|
1010 } |
|
1011 newExtensionLen++; |
|
1012 } |
|
1013 } |
|
1014 }else if(isEscaped){ |
|
1015 isEscaped =FALSE; |
|
1016 if (newStrength == UCOL_TOK_UNSET) { |
|
1017 *status = U_INVALID_FORMAT_ERROR; |
|
1018 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); |
|
1019 DBG_FORMAT_ERROR |
|
1020 return NULL; |
|
1021 // enabling rules to start with non-tokens a < b |
|
1022 // newStrength = UCOL_TOK_RESET; |
|
1023 } |
|
1024 if(ch != 0x0000 && src->current != src->end) { |
|
1025 if (inChars) { |
|
1026 if(src->parsedToken.charsLen == 0) { |
|
1027 src->parsedToken.charsOffset = (uint32_t)(src->current - src->source); |
|
1028 } |
|
1029 src->parsedToken.charsLen++; |
|
1030 } else { |
|
1031 if(newExtensionLen == 0) { |
|
1032 extensionOffset = (uint32_t)(src->current - src->source); |
|
1033 } |
|
1034 newExtensionLen++; |
|
1035 } |
|
1036 } |
|
1037 }else { |
|
1038 if(!PatternProps::isWhiteSpace(ch)) { |
|
1039 /* Sets the strength for this entry */ |
|
1040 switch (ch) { |
|
1041 case 0x003D/*'='*/ : |
|
1042 if (newStrength != UCOL_TOK_UNSET) { |
|
1043 goto EndOfLoop; |
|
1044 } |
|
1045 |
|
1046 /* if we start with strength, we'll reset to top */ |
|
1047 if(startOfRules == TRUE) { |
|
1048 src->parsedToken.indirectIndex = 5; |
|
1049 top = ucol_tok_doSetTop(src, status); |
|
1050 newStrength = UCOL_TOK_RESET; |
|
1051 goto EndOfLoop; |
|
1052 } |
|
1053 newStrength = UCOL_IDENTICAL; |
|
1054 if(*(src->current+1) == 0x002A) {/*'*'*/ |
|
1055 src->current++; |
|
1056 src->isStarred = TRUE; |
|
1057 } |
|
1058 break; |
|
1059 |
|
1060 case 0x002C/*','*/: |
|
1061 if (newStrength != UCOL_TOK_UNSET) { |
|
1062 goto EndOfLoop; |
|
1063 } |
|
1064 |
|
1065 /* if we start with strength, we'll reset to top */ |
|
1066 if(startOfRules == TRUE) { |
|
1067 src->parsedToken.indirectIndex = 5; |
|
1068 top = ucol_tok_doSetTop(src, status); |
|
1069 newStrength = UCOL_TOK_RESET; |
|
1070 goto EndOfLoop; |
|
1071 } |
|
1072 newStrength = UCOL_TERTIARY; |
|
1073 break; |
|
1074 |
|
1075 case 0x003B/*';'*/: |
|
1076 if (newStrength != UCOL_TOK_UNSET) { |
|
1077 goto EndOfLoop; |
|
1078 } |
|
1079 |
|
1080 /* if we start with strength, we'll reset to top */ |
|
1081 if(startOfRules == TRUE) { |
|
1082 src->parsedToken.indirectIndex = 5; |
|
1083 top = ucol_tok_doSetTop(src, status); |
|
1084 newStrength = UCOL_TOK_RESET; |
|
1085 goto EndOfLoop; |
|
1086 } |
|
1087 newStrength = UCOL_SECONDARY; |
|
1088 break; |
|
1089 |
|
1090 case 0x003C/*'<'*/: |
|
1091 if (newStrength != UCOL_TOK_UNSET) { |
|
1092 goto EndOfLoop; |
|
1093 } |
|
1094 |
|
1095 /* if we start with strength, we'll reset to top */ |
|
1096 if(startOfRules == TRUE) { |
|
1097 src->parsedToken.indirectIndex = 5; |
|
1098 top = ucol_tok_doSetTop(src, status); |
|
1099 newStrength = UCOL_TOK_RESET; |
|
1100 goto EndOfLoop; |
|
1101 } |
|
1102 /* before this, do a scan to verify whether this is */ |
|
1103 /* another strength */ |
|
1104 if(*(src->current+1) == 0x003C) { |
|
1105 src->current++; |
|
1106 if(*(src->current+1) == 0x003C) { |
|
1107 src->current++; /* three in a row! */ |
|
1108 newStrength = UCOL_TERTIARY; |
|
1109 } else { /* two in a row */ |
|
1110 newStrength = UCOL_SECONDARY; |
|
1111 } |
|
1112 } else { /* just one */ |
|
1113 newStrength = UCOL_PRIMARY; |
|
1114 } |
|
1115 if(*(src->current+1) == 0x002A) {/*'*'*/ |
|
1116 src->current++; |
|
1117 src->isStarred = TRUE; |
|
1118 } |
|
1119 break; |
|
1120 |
|
1121 case 0x0026/*'&'*/: |
|
1122 if (newStrength != UCOL_TOK_UNSET) { |
|
1123 /**/ |
|
1124 goto EndOfLoop; |
|
1125 } |
|
1126 |
|
1127 newStrength = UCOL_TOK_RESET; /* PatternEntry::RESET = 0 */ |
|
1128 break; |
|
1129 |
|
1130 case 0x005b/*'['*/: |
|
1131 /* options - read an option, analyze it */ |
|
1132 if(u_strchr(src->current, 0x005d /*']'*/) != NULL) { |
|
1133 uint8_t result = ucol_uprv_tok_readAndSetOption(src, status); |
|
1134 if(U_SUCCESS(*status)) { |
|
1135 if(result & UCOL_TOK_TOP) { |
|
1136 if(newStrength == UCOL_TOK_RESET) { |
|
1137 top = ucol_tok_doSetTop(src, status); |
|
1138 if(before) { // This is a combination of before and indirection like '&[before 2][first regular]<b' |
|
1139 src->parsedToken.charsLen+=2; |
|
1140 buff[0] = 0x002d; |
|
1141 buff[1] = before; |
|
1142 ucol_tok_addToExtraCurrent(src, buff, 2, status); |
|
1143 } |
|
1144 |
|
1145 src->current++; |
|
1146 goto EndOfLoop; |
|
1147 } else { |
|
1148 *status = U_INVALID_FORMAT_ERROR; |
|
1149 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); |
|
1150 DBG_FORMAT_ERROR |
|
1151 } |
|
1152 } else if(result & UCOL_TOK_VARIABLE_TOP) { |
|
1153 if(newStrength != UCOL_TOK_RESET && newStrength != UCOL_TOK_UNSET) { |
|
1154 variableTop = TRUE; |
|
1155 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source); |
|
1156 src->parsedToken.charsLen = 1; |
|
1157 buff[0] = 0xFFFF; |
|
1158 ucol_tok_addToExtraCurrent(src, buff, 1, status); |
|
1159 src->current++; |
|
1160 goto EndOfLoop; |
|
1161 } else { |
|
1162 *status = U_INVALID_FORMAT_ERROR; |
|
1163 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); |
|
1164 DBG_FORMAT_ERROR |
|
1165 } |
|
1166 } else if (result & UCOL_TOK_BEFORE){ |
|
1167 if(newStrength == UCOL_TOK_RESET) { |
|
1168 before = result & UCOL_TOK_BEFORE; |
|
1169 } else { |
|
1170 *status = U_INVALID_FORMAT_ERROR; |
|
1171 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); |
|
1172 DBG_FORMAT_ERROR |
|
1173 } |
|
1174 } |
|
1175 } else { |
|
1176 *status = U_INVALID_FORMAT_ERROR; |
|
1177 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); |
|
1178 DBG_FORMAT_ERROR |
|
1179 return NULL; |
|
1180 } |
|
1181 } |
|
1182 break; |
|
1183 case 0x0021/*! skip java thai modifier reordering*/: |
|
1184 break; |
|
1185 case 0x002F/*'/'*/: |
|
1186 wasInQuote = FALSE; /* if we were copying source characters, we want to stop now */ |
|
1187 inChars = FALSE; /* we're now processing expansion */ |
|
1188 break; |
|
1189 case 0x005C /* back slash for escaped chars */: |
|
1190 isEscaped = TRUE; |
|
1191 break; |
|
1192 /* found a quote, we're gonna start copying */ |
|
1193 case 0x0027/*'\''*/: |
|
1194 if (newStrength == UCOL_TOK_UNSET) { /* quote is illegal until we have a strength */ |
|
1195 *status = U_INVALID_FORMAT_ERROR; |
|
1196 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); |
|
1197 DBG_FORMAT_ERROR |
|
1198 return NULL; |
|
1199 // enabling rules to start with a non-token character a < b |
|
1200 // newStrength = UCOL_TOK_RESET; |
|
1201 } |
|
1202 |
|
1203 inQuote = TRUE; |
|
1204 |
|
1205 if(inChars) { /* we're doing characters */ |
|
1206 if(wasInQuote == FALSE) { |
|
1207 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source); |
|
1208 } |
|
1209 if (src->parsedToken.charsLen != 0) { |
|
1210 ucol_tok_addToExtraCurrent(src, src->current - src->parsedToken.charsLen, src->parsedToken.charsLen, status); |
|
1211 } |
|
1212 src->parsedToken.charsLen++; |
|
1213 } else { /* we're doing an expansion */ |
|
1214 if(wasInQuote == FALSE) { |
|
1215 extensionOffset = (uint32_t)(src->extraCurrent - src->source); |
|
1216 } |
|
1217 if (newExtensionLen != 0) { |
|
1218 ucol_tok_addToExtraCurrent(src, src->current - newExtensionLen, newExtensionLen, status); |
|
1219 } |
|
1220 newExtensionLen++; |
|
1221 } |
|
1222 |
|
1223 wasInQuote = TRUE; |
|
1224 |
|
1225 ch = *(++(src->current)); |
|
1226 if(ch == 0x0027) { /* copy the double quote */ |
|
1227 ucol_tok_addToExtraCurrent(src, &ch, 1, status); |
|
1228 inQuote = FALSE; |
|
1229 } |
|
1230 break; |
|
1231 |
|
1232 /* '@' is french only if the strength is not currently set */ |
|
1233 /* if it is, it's just a regular character in collation rules */ |
|
1234 case 0x0040/*'@'*/: |
|
1235 if (newStrength == UCOL_TOK_UNSET) { |
|
1236 src->opts->frenchCollation = UCOL_ON; |
|
1237 break; |
|
1238 } |
|
1239 |
|
1240 case 0x007C /*|*/: /* this means we have actually been reading prefix part */ |
|
1241 // we want to store read characters to the prefix part and continue reading |
|
1242 // the characters (proper way would be to restart reading the chars, but in |
|
1243 // that case we would have to complicate the token hasher, which I do not |
|
1244 // intend to play with. Instead, we will do prefixes when prefixes are due |
|
1245 // (before adding the elements). |
|
1246 src->parsedToken.prefixOffset = src->parsedToken.charsOffset; |
|
1247 src->parsedToken.prefixLen = src->parsedToken.charsLen; |
|
1248 |
|
1249 if(inChars) { /* we're doing characters */ |
|
1250 if(wasInQuote == FALSE) { |
|
1251 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source); |
|
1252 } |
|
1253 if (src->parsedToken.charsLen != 0) { |
|
1254 ucol_tok_addToExtraCurrent(src, src->current - src->parsedToken.charsLen, src->parsedToken.charsLen, status); |
|
1255 } |
|
1256 src->parsedToken.charsLen++; |
|
1257 } |
|
1258 |
|
1259 wasInQuote = TRUE; |
|
1260 |
|
1261 do { |
|
1262 ch = *(++(src->current)); |
|
1263 // skip whitespace between '|' and the character |
|
1264 } while (PatternProps::isWhiteSpace(ch)); |
|
1265 break; |
|
1266 |
|
1267 //charsOffset = 0; |
|
1268 //newCharsLen = 0; |
|
1269 //break; // We want to store the whole prefix/character sequence. If we break |
|
1270 // the '|' is going to get lost. |
|
1271 |
|
1272 case 0x002D /*-*/: /* A range. */ |
|
1273 if (newStrength != UCOL_TOK_UNSET) { |
|
1274 // While processing the pending token, the isStarred field |
|
1275 // is reset, so it needs to be saved for the next |
|
1276 // invocation. |
|
1277 src->savedIsStarred = src->isStarred; |
|
1278 goto EndOfLoop; |
|
1279 } |
|
1280 src->isStarred = src->savedIsStarred; |
|
1281 |
|
1282 // Ranges are valid only in starred tokens. |
|
1283 if (!src->isStarred) { |
|
1284 *status = U_INVALID_FORMAT_ERROR; |
|
1285 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); |
|
1286 DBG_FORMAT_ERROR |
|
1287 return NULL; |
|
1288 } |
|
1289 newStrength = src->parsedToken.strength; |
|
1290 src->inRange = TRUE; |
|
1291 break; |
|
1292 |
|
1293 case 0x0023 /*#*/: /* this is a comment, skip everything through the end of line */ |
|
1294 do { |
|
1295 ch = *(++(src->current)); |
|
1296 } while (!isCharNewLine(ch)); |
|
1297 |
|
1298 break; |
|
1299 default: |
|
1300 if (newStrength == UCOL_TOK_UNSET) { |
|
1301 *status = U_INVALID_FORMAT_ERROR; |
|
1302 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); |
|
1303 DBG_FORMAT_ERROR |
|
1304 return NULL; |
|
1305 } |
|
1306 |
|
1307 if (ucol_tok_isSpecialChar(ch) && (inQuote == FALSE)) { |
|
1308 *status = U_INVALID_FORMAT_ERROR; |
|
1309 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); |
|
1310 DBG_FORMAT_ERROR |
|
1311 return NULL; |
|
1312 } |
|
1313 |
|
1314 if(ch == 0x0000 && src->current+1 == src->end) { |
|
1315 break; |
|
1316 } |
|
1317 |
|
1318 if (inChars) { |
|
1319 if(src->parsedToken.charsLen == 0) { |
|
1320 src->parsedToken.charsOffset = (uint32_t)(src->current - src->source); |
|
1321 } |
|
1322 src->parsedToken.charsLen++; |
|
1323 } else { |
|
1324 if(newExtensionLen == 0) { |
|
1325 extensionOffset = (uint32_t)(src->current - src->source); |
|
1326 } |
|
1327 newExtensionLen++; |
|
1328 } |
|
1329 |
|
1330 break; |
|
1331 } |
|
1332 } |
|
1333 } |
|
1334 |
|
1335 if(wasInQuote) { |
|
1336 if(ch != 0x27) { |
|
1337 if(inQuote || !PatternProps::isWhiteSpace(ch)) { |
|
1338 ucol_tok_addToExtraCurrent(src, &ch, 1, status); |
|
1339 } |
|
1340 } |
|
1341 } |
|
1342 |
|
1343 src->current++; |
|
1344 } |
|
1345 |
|
1346 EndOfLoop: |
|
1347 wasInQuote = FALSE; |
|
1348 if (newStrength == UCOL_TOK_UNSET) { |
|
1349 return NULL; |
|
1350 } |
|
1351 |
|
1352 if (src->parsedToken.charsLen == 0 && top == FALSE) { |
|
1353 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); |
|
1354 *status = U_INVALID_FORMAT_ERROR; |
|
1355 DBG_FORMAT_ERROR |
|
1356 return NULL; |
|
1357 } |
|
1358 |
|
1359 src->parsedToken.strength = newStrength; |
|
1360 src->parsedToken.extensionOffset = extensionOffset; |
|
1361 src->parsedToken.extensionLen = newExtensionLen; |
|
1362 src->parsedToken.flags = (UCOL_TOK_VARIABLE_TOP * (variableTop?1:0)) | (UCOL_TOK_TOP * (top?1:0)) | before; |
|
1363 |
|
1364 return src->current; |
|
1365 } |
|
1366 |
|
1367 /* |
|
1368 * Parses the next token, keeps the indices in src->parsedToken, and updates the counters. |
|
1369 * @see ucol_tok_parseNextTokenInternal() for the description of what operators are supported. |
|
1370 * |
|
1371 * In addition to what ucol_tok_parseNextTokenInternal() does, this function does the following: |
|
1372 * 1) ucol_tok_parseNextTokenInternal() returns a range as a single token. This function separates |
|
1373 * it to separate tokens and returns one by one. In order to do that, the necessary states are |
|
1374 * cached as member variables of the token parser. |
|
1375 * 2) When encountering a range, ucol_tok_parseNextTokenInternal() processes characters up to the |
|
1376 * starting character as a single list token (which is separated into individual characters here) |
|
1377 * and as another list token starting with the last character in the range. Before expanding it |
|
1378 * as a list of tokens, this function expands the range by filling the intermediate characters and |
|
1379 * returns them one by one as separate tokens. |
|
1380 * Necessary checks are done for invalid combinations. |
|
1381 */ |
|
1382 U_CAPI const UChar* U_EXPORT2 |
|
1383 ucol_tok_parseNextToken(UColTokenParser *src, |
|
1384 UBool startOfRules, |
|
1385 UParseError *parseError, |
|
1386 UErrorCode *status) |
|
1387 { |
|
1388 const UChar *nextToken; |
|
1389 |
|
1390 if (src->inRange) { |
|
1391 // We are not done processing a range. Continue it. |
|
1392 return ucol_tok_processNextCodePointInRange(src, status); |
|
1393 } else if (src->isStarred) { |
|
1394 // We are not done processing a starred token. Continue it. |
|
1395 return ucol_tok_processNextTokenInStarredList(src); |
|
1396 } |
|
1397 |
|
1398 // Get the next token. |
|
1399 nextToken = ucol_tok_parseNextTokenInternal(src, startOfRules, parseError, status); |
|
1400 |
|
1401 if (nextToken == NULL) { |
|
1402 return NULL; |
|
1403 } |
|
1404 |
|
1405 if (src->inRange) { |
|
1406 // A new range has started. |
|
1407 // Check whether it is a chain of ranges with more than one hyphen. |
|
1408 if (src->lastRangeCp > 0 && src->lastRangeCp == src->previousCp) { |
|
1409 *status = U_INVALID_FORMAT_ERROR; |
|
1410 syntaxError(src->source,src->parsedToken.charsOffset-1, |
|
1411 src->parsedToken.charsOffset+src->parsedToken.charsLen, parseError); |
|
1412 DBG_FORMAT_ERROR |
|
1413 return NULL; |
|
1414 } |
|
1415 |
|
1416 // The current token indicates the second code point of the range. |
|
1417 // Process just that, and then proceed with the star. |
|
1418 src->currentStarredCharIndex = src->parsedToken.charsOffset; |
|
1419 U16_NEXT(src->source, src->currentStarredCharIndex, |
|
1420 (uint32_t)(src->end - src->source), src->lastRangeCp); |
|
1421 if (src->lastRangeCp <= src->previousCp) { |
|
1422 *status = U_INVALID_FORMAT_ERROR; |
|
1423 syntaxError(src->source,src->parsedToken.charsOffset-1, |
|
1424 src->parsedToken.charsOffset+src->parsedToken.charsLen,parseError); |
|
1425 DBG_FORMAT_ERROR |
|
1426 return NULL; |
|
1427 } |
|
1428 |
|
1429 // Set current range code point to process the range loop |
|
1430 src->currentRangeCp = src->previousCp + 1; |
|
1431 |
|
1432 src->lastStarredCharIndex = src->parsedToken.charsOffset + src->parsedToken.charsLen - 1; |
|
1433 |
|
1434 return ucol_tok_processNextCodePointInRange(src, status); |
|
1435 } else if (src->isStarred) { |
|
1436 // We define two indices m_currentStarredCharIndex_ and m_lastStarredCharIndex_ so that |
|
1437 // [m_currentStarredCharIndex_ .. m_lastStarredCharIndex_], both inclusive, need to be |
|
1438 // separated into several tokens and returned. |
|
1439 src->currentStarredCharIndex = src->parsedToken.charsOffset; |
|
1440 src->lastStarredCharIndex = src->parsedToken.charsOffset + src->parsedToken.charsLen - 1; |
|
1441 |
|
1442 return ucol_tok_processNextTokenInStarredList(src); |
|
1443 } else { |
|
1444 // Set previous codepoint |
|
1445 U16_GET(src->source, 0, src->parsedToken.charsOffset, (uint32_t)(src->end - src->source), src->previousCp); |
|
1446 } |
|
1447 return nextToken; |
|
1448 } |
|
1449 |
|
1450 |
|
1451 /* |
|
1452 Processing Description |
|
1453 1 Build a ListList. Each list has a header, which contains two lists (positive |
|
1454 and negative), a reset token, a baseCE, nextCE, and previousCE. The lists and |
|
1455 reset may be null. |
|
1456 2 As you process, you keep a LAST pointer that points to the last token you |
|
1457 handled. |
|
1458 |
|
1459 */ |
|
1460 |
|
1461 static UColToken *ucol_tok_initAReset(UColTokenParser *src, const UChar *expand, uint32_t *expandNext, |
|
1462 UParseError *parseError, UErrorCode *status) |
|
1463 { |
|
1464 if(src->resultLen == src->listCapacity) { |
|
1465 // Unfortunately, this won't work, as we store addresses of lhs in token |
|
1466 src->listCapacity *= 2; |
|
1467 src->lh = (UColTokListHeader *)uprv_realloc(src->lh, src->listCapacity*sizeof(UColTokListHeader)); |
|
1468 if(src->lh == NULL) { |
|
1469 *status = U_MEMORY_ALLOCATION_ERROR; |
|
1470 return NULL; |
|
1471 } |
|
1472 } |
|
1473 /* do the reset thing */ |
|
1474 UColToken *sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken)); |
|
1475 /* test for NULL */ |
|
1476 if (sourceToken == NULL) { |
|
1477 *status = U_MEMORY_ALLOCATION_ERROR; |
|
1478 return NULL; |
|
1479 } |
|
1480 sourceToken->rulesToParseHdl = &(src->source); |
|
1481 sourceToken->source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset; |
|
1482 sourceToken->expansion = src->parsedToken.extensionLen << 24 | src->parsedToken.extensionOffset; |
|
1483 |
|
1484 sourceToken->debugSource = *(src->source + src->parsedToken.charsOffset); |
|
1485 sourceToken->debugExpansion = *(src->source + src->parsedToken.extensionOffset); |
|
1486 |
|
1487 // keep the flags around so that we know about before |
|
1488 sourceToken->flags = src->parsedToken.flags; |
|
1489 |
|
1490 if(src->parsedToken.prefixOffset != 0) { |
|
1491 // this is a syntax error |
|
1492 *status = U_INVALID_FORMAT_ERROR; |
|
1493 syntaxError(src->source,src->parsedToken.charsOffset-1,src->parsedToken.charsOffset+src->parsedToken.charsLen,parseError); |
|
1494 DBG_FORMAT_ERROR |
|
1495 uprv_free(sourceToken); |
|
1496 return 0; |
|
1497 } else { |
|
1498 sourceToken->prefix = 0; |
|
1499 } |
|
1500 |
|
1501 sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should also handle reverse */ |
|
1502 sourceToken->strength = UCOL_TOK_RESET; |
|
1503 sourceToken->next = NULL; |
|
1504 sourceToken->previous = NULL; |
|
1505 sourceToken->noOfCEs = 0; |
|
1506 sourceToken->noOfExpCEs = 0; |
|
1507 sourceToken->listHeader = &src->lh[src->resultLen]; |
|
1508 |
|
1509 src->lh[src->resultLen].first = NULL; |
|
1510 src->lh[src->resultLen].last = NULL; |
|
1511 src->lh[src->resultLen].first = NULL; |
|
1512 src->lh[src->resultLen].last = NULL; |
|
1513 |
|
1514 src->lh[src->resultLen].reset = sourceToken; |
|
1515 |
|
1516 /* |
|
1517 3 Consider each item: relation, source, and expansion: e.g. ...< x / y ... |
|
1518 First convert all expansions into normal form. Examples: |
|
1519 If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c * |
|
1520 d * ... into &x * c/y * d * ... |
|
1521 Note: reset values can never have expansions, although they can cause the |
|
1522 very next item to have one. They may be contractions, if they are found |
|
1523 earlier in the list. |
|
1524 */ |
|
1525 *expandNext = 0; |
|
1526 if(expand != NULL) { |
|
1527 /* check to see if there is an expansion */ |
|
1528 if(src->parsedToken.charsLen > 1) { |
|
1529 uint32_t resetCharsOffset; |
|
1530 resetCharsOffset = (uint32_t)(expand - src->source); |
|
1531 sourceToken->source = ((resetCharsOffset - src->parsedToken.charsOffset ) << 24) | src->parsedToken.charsOffset; |
|
1532 *expandNext = ((src->parsedToken.charsLen + src->parsedToken.charsOffset - resetCharsOffset)<<24) | (resetCharsOffset); |
|
1533 } |
|
1534 } |
|
1535 |
|
1536 src->resultLen++; |
|
1537 |
|
1538 uhash_put(src->tailored, sourceToken, sourceToken, status); |
|
1539 |
|
1540 return sourceToken; |
|
1541 } |
|
1542 |
|
1543 static |
|
1544 inline UColToken *getVirginBefore(UColTokenParser *src, UColToken *sourceToken, uint8_t strength, UParseError *parseError, UErrorCode *status) { |
|
1545 if(U_FAILURE(*status)) { |
|
1546 return NULL; |
|
1547 } |
|
1548 /* this is a virgin before - we need to fish the anchor from the UCA */ |
|
1549 collIterate s; |
|
1550 uint32_t baseCE = UCOL_NOT_FOUND, baseContCE = UCOL_NOT_FOUND; |
|
1551 uint32_t CE, SecondCE; |
|
1552 // uint32_t invPos; |
|
1553 if(sourceToken != NULL) { |
|
1554 uprv_init_collIterate(src->UCA, src->source+((sourceToken->source)&0xFFFFFF), 1, &s, status); |
|
1555 } else { |
|
1556 uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset /**charsOffset*/, 1, &s, status); |
|
1557 } |
|
1558 if(U_FAILURE(*status)) { |
|
1559 return NULL; |
|
1560 } |
|
1561 |
|
1562 baseCE = ucol_getNextCE(src->UCA, &s, status) & 0xFFFFFF3F; |
|
1563 baseContCE = ucol_getNextCE(src->UCA, &s, status); |
|
1564 if(baseContCE == UCOL_NO_MORE_CES) { |
|
1565 baseContCE = 0; |
|
1566 } |
|
1567 |
|
1568 |
|
1569 UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts); |
|
1570 uint32_t ch = 0; |
|
1571 uint32_t expandNext = 0; |
|
1572 UColToken key; |
|
1573 |
|
1574 if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */ |
|
1575 uint32_t primary = (baseCE & UCOL_PRIMARYMASK) | ((baseContCE & UCOL_PRIMARYMASK) >> 16); |
|
1576 uint32_t raw = uprv_uca_getRawFromImplicit(primary); |
|
1577 ch = uprv_uca_getCodePointFromRaw(raw-1); |
|
1578 uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw-1); |
|
1579 CE = (primaryCE & UCOL_PRIMARYMASK) | 0x0505; |
|
1580 SecondCE = ((primaryCE << 16) & UCOL_PRIMARYMASK) | UCOL_CONTINUATION_MARKER; |
|
1581 |
|
1582 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source); |
|
1583 *src->extraCurrent++ = 0xFFFE; |
|
1584 *src->extraCurrent++ = (UChar)ch; |
|
1585 src->parsedToken.charsLen++; |
|
1586 |
|
1587 key.source = (src->parsedToken.charsLen/**newCharsLen*/ << 24) | src->parsedToken.charsOffset/**charsOffset*/; |
|
1588 key.rulesToParseHdl = &(src->source); |
|
1589 |
|
1590 //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key); |
|
1591 sourceToken = (UColToken *)uhash_get(src->tailored, &key); |
|
1592 |
|
1593 if(sourceToken == NULL) { |
|
1594 src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F; |
|
1595 if(isContinuation(SecondCE)) { |
|
1596 src->lh[src->resultLen].baseContCE = SecondCE; |
|
1597 } else { |
|
1598 src->lh[src->resultLen].baseContCE = 0; |
|
1599 } |
|
1600 src->lh[src->resultLen].nextCE = 0; |
|
1601 src->lh[src->resultLen].nextContCE = 0; |
|
1602 src->lh[src->resultLen].previousCE = 0; |
|
1603 src->lh[src->resultLen].previousContCE = 0; |
|
1604 |
|
1605 src->lh[src->resultLen].indirect = FALSE; |
|
1606 |
|
1607 sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status); |
|
1608 } |
|
1609 |
|
1610 } else { |
|
1611 /* invPos = */ ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &SecondCE, strength); |
|
1612 |
|
1613 // we got the previous CE. Now we need to see if the difference between |
|
1614 // the two CEs is really of the requested strength. |
|
1615 // if it's a bigger difference (we asked for secondary and got primary), we |
|
1616 // need to modify the CE. |
|
1617 if(ucol_getCEStrengthDifference(baseCE, baseContCE, CE, SecondCE) < strength) { |
|
1618 // adjust the strength |
|
1619 // now we are in the situation where our baseCE should actually be modified in |
|
1620 // order to get the CE in the right position. |
|
1621 if(strength == UCOL_SECONDARY) { |
|
1622 CE = baseCE - 0x0200; |
|
1623 } else { // strength == UCOL_TERTIARY |
|
1624 CE = baseCE - 0x02; |
|
1625 } |
|
1626 if(baseContCE) { |
|
1627 if(strength == UCOL_SECONDARY) { |
|
1628 SecondCE = baseContCE - 0x0200; |
|
1629 } else { // strength == UCOL_TERTIARY |
|
1630 SecondCE = baseContCE - 0x02; |
|
1631 } |
|
1632 } |
|
1633 } |
|
1634 |
|
1635 #if 0 |
|
1636 // the code below relies on getting a code point from the inverse table, in order to be |
|
1637 // able to merge the situations like &x < 9 &[before 1]a < d. This won't work: |
|
1638 // 1. There are many code points that have the same CE |
|
1639 // 2. The CE to codepoint table (things pointed to by CETable[3*invPos+2] are broken. |
|
1640 // Also, in case when there is no equivalent strength before an element, we have to actually |
|
1641 // construct one. For example, &[before 2]a << x won't result in x << a, because the element |
|
1642 // before a is a primary difference. |
|
1643 |
|
1644 //uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); |
|
1645 |
|
1646 |
|
1647 ch = CETable[3*invPos+2]; |
|
1648 |
|
1649 if((ch & UCOL_INV_SIZEMASK) != 0) { |
|
1650 uint16_t *conts = (uint16_t *)((uint8_t *)src->invUCA+src->invUCA->conts); |
|
1651 uint32_t offset = (ch & UCOL_INV_OFFSETMASK); |
|
1652 ch = conts[offset]; |
|
1653 } |
|
1654 |
|
1655 *src->extraCurrent++ = (UChar)ch; |
|
1656 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source - 1); |
|
1657 src->parsedToken.charsLen = 1; |
|
1658 |
|
1659 // We got an UCA before. However, this might have been tailored. |
|
1660 // example: |
|
1661 // &\u30ca = \u306a |
|
1662 // &[before 3]\u306a<<<\u306a|\u309d |
|
1663 |
|
1664 |
|
1665 // uint32_t key = (*newCharsLen << 24) | *charsOffset; |
|
1666 key.source = (src->parsedToken.charsLen/**newCharsLen*/ << 24) | src->parsedToken.charsOffset/**charsOffset*/; |
|
1667 key.rulesToParseHdl = &(src->source); |
|
1668 |
|
1669 //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key); |
|
1670 sourceToken = (UColToken *)uhash_get(src->tailored, &key); |
|
1671 #endif |
|
1672 |
|
1673 // here is how it should be. The situation such as &[before 1]a < x, should be |
|
1674 // resolved exactly as if we wrote &a > x. |
|
1675 // therefore, I don't really care if the UCA value before a has been changed. |
|
1676 // However, I do care if the strength between my element and the previous element |
|
1677 // is bigger then I wanted. So, if CE < baseCE and I wanted &[before 2], then i'll |
|
1678 // have to construct the base CE. |
|
1679 |
|
1680 |
|
1681 |
|
1682 // if we found a tailored thing, we have to use the UCA value and construct |
|
1683 // a new reset token with constructed name |
|
1684 //if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) { |
|
1685 // character to which we want to anchor is already tailored. |
|
1686 // We need to construct a new token which will be the anchor |
|
1687 // point |
|
1688 //*(src->extraCurrent-1) = 0xFFFE; |
|
1689 //*src->extraCurrent++ = (UChar)ch; |
|
1690 // grab before |
|
1691 src->parsedToken.charsOffset -= 10; |
|
1692 src->parsedToken.charsLen += 10; |
|
1693 src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F; |
|
1694 if(isContinuation(SecondCE)) { |
|
1695 src->lh[src->resultLen].baseContCE = SecondCE; |
|
1696 } else { |
|
1697 src->lh[src->resultLen].baseContCE = 0; |
|
1698 } |
|
1699 src->lh[src->resultLen].nextCE = 0; |
|
1700 src->lh[src->resultLen].nextContCE = 0; |
|
1701 src->lh[src->resultLen].previousCE = 0; |
|
1702 src->lh[src->resultLen].previousContCE = 0; |
|
1703 |
|
1704 src->lh[src->resultLen].indirect = FALSE; |
|
1705 |
|
1706 sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status); |
|
1707 //} |
|
1708 } |
|
1709 |
|
1710 return sourceToken; |
|
1711 |
|
1712 } |
|
1713 |
|
/*
 * Builds the token lists for a tailoring.
 *
 * Repeatedly calls ucol_tok_parseNextToken() while src->current < src->end (or
 * src->isStarred is set). Each successfully parsed token is looked up in the
 * src->tailored hash and then handled as one of:
 *   - a relation (strength != UCOL_TOK_RESET): a UColToken is created, or an existing
 *     one is unlinked from its old position, and linked into the list that the
 *     previous token (lastToken) belongs to;
 *   - a reset: an existing token is chosen as the anchor, or the list header
 *     src->lh[src->resultLen] is filled in and a new reset token is requested from
 *     ucol_tok_initAReset() / getVirginBefore().
 *
 * src        - parser state; read and modified throughout.
 * parseError - handed to the parsing helpers and to syntaxError() on format errors.
 * status     - in/out error code; nothing is parsed if it already holds a failure.
 *
 * Returns src->resultLen (after dropping a final list header whose 'first' is NULL).
 * Returns 0 if *status is a failure on entry or when a failure is detected inside the
 * loop. Failures set directly here are U_INVALID_FORMAT_ERROR (a relation before any
 * reset, or a relation whose strength does not match the preceding [before N] reset)
 * and U_MEMORY_ALLOCATION_ERROR (token allocation failed).
 */
1714 uint32_t ucol_tok_assembleTokenList(UColTokenParser *src, UParseError *parseError, UErrorCode *status) { |

1715 UColToken *lastToken = NULL; |

1716 const UChar *parseEnd = NULL; |

1717 uint32_t expandNext = 0; |

1718 UBool variableTop = FALSE; |

1719 UBool top = FALSE; |

1720 uint16_t specs = 0; |

1721 UColTokListHeader *ListList = NULL; |

1722 |

1723 src->parsedToken.strength = UCOL_TOK_UNSET; |

1724 |

1725 ListList = src->lh; |

1726 |

1727 if(U_FAILURE(*status)) { |

1728 return 0; |

1729 } |

1730 #ifdef DEBUG_FOR_CODE_POINTS |

1731 char filename[35]; |

1732 sprintf(filename, "/tmp/debug_for_cp_%09d.txt", getpid()); |

1733 dfcp_fp = fopen(filename, "a"); |

1734 fprintf(stdout, "Output is in the file %s.\n", filename); |

1735 #endif |

1736 |

1737 #ifdef DEBUG_FOR_COLL_RULES |

1738 std::string s3; |

1739 UnicodeString(src->source).toUTF8String(s3); |

1740 std::cout << "src->source = " << s3 << std::endl; |

1741 #endif |

1742 |

// Main loop: one iteration per token returned by ucol_tok_parseNextToken().
// lastToken is NULL until the first token has been accepted, which is what the
// second argument of the parse call reports.
1743 while(src->current < src->end || src->isStarred) { |

1744 src->parsedToken.prefixOffset = 0; |

1745 |

1746 parseEnd = ucol_tok_parseNextToken(src, |

1747 (UBool)(lastToken == NULL), |

1748 parseError, |

1749 status); |

1750 |

1751 specs = src->parsedToken.flags; |

1752 |

1753 |

1754 variableTop = ((specs & UCOL_TOK_VARIABLE_TOP) != 0); |

1755 top = ((specs & UCOL_TOK_TOP) != 0); |

1756 |

1757 if(U_SUCCESS(*status) && parseEnd != NULL) { |

1758 UColToken *sourceToken = NULL; |

1759 //uint32_t key = 0; |

1760 uint32_t lastStrength = UCOL_TOK_UNSET; |

1761 |

1762 if(lastToken != NULL ) { |

1763 lastStrength = lastToken->strength; |

1764 } |

1765 |

1766 #ifdef DEBUG_FOR_CODE_POINTS |

1767 UChar32 cp; |

1768 U16_GET(src->source, 0, src->parsedToken.charsOffset, (uint32_t)(src->extraEnd - src->source), cp); |

1769 fprintf(dfcp_fp, "Code point = %x, Strength = %x\n", cp, src->parsedToken.strength); |

1770 #endif |

1771 //key = newCharsLen << 24 | charsOffset; |

// Temporary lookup key: the length goes into the top 8 bits and the offset into the
// low 24 bits of 'source', the same packing that uhash_hashTokens() unpacks.
1772 UColToken key; |

1773 key.source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset; |

1774 key.rulesToParseHdl = &(src->source); |

1775 |

1776 /* 4 Lookup each source in the CharsToToken map, and find a sourceToken */ |

1777 sourceToken = (UColToken *)uhash_get(src->tailored, &key); |

1778 |

1779 if(src->parsedToken.strength != UCOL_TOK_RESET) { |

1780 if(lastToken == NULL) { /* this means that rules haven't started properly */ |

1781 *status = U_INVALID_FORMAT_ERROR; |

1782 syntaxError(src->source,0,(int32_t)(src->end-src->source),parseError); |

1783 DBG_FORMAT_ERROR |

1784 return 0; |

1785 } |

1786 /* 6 Otherwise (when relation != reset) */ |

1787 if(sourceToken == NULL) { |

1788 /* If sourceToken is null, create new one, */ |

1789 sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken)); |

1790 /* test for NULL */ |

1791 if (sourceToken == NULL) { |

1792 *status = U_MEMORY_ALLOCATION_ERROR; |

1793 return 0; |

1794 } |

1795 sourceToken->rulesToParseHdl = &(src->source); |

1796 sourceToken->source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset; |

1797 |

1798 sourceToken->debugSource = *(src->source + src->parsedToken.charsOffset); |

1799 |

1800 sourceToken->prefix = src->parsedToken.prefixLen << 24 | src->parsedToken.prefixOffset; |

1801 sourceToken->debugPrefix = *(src->source + src->parsedToken.prefixOffset); |

1802 |

1803 sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should also handle reverse */ |

1804 sourceToken->next = NULL; |

1805 sourceToken->previous = NULL; |

1806 sourceToken->noOfCEs = 0; |

1807 sourceToken->noOfExpCEs = 0; |

1808 // keep the flags around so that we know about before |

1809 sourceToken->flags = src->parsedToken.flags; |

// The token is stored as both key and value of the hash.
// strength, listHeader and expansion are assigned further down, after this branch.
1810 uhash_put(src->tailored, sourceToken, sourceToken, status); |

1811 if(U_FAILURE(*status)) { |

1812 return 0; |

1813 } |

1814 } else { |

1815 /* we could have fished out a reset here */ |

1816 if(sourceToken->strength != UCOL_TOK_RESET && lastToken != sourceToken) { |

1817 /* otherwise remove sourceToken from where it was. */ |

// The follower inherits the removed token's strength when that one is numerically smaller.
1818 if(sourceToken->next != NULL) { |

1819 if(sourceToken->next->strength > sourceToken->strength) { |

1820 sourceToken->next->strength = sourceToken->strength; |

1821 } |

1822 sourceToken->next->previous = sourceToken->previous; |

1823 } else { |

1824 sourceToken->listHeader->last = sourceToken->previous; |

1825 } |

1826 |

1827 if(sourceToken->previous != NULL) { |

1828 sourceToken->previous->next = sourceToken->next; |

1829 } else { |

1830 sourceToken->listHeader->first = sourceToken->next; |

1831 } |

1832 sourceToken->next = NULL; |

1833 sourceToken->previous = NULL; |

1834 } |

1835 } |

1836 |

1837 sourceToken->strength = src->parsedToken.strength; |

1838 sourceToken->listHeader = lastToken->listHeader; |

1839 |

1840 /* |

1841 1. Find the strongest strength in each list, and set strongestP and strongestN |

1842 accordingly in the headers. |

1843 */ |

1844 if(lastStrength == UCOL_TOK_RESET |

1845 || sourceToken->listHeader->first == 0) { |

1846 /* If LAST is a reset |

1847 insert sourceToken in the list. */ |

1848 if(sourceToken->listHeader->first == 0) { |

1849 sourceToken->listHeader->first = sourceToken; |

1850 sourceToken->listHeader->last = sourceToken; |

1851 } else { /* we need to find a place for us */ |

1852 /* and we'll get in front of the same strength */ |

1853 if(sourceToken->listHeader->first->strength <= sourceToken->strength) { |

1854 sourceToken->next = sourceToken->listHeader->first; |

1855 sourceToken->next->previous = sourceToken; |

1856 sourceToken->listHeader->first = sourceToken; |

1857 sourceToken->previous = NULL; |

1858 } else { |

// From here on lastToken is reused as a list cursor and no longer points at the reset.
1859 lastToken = sourceToken->listHeader->first; |

1860 while(lastToken->next != NULL && lastToken->next->strength > sourceToken->strength) { |

1861 lastToken = lastToken->next; |

1862 } |

1863 if(lastToken->next != NULL) { |

1864 lastToken->next->previous = sourceToken; |

1865 } else { |

1866 sourceToken->listHeader->last = sourceToken; |

1867 } |

1868 sourceToken->previous = lastToken; |

1869 sourceToken->next = lastToken->next; |

1870 lastToken->next = sourceToken; |

1871 } |

1872 } |

1873 } else { |

1874 /* Otherwise (when LAST is not a reset) |

1875 if polarity (LAST) == polarity(relation), insert sourceToken after LAST, |

1876 otherwise insert before. |

1877 when inserting after or before, search to the next position with the same |

1878 strength in that direction. (This is called postpone insertion). */ |

1879 if(sourceToken != lastToken) { |

1880 if(lastToken->polarity == sourceToken->polarity) { |

1881 while(lastToken->next != NULL && lastToken->next->strength > sourceToken->strength) { |

1882 lastToken = lastToken->next; |

1883 } |

1884 sourceToken->previous = lastToken; |

1885 if(lastToken->next != NULL) { |

1886 lastToken->next->previous = sourceToken; |

1887 } else { |

1888 sourceToken->listHeader->last = sourceToken; |

1889 } |

1890 |

1891 sourceToken->next = lastToken->next; |

1892 lastToken->next = sourceToken; |

1893 } else { |

1894 while(lastToken->previous != NULL && lastToken->previous->strength > sourceToken->strength) { |

1895 lastToken = lastToken->previous; |

1896 } |

1897 sourceToken->next = lastToken; |

1898 if(lastToken->previous != NULL) { |

1899 lastToken->previous->next = sourceToken; |

1900 } else { |

1901 sourceToken->listHeader->first = sourceToken; |

1902 } |

1903 sourceToken->previous = lastToken->previous; |

1904 lastToken->previous = sourceToken; |

1905 } |

1906 } else { /* repeated one thing twice in rules, stay with the stronger strength */ |

1907 if(lastStrength < sourceToken->strength) { |

1908 sourceToken->strength = lastStrength; |

1909 } |

1910 } |

1911 } |

1912 |

1913 /* if the token was a variable top, we're gonna put it in */ |

1914 if(variableTop == TRUE && src->varTop == NULL) { |

1915 variableTop = FALSE; |

1916 src->varTop = sourceToken; |

1917 } |

1918 |

1919 // Treat the expansions. |

1920 // There are two types of expansions: explicit (x / y) and reset based propagating expansions |

1921 // (&abc * d * e <=> &ab * d / c * e / c) |

1922 // if both of them are in effect for a token, they are combined. |

1923 |

1924 sourceToken->expansion = src->parsedToken.extensionLen << 24 | src->parsedToken.extensionOffset; |

1925 |

1926 if(expandNext != 0) { |

1927 if(sourceToken->strength == UCOL_PRIMARY) { /* primary strength kills off the implicit expansion */ |

1928 expandNext = 0; |

1929 } else if(sourceToken->expansion == 0) { /* if there is no expansion, implicit is just added to the token */ |

1930 sourceToken->expansion = expandNext; |

1931 } else { /* there is both explicit and implicit expansion. We need to make a combination */ |

// The implicit part (from expandNext) followed by the explicit part is copied to
// src->extraCurrent, and the token's expansion is re-pointed at that copy.
// NOTE(review): no check against src->extraEnd is visible here before the copies - confirm
// that the extra space is guaranteed to be large enough.
1932 uprv_memcpy(src->extraCurrent, src->source + (expandNext & 0xFFFFFF), (expandNext >> 24)*sizeof(UChar)); |

1933 uprv_memcpy(src->extraCurrent+(expandNext >> 24), src->source + src->parsedToken.extensionOffset, src->parsedToken.extensionLen*sizeof(UChar)); |

1934 sourceToken->expansion = (uint32_t)(((expandNext >> 24) + src->parsedToken.extensionLen)<<24 | (uint32_t)(src->extraCurrent - src->source)); |

1935 src->extraCurrent += (expandNext >> 24) + src->parsedToken.extensionLen; |

1936 } |

1937 } |

1938 |

1939 // This is just for debugging purposes |

1940 if(sourceToken->expansion != 0) { |

1941 sourceToken->debugExpansion = *(src->source + src->parsedToken.extensionOffset); |

1942 } else { |

1943 sourceToken->debugExpansion = 0; |

1944 } |

1945 // if the previous token was a reset before, the strength of this |

1946 // token must match the strength of before. Otherwise we have an |

1947 // undefined situation. |

1948 // In other words, we currently have a cludge which we use to |

1949 // represent &a >> x. This is written as &[before 2]a << x. |

// NOTE(review): lastToken may have been moved by the insertion-point searches above,
// so the flags tested here are not necessarily those of the token that preceded this
// one in the rules - confirm this is intended.
1950 if((lastToken->flags & UCOL_TOK_BEFORE) != 0) { |

1951 uint8_t beforeStrength = (lastToken->flags & UCOL_TOK_BEFORE) - 1; |

1952 if(beforeStrength != sourceToken->strength) { |

1953 *status = U_INVALID_FORMAT_ERROR; |

1954 syntaxError(src->source,0,(int32_t)(src->end-src->source),parseError); |

1955 DBG_FORMAT_ERROR |

1956 return 0; |

1957 } |

1958 } |

1959 } else { |

1960 if(lastToken != NULL && lastStrength == UCOL_TOK_RESET) { |

1961 /* if the previous token was also a reset, */ |

1962 /*this means that we have two consecutive resets */ |

1963 /* and we want to remove the previous one if empty*/ |

1964 if(src->resultLen > 0 && ListList[src->resultLen-1].first == NULL) { |

1965 src->resultLen--; |

1966 } |

1967 } |

1968 |

1969 if(sourceToken == NULL) { /* this is a reset, but it might still be somewhere in the tailoring, in shorter form */ |

// Retry the lookup with successively shorter prefixes of the reset string. On a hit,
// the unmatched tail is packed (length << 24 | offset) into expandNext.
1970 uint32_t searchCharsLen = src->parsedToken.charsLen; |

1971 while(searchCharsLen > 1 && sourceToken == NULL) { |

1972 searchCharsLen--; |

1973 //key = searchCharsLen << 24 | charsOffset; |

1974 UColToken key; |

1975 key.source = searchCharsLen << 24 | src->parsedToken.charsOffset; |

1976 key.rulesToParseHdl = &(src->source); |

1977 sourceToken = (UColToken *)uhash_get(src->tailored, &key); |

1978 } |

1979 if(sourceToken != NULL) { |

1980 expandNext = (src->parsedToken.charsLen - searchCharsLen) << 24 | (src->parsedToken.charsOffset + searchCharsLen); |

1981 } |

1982 } |

1983 |

1984 if((specs & UCOL_TOK_BEFORE) != 0) { /* we're doing before */ |

1985 if(top == FALSE) { /* there is no indirection */ |

1986 uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1; |

1987 if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) { |

1988 /* this is a before that is already ordered in the UCA - so we need to get the previous with good strength */ |

1989 while(sourceToken->strength > strength && sourceToken->previous != NULL) { |

1990 sourceToken = sourceToken->previous; |

1991 } |

1992 /* here, either we hit the strength or NULL */ |

1993 if(sourceToken->strength == strength) { |

1994 if(sourceToken->previous != NULL) { |

1995 sourceToken = sourceToken->previous; |

1996 } else { /* start of list */ |

1997 sourceToken = sourceToken->listHeader->reset; |

1998 } |

1999 } else { /* we hit NULL */ |

2000 /* we should be doing the else part */ |

2001 sourceToken = sourceToken->listHeader->reset; |

2002 sourceToken = getVirginBefore(src, sourceToken, strength, parseError, status); |

2003 } |

2004 } else { |

2005 sourceToken = getVirginBefore(src, sourceToken, strength, parseError, status); |

2006 } |

2007 } else { /* this is both before and indirection */ |

2008 top = FALSE; |

2009 ListList[src->resultLen].previousCE = 0; |

2010 ListList[src->resultLen].previousContCE = 0; |

2011 ListList[src->resultLen].indirect = TRUE; |

2012 /* we need to do slightly more work. we need to get the baseCE using the */ |

2013 /* inverse UCA & getPrevious. The next bound is not set, and will be decided */ |

2014 /* in ucol_bld */ |

2015 uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1; |

2016 uint32_t baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE; |

2017 uint32_t baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE;//&0xFFFFFF3F; |

2018 uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND; |

2019 |

// When the lead byte of baseCE lies in the implicit range, the previous CE is computed
// from the raw implicit value minus one; otherwise ucol_inv_getPrevCE() supplies it.
2020 UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts); |

2021 if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && |

2022 (baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */ |

2023 uint32_t primary = (baseCE & UCOL_PRIMARYMASK) | ((baseContCE & UCOL_PRIMARYMASK) >> 16); |

2024 uint32_t raw = uprv_uca_getRawFromImplicit(primary); |

2025 uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw-1); |

2026 CE = (primaryCE & UCOL_PRIMARYMASK) | 0x0505; |

2027 SecondCE = ((primaryCE << 16) & UCOL_PRIMARYMASK) | UCOL_CONTINUATION_MARKER; |

2028 } else { |

2029 /*int32_t invPos = ucol_inv_getPrevCE(baseCE, baseContCE, &CE, &SecondCE, strength);*/ |

2030 ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &SecondCE, strength); |

2031 } |

2032 |

2033 ListList[src->resultLen].baseCE = CE; |

2034 ListList[src->resultLen].baseContCE = SecondCE; |

2035 ListList[src->resultLen].nextCE = 0; |

2036 ListList[src->resultLen].nextContCE = 0; |

2037 |

2038 sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status); |

2039 } |

2040 } |

2041 |

2042 |

2043 /* 5 If the relation is a reset: |

2044 If sourceToken is null |

2045 Create new list, create new sourceToken, make the baseCE from source, put |

2046 the sourceToken in ListHeader of the new list */ |

2047 if(sourceToken == NULL) { |

2048 /* |

2049 3 Consider each item: relation, source, and expansion: e.g. ...< x / y ... |

2050 First convert all expansions into normal form. Examples: |

2051 If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c * |

2052 d * ... into &x * c/y * d * ... |

2053 Note: reset values can never have expansions, although they can cause the |

2054 very next item to have one. They may be contractions, if they are found |

2055 earlier in the list. |

2056 */ |

2057 if(top == FALSE) { |

2058 collIterate s; |

2059 uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND; |

2060 |

2061 uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset, src->parsedToken.charsLen, &s, status); |

2062 |

// The iterator position after the first CE is remembered in 'expand' and handed to
// ucol_tok_initAReset() below.
2063 CE = ucol_getNextCE(src->UCA, &s, status); |

2064 const UChar *expand = s.pos; |

2065 SecondCE = ucol_getNextCE(src->UCA, &s, status); |

2066 |

2067 ListList[src->resultLen].baseCE = CE & 0xFFFFFF3F; |

2068 if(isContinuation(SecondCE)) { |

2069 ListList[src->resultLen].baseContCE = SecondCE; |

2070 } else { |

2071 ListList[src->resultLen].baseContCE = 0; |

2072 } |

2073 ListList[src->resultLen].nextCE = 0; |

2074 ListList[src->resultLen].nextContCE = 0; |

2075 ListList[src->resultLen].previousCE = 0; |

2076 ListList[src->resultLen].previousContCE = 0; |

2077 ListList[src->resultLen].indirect = FALSE; |

2078 sourceToken = ucol_tok_initAReset(src, expand, &expandNext, parseError, status); |

2079 } else { /* top == TRUE */ |

2080 /* just use the supplied values */ |

2081 top = FALSE; |

2082 ListList[src->resultLen].previousCE = 0; |

2083 ListList[src->resultLen].previousContCE = 0; |

2084 ListList[src->resultLen].indirect = TRUE; |

2085 ListList[src->resultLen].baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE; |

2086 ListList[src->resultLen].baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE; |

2087 ListList[src->resultLen].nextCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitCE; |

2088 ListList[src->resultLen].nextContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitContCE; |

2089 |

2090 sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status); |

2091 |

2092 } |

2093 } else { /* reset to something already in rules */ |

2094 top = FALSE; |

2095 } |

2096 } |

2097 /* 7 After all this, set LAST to point to sourceToken, and goto step 3. */ |

2098 lastToken = sourceToken; |

2099 } else { |

2100 if(U_FAILURE(*status)) { |

2101 return 0; |

2102 } |

2103 } |

2104 } |

2105 #ifdef DEBUG_FOR_CODE_POINTS |

2106 fclose(dfcp_fp); |

2107 #endif |

2108 |

2109 |

// Drop a final list header that never received any tokens.
2110 if(src->resultLen > 0 && ListList[src->resultLen-1].first == NULL) { |

2111 src->resultLen--; |

2112 } |

2113 return src->resultLen; |

2114 } |
|
2115 |
|
2116 const UChar* ucol_tok_getRulesFromBundle( |
|
2117 void* /*context*/, |
|
2118 const char* locale, |
|
2119 const char* type, |
|
2120 int32_t* pLength, |
|
2121 UErrorCode* status) |
|
2122 { |
|
2123 const UChar* rules = NULL; |
|
2124 UResourceBundle* bundle; |
|
2125 UResourceBundle* collations; |
|
2126 UResourceBundle* collation; |
|
2127 |
|
2128 *pLength = 0; |
|
2129 |
|
2130 bundle = ures_open(U_ICUDATA_COLL, locale, status); |
|
2131 if(U_SUCCESS(*status)){ |
|
2132 collations = ures_getByKey(bundle, "collations", NULL, status); |
|
2133 if(U_SUCCESS(*status)){ |
|
2134 collation = ures_getByKey(collations, type, NULL, status); |
|
2135 if(U_SUCCESS(*status)){ |
|
2136 rules = ures_getStringByKey(collation, "Sequence", pLength, status); |
|
2137 if(U_FAILURE(*status)){ |
|
2138 *pLength = 0; |
|
2139 rules = NULL; |
|
2140 } |
|
2141 ures_close(collation); |
|
2142 } |
|
2143 ures_close(collations); |
|
2144 } |
|
2145 } |
|
2146 |
|
2147 ures_close(bundle); |
|
2148 |
|
2149 return rules; |
|
2150 } |
|
2151 |
|
2152 void ucol_tok_initTokenList( |
|
2153 UColTokenParser *src, |
|
2154 const UChar *rules, |
|
2155 uint32_t rulesLength, |
|
2156 const UCollator *UCA, |
|
2157 GetCollationRulesFunction importFunc, |
|
2158 void* context, |
|
2159 UErrorCode *status) { |
|
2160 U_NAMESPACE_USE |
|
2161 |
|
2162 uint32_t nSize = 0; |
|
2163 uint32_t estimatedSize = (2*rulesLength+UCOL_TOK_EXTRA_RULE_SPACE_SIZE); |
|
2164 |
|
2165 bool needToDeallocRules = false; |
|
2166 |
|
2167 if(U_FAILURE(*status)) { |
|
2168 return; |
|
2169 } |
|
2170 |
|
2171 // set everything to zero, so that we can clean up gracefully |
|
2172 uprv_memset(src, 0, sizeof(UColTokenParser)); |
|
2173 |
|
2174 // first we need to find options that don't like to be normalized, |
|
2175 // like copy and remove... |
|
2176 //const UChar *openBrace = rules; |
|
2177 int32_t optionNumber = -1; |
|
2178 const UChar *setStart = NULL; |
|
2179 uint32_t i = 0; |
|
2180 while(i < rulesLength) { |
|
2181 if(rules[i] == 0x005B) { // '[': start of an option |
|
2182 /* Gets the following: |
|
2183 optionNumber: The index of the option. |
|
2184 setStart: The pointer at which the option arguments start. |
|
2185 */ |
|
2186 optionNumber = ucol_uprv_tok_readOption(rules+i+1, rules+rulesLength, &setStart); |
|
2187 |
|
2188 if(optionNumber == OPTION_OPTIMIZE) { /* copy - parts of UCA to tailoring */ |
|
2189 // [optimize] |
|
2190 USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rules+rulesLength, status); |
|
2191 if(U_SUCCESS(*status)) { |
|
2192 if(src->copySet == NULL) { |
|
2193 src->copySet = newSet; |
|
2194 } else { |
|
2195 uset_addAll(src->copySet, newSet); |
|
2196 uset_close(newSet); |
|
2197 } |
|
2198 } else { |
|
2199 return; |
|
2200 } |
|
2201 } else if(optionNumber == OPTION_SUPPRESS_CONTRACTIONS) { |
|
2202 USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rules+rulesLength, status); |
|
2203 if(U_SUCCESS(*status)) { |
|
2204 if(src->removeSet == NULL) { |
|
2205 src->removeSet = newSet; |
|
2206 } else { |
|
2207 uset_addAll(src->removeSet, newSet); |
|
2208 uset_close(newSet); |
|
2209 } |
|
2210 } else { |
|
2211 return; |
|
2212 } |
|
2213 } else if(optionNumber == OPTION_IMPORT){ |
|
2214 // [import <collation-name>] |
|
2215 |
|
2216 // Find the address of the closing ]. |
|
2217 UChar* import_end = u_strchr(setStart, 0x005D); |
|
2218 int32_t optionEndOffset = (int32_t)(import_end + 1 - rules); |
|
2219 // Ignore trailing whitespace. |
|
2220 while(PatternProps::isWhiteSpace(*(import_end-1))) { |
|
2221 --import_end; |
|
2222 } |
|
2223 |
|
2224 int32_t optionLength = (int32_t)(import_end - setStart); |
|
2225 char option[50]; |
|
2226 if(optionLength >= (int32_t)sizeof(option)) { |
|
2227 *status = U_ILLEGAL_ARGUMENT_ERROR; |
|
2228 return; |
|
2229 } |
|
2230 u_UCharsToChars(setStart, option, optionLength); |
|
2231 option[optionLength] = 0; |
|
2232 |
|
2233 *status = U_ZERO_ERROR; |
|
2234 char locale[50]; |
|
2235 int32_t templ; |
|
2236 uloc_forLanguageTag(option, locale, (int32_t)sizeof(locale), &templ, status); |
|
2237 if(U_FAILURE(*status)) { |
|
2238 *status = U_ILLEGAL_ARGUMENT_ERROR; |
|
2239 return; |
|
2240 } |
|
2241 |
|
2242 char type[50]; |
|
2243 if (uloc_getKeywordValue(locale, "collation", type, (int32_t)sizeof(type), status) <= 0 || |
|
2244 U_FAILURE(*status) |
|
2245 ) { |
|
2246 *status = U_ZERO_ERROR; |
|
2247 uprv_strcpy(type, "standard"); |
|
2248 } |
|
2249 |
|
2250 // TODO: Use public functions when available, see ticket #8134. |
|
2251 char *keywords = (char *)locale_getKeywordsStart(locale); |
|
2252 if(keywords != NULL) { |
|
2253 *keywords = 0; |
|
2254 } |
|
2255 |
|
2256 int32_t importRulesLength = 0; |
|
2257 const UChar* importRules = importFunc(context, locale, type, &importRulesLength, status); |
|
2258 |
|
2259 #ifdef DEBUG_FOR_COLL_RULES |
|
2260 std::string s; |
|
2261 UnicodeString(importRules).toUTF8String(s); |
|
2262 std::cout << "Import rules = " << s << std::endl; |
|
2263 #endif |
|
2264 |
|
2265 // Add the length of the imported rules to length of the original rules, |
|
2266 // and subtract the length of the import option. |
|
2267 uint32_t newRulesLength = rulesLength + importRulesLength - (optionEndOffset - i); |
|
2268 |
|
2269 UChar* newRules = (UChar*)uprv_malloc(newRulesLength*sizeof(UChar)); |
|
2270 |
|
2271 #ifdef DEBUG_FOR_COLL_RULES |
|
2272 std::string s1; |
|
2273 UnicodeString(rules).toUTF8String(s1); |
|
2274 std::cout << "Original rules = " << s1 << std::endl; |
|
2275 #endif |
|
2276 |
|
2277 |
|
2278 // Copy the section of the original rules leading up to the import |
|
2279 uprv_memcpy(newRules, rules, i*sizeof(UChar)); |
|
2280 // Copy the imported rules |
|
2281 uprv_memcpy(newRules+i, importRules, importRulesLength*sizeof(UChar)); |
|
2282 // Copy the rest of the original rules (minus the import option itself) |
|
2283 uprv_memcpy(newRules+i+importRulesLength, |
|
2284 rules+optionEndOffset, |
|
2285 (rulesLength-optionEndOffset)*sizeof(UChar)); |
|
2286 |
|
2287 #ifdef DEBUG_FOR_COLL_RULES |
|
2288 std::string s2; |
|
2289 UnicodeString(newRules).toUTF8String(s2); |
|
2290 std::cout << "Resulting rules = " << s2 << std::endl; |
|
2291 #endif |
|
2292 |
|
2293 if(needToDeallocRules){ |
|
2294 // if needToDeallocRules is set, then we allocated rules, so it's safe to cast and free |
|
2295 uprv_free((void*)rules); |
|
2296 } |
|
2297 needToDeallocRules = true; |
|
2298 rules = newRules; |
|
2299 rulesLength = newRulesLength; |
|
2300 |
|
2301 estimatedSize += importRulesLength*2; |
|
2302 |
|
2303 // First character of the new rules needs to be processed |
|
2304 i--; |
|
2305 } |
|
2306 } |
|
2307 //openBrace++; |
|
2308 i++; |
|
2309 } |
|
2310 |
|
2311 src->source = (UChar *)uprv_malloc(estimatedSize*sizeof(UChar)); |
|
2312 /* test for NULL */ |
|
2313 if (src->source == NULL) { |
|
2314 *status = U_MEMORY_ALLOCATION_ERROR; |
|
2315 return; |
|
2316 } |
|
2317 uprv_memset(src->source, 0, estimatedSize*sizeof(UChar)); |
|
2318 nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, estimatedSize, status); |
|
2319 if(nSize > estimatedSize || *status == U_BUFFER_OVERFLOW_ERROR) { |
|
2320 *status = U_ZERO_ERROR; |
|
2321 src->source = (UChar *)uprv_realloc(src->source, (nSize+UCOL_TOK_EXTRA_RULE_SPACE_SIZE)*sizeof(UChar)); |
|
2322 /* test for NULL */ |
|
2323 if (src->source == NULL) { |
|
2324 *status = U_MEMORY_ALLOCATION_ERROR; |
|
2325 return; |
|
2326 } |
|
2327 nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, nSize+UCOL_TOK_EXTRA_RULE_SPACE_SIZE, status); |
|
2328 } |
|
2329 if(needToDeallocRules){ |
|
2330 // if needToDeallocRules is set, then we allocated rules, so it's safe to cast and free |
|
2331 uprv_free((void*)rules); |
|
2332 } |
|
2333 |
|
2334 |
|
2335 src->current = src->source; |
|
2336 src->end = src->source+nSize; |
|
2337 src->sourceCurrent = src->source; |
|
2338 src->extraCurrent = src->end+1; // Preserve terminating zero in the rule string so that option scanning works correctly |
|
2339 src->extraEnd = src->source+estimatedSize; //src->end+UCOL_TOK_EXTRA_RULE_SPACE_SIZE; |
|
2340 src->varTop = NULL; |
|
2341 src->UCA = UCA; |
|
2342 src->invUCA = ucol_initInverseUCA(status); |
|
2343 src->parsedToken.charsLen = 0; |
|
2344 src->parsedToken.charsOffset = 0; |
|
2345 src->parsedToken.extensionLen = 0; |
|
2346 src->parsedToken.extensionOffset = 0; |
|
2347 src->parsedToken.prefixLen = 0; |
|
2348 src->parsedToken.prefixOffset = 0; |
|
2349 src->parsedToken.flags = 0; |
|
2350 src->parsedToken.strength = UCOL_TOK_UNSET; |
|
2351 src->buildCCTabFlag = FALSE; |
|
2352 src->isStarred = FALSE; |
|
2353 src->inRange = FALSE; |
|
2354 src->lastRangeCp = 0; |
|
2355 src->previousCp = 0; |
|
2356 |
|
2357 if(U_FAILURE(*status)) { |
|
2358 return; |
|
2359 } |
|
2360 src->tailored = uhash_open(uhash_hashTokens, uhash_compareTokens, NULL, status); |
|
2361 if(U_FAILURE(*status)) { |
|
2362 return; |
|
2363 } |
|
2364 uhash_setValueDeleter(src->tailored, uprv_free); |
|
2365 |
|
2366 src->opts = (UColOptionSet *)uprv_malloc(sizeof(UColOptionSet)); |
|
2367 /* test for NULL */ |
|
2368 if (src->opts == NULL) { |
|
2369 *status = U_MEMORY_ALLOCATION_ERROR; |
|
2370 return; |
|
2371 } |
|
2372 |
|
2373 uprv_memcpy(src->opts, UCA->options, sizeof(UColOptionSet)); |
|
2374 |
|
2375 src->lh = 0; |
|
2376 src->listCapacity = 1024; |
|
2377 src->lh = (UColTokListHeader *)uprv_malloc(src->listCapacity*sizeof(UColTokListHeader)); |
|
2378 //Test for NULL |
|
2379 if (src->lh == NULL) { |
|
2380 *status = U_MEMORY_ALLOCATION_ERROR; |
|
2381 return; |
|
2382 } |
|
2383 uprv_memset(src->lh, 0, src->listCapacity*sizeof(UColTokListHeader)); |
|
2384 src->resultLen = 0; |
|
2385 |
|
2386 UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts); |
|
2387 |
|
2388 // UCOL_RESET_TOP_VALUE |
|
2389 setIndirectBoundaries(0, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IMPLICIT); |
|
2390 // UCOL_FIRST_PRIMARY_IGNORABLE |
|
2391 setIndirectBoundaries(1, consts->UCA_FIRST_PRIMARY_IGNORABLE, 0); |
|
2392 // UCOL_LAST_PRIMARY_IGNORABLE |
|
2393 setIndirectBoundaries(2, consts->UCA_LAST_PRIMARY_IGNORABLE, 0); |
|
2394 // UCOL_FIRST_SECONDARY_IGNORABLE |
|
2395 setIndirectBoundaries(3, consts->UCA_FIRST_SECONDARY_IGNORABLE, 0); |
|
2396 // UCOL_LAST_SECONDARY_IGNORABLE |
|
2397 setIndirectBoundaries(4, consts->UCA_LAST_SECONDARY_IGNORABLE, 0); |
|
2398 // UCOL_FIRST_TERTIARY_IGNORABLE |
|
2399 setIndirectBoundaries(5, consts->UCA_FIRST_TERTIARY_IGNORABLE, 0); |
|
2400 // UCOL_LAST_TERTIARY_IGNORABLE |
|
2401 setIndirectBoundaries(6, consts->UCA_LAST_TERTIARY_IGNORABLE, 0); |
|
2402 // UCOL_FIRST_VARIABLE |
|
2403 setIndirectBoundaries(7, consts->UCA_FIRST_VARIABLE, 0); |
|
2404 // UCOL_LAST_VARIABLE |
|
2405 setIndirectBoundaries(8, consts->UCA_LAST_VARIABLE, 0); |
|
2406 // UCOL_FIRST_NON_VARIABLE |
|
2407 setIndirectBoundaries(9, consts->UCA_FIRST_NON_VARIABLE, 0); |
|
2408 // UCOL_LAST_NON_VARIABLE |
|
2409 setIndirectBoundaries(10, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IMPLICIT); |
|
2410 // UCOL_FIRST_IMPLICIT |
|
2411 setIndirectBoundaries(11, consts->UCA_FIRST_IMPLICIT, 0); |
|
2412 // UCOL_LAST_IMPLICIT |
|
2413 setIndirectBoundaries(12, consts->UCA_LAST_IMPLICIT, consts->UCA_FIRST_TRAILING); |
|
2414 // UCOL_FIRST_TRAILING |
|
2415 setIndirectBoundaries(13, consts->UCA_FIRST_TRAILING, 0); |
|
2416 // UCOL_LAST_TRAILING |
|
2417 setIndirectBoundaries(14, consts->UCA_LAST_TRAILING, 0); |
|
2418 ucolIndirectBoundaries[14].limitCE = (consts->UCA_PRIMARY_SPECIAL_MIN<<24); |
|
2419 } |
|
2420 |
|
2421 |
|
2422 void ucol_tok_closeTokenList(UColTokenParser *src) { |
|
2423 if(src->copySet != NULL) { |
|
2424 uset_close(src->copySet); |
|
2425 } |
|
2426 if(src->removeSet != NULL) { |
|
2427 uset_close(src->removeSet); |
|
2428 } |
|
2429 if(src->tailored != NULL) { |
|
2430 uhash_close(src->tailored); |
|
2431 } |
|
2432 if(src->lh != NULL) { |
|
2433 uprv_free(src->lh); |
|
2434 } |
|
2435 if(src->source != NULL) { |
|
2436 uprv_free(src->source); |
|
2437 } |
|
2438 if(src->opts != NULL) { |
|
2439 uprv_free(src->opts); |
|
2440 } |
|
2441 if (src->reorderCodes != NULL) { |
|
2442 uprv_free(src->reorderCodes); |
|
2443 } |
|
2444 } |
|
2445 |
|
2446 #endif /* #if !UCONFIG_NO_COLLATION */ |