|
1 /* |
|
2 ******************************************************************************* |
|
3 * |
|
4 * Copyright (C) 2001-2013, International Business Machines |
|
5 * Corporation and others. All Rights Reserved. |
|
6 * |
|
7 ******************************************************************************* |
|
8 * file name: ucol_bld.cpp |
|
9 * encoding: US-ASCII |
|
10 * tab size: 8 (not used) |
|
11 * indentation:4 |
|
12 * |
|
13 * created 02/22/2001 |
|
14 * created by: Vladimir Weinstein |
|
15 * |
|
16 * This module builds a collator based on the rule set. |
|
17 * |
|
18 */ |
|
19 |
|
20 #include "unicode/utypes.h" |
|
21 |
|
22 #if !UCONFIG_NO_COLLATION |
|
23 |
|
24 #include "unicode/ucoleitr.h" |
|
25 #include "unicode/udata.h" |
|
26 #include "unicode/uchar.h" |
|
27 #include "unicode/uniset.h" |
|
28 #include "unicode/uscript.h" |
|
29 #include "unicode/ustring.h" |
|
30 #include "unicode/utf16.h" |
|
31 #include "normalizer2impl.h" |
|
32 #include "uassert.h" |
|
33 #include "ucol_bld.h" |
|
34 #include "ucol_elm.h" |
|
35 #include "ucol_cnt.h" |
|
36 #include "ucln_in.h" |
|
37 #include "umutex.h" |
|
38 #include "cmemory.h" |
|
39 #include "cstring.h" |
|
40 |
|
/*
 * Number of elements in a true array (not a pointer), as an int32_t.
 * The whole expansion is parenthesized so that the macro behaves as a
 * single primary expression wherever it is used.
 */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
|
42 |
|
43 static const InverseUCATableHeader* _staticInvUCA = NULL; |
|
44 static UDataMemory* invUCA_DATA_MEM = NULL; |
|
45 static icu::UInitOnce gStaticInvUCAInitOnce = U_INITONCE_INITIALIZER; |
|
46 |
|
47 U_CDECL_BEGIN |
|
48 static UBool U_CALLCONV |
|
49 isAcceptableInvUCA(void * /*context*/, |
|
50 const char * /*type*/, const char * /*name*/, |
|
51 const UDataInfo *pInfo) |
|
52 { |
|
53 /* context, type & name are intentionally not used */ |
|
54 if( pInfo->size>=20 && |
|
55 pInfo->isBigEndian==U_IS_BIG_ENDIAN && |
|
56 pInfo->charsetFamily==U_CHARSET_FAMILY && |
|
57 pInfo->dataFormat[0]==INVUCA_DATA_FORMAT_0 && /* dataFormat="InvC" */ |
|
58 pInfo->dataFormat[1]==INVUCA_DATA_FORMAT_1 && |
|
59 pInfo->dataFormat[2]==INVUCA_DATA_FORMAT_2 && |
|
60 pInfo->dataFormat[3]==INVUCA_DATA_FORMAT_3 && |
|
61 pInfo->formatVersion[0]==INVUCA_FORMAT_VERSION_0 && |
|
62 pInfo->formatVersion[1]>=INVUCA_FORMAT_VERSION_1 //&& |
|
63 //pInfo->formatVersion[1]==INVUCA_FORMAT_VERSION_1 && |
|
64 //pInfo->formatVersion[2]==INVUCA_FORMAT_VERSION_2 && |
|
65 //pInfo->formatVersion[3]==INVUCA_FORMAT_VERSION_3 && |
|
66 ) |
|
67 { |
|
68 // TODO: Check that the invuca data version (pInfo->dataVersion) |
|
69 // matches the ucadata version. |
|
70 return TRUE; |
|
71 } else { |
|
72 return FALSE; |
|
73 } |
|
74 } |
|
75 U_CDECL_END |
|
76 |
|
77 /* |
|
78 * Takes two CEs (lead and continuation) and |
|
79 * compares them as CEs should be compared: |
|
80 * primary vs. primary, secondary vs. secondary |
|
81 * tertiary vs. tertiary |
|
82 */ |
|
83 static int32_t compareCEs(uint32_t source0, uint32_t source1, uint32_t target0, uint32_t target1) { |
|
84 uint32_t s1 = source0, s2, t1 = target0, t2; |
|
85 if(isContinuation(source1)) { |
|
86 s2 = source1; |
|
87 } else { |
|
88 s2 = 0; |
|
89 } |
|
90 if(isContinuation(target1)) { |
|
91 t2 = target1; |
|
92 } else { |
|
93 t2 = 0; |
|
94 } |
|
95 |
|
96 uint32_t s = 0, t = 0; |
|
97 if(s1 == t1 && s2 == t2) { |
|
98 return 0; |
|
99 } |
|
100 s = (s1 & 0xFFFF0000)|((s2 & 0xFFFF0000)>>16); |
|
101 t = (t1 & 0xFFFF0000)|((t2 & 0xFFFF0000)>>16); |
|
102 if(s < t) { |
|
103 return -1; |
|
104 } else if(s > t) { |
|
105 return 1; |
|
106 } else { |
|
107 s = (s1 & 0x0000FF00) | (s2 & 0x0000FF00)>>8; |
|
108 t = (t1 & 0x0000FF00) | (t2 & 0x0000FF00)>>8; |
|
109 if(s < t) { |
|
110 return -1; |
|
111 } else if(s > t) { |
|
112 return 1; |
|
113 } else { |
|
114 s = (s1 & 0x000000FF)<<8 | (s2 & 0x000000FF); |
|
115 t = (t1 & 0x000000FF)<<8 | (t2 & 0x000000FF); |
|
116 if(s < t) { |
|
117 return -1; |
|
118 } else { |
|
119 return 1; |
|
120 } |
|
121 } |
|
122 } |
|
123 } |
|
124 |
|
125 static |
|
126 int32_t ucol_inv_findCE(const UColTokenParser *src, uint32_t CE, uint32_t SecondCE) { |
|
127 uint32_t bottom = 0, top = src->invUCA->tableSize; |
|
128 uint32_t i = 0; |
|
129 uint32_t first = 0, second = 0; |
|
130 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); |
|
131 int32_t res = 0; |
|
132 |
|
133 while(bottom < top-1) { |
|
134 i = (top+bottom)/2; |
|
135 first = *(CETable+3*i); |
|
136 second = *(CETable+3*i+1); |
|
137 res = compareCEs(first, second, CE, SecondCE); |
|
138 if(res > 0) { |
|
139 top = i; |
|
140 } else if(res < 0) { |
|
141 bottom = i; |
|
142 } else { |
|
143 break; |
|
144 } |
|
145 } |
|
146 |
|
147 /* weiv: */ |
|
148 /* in searching for elements, I have removed the failure */ |
|
149 /* The reason for this is that the builder does not rely */ |
|
150 /* on search mechanism telling it that it didn't find an */ |
|
151 /* element. However, indirect positioning relies on being */ |
|
152 /* able to find the elements around any CE, even if it is */ |
|
153 /* not defined in the UCA. */ |
|
154 return i; |
|
155 /* |
|
156 if((first == CE && second == SecondCE)) { |
|
157 return i; |
|
158 } else { |
|
159 return -1; |
|
160 } |
|
161 */ |
|
162 } |
|
163 |
|
164 static const uint32_t strengthMask[UCOL_CE_STRENGTH_LIMIT] = { |
|
165 0xFFFF0000, |
|
166 0xFFFFFF00, |
|
167 0xFFFFFFFF |
|
168 }; |
|
169 |
|
170 U_CAPI int32_t U_EXPORT2 ucol_inv_getNextCE(const UColTokenParser *src, |
|
171 uint32_t CE, uint32_t contCE, |
|
172 uint32_t *nextCE, uint32_t *nextContCE, |
|
173 uint32_t strength) |
|
174 { |
|
175 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); |
|
176 int32_t iCE; |
|
177 |
|
178 iCE = ucol_inv_findCE(src, CE, contCE); |
|
179 |
|
180 if(iCE<0) { |
|
181 *nextCE = UCOL_NOT_FOUND; |
|
182 return -1; |
|
183 } |
|
184 |
|
185 CE &= strengthMask[strength]; |
|
186 contCE &= strengthMask[strength]; |
|
187 |
|
188 *nextCE = CE; |
|
189 *nextContCE = contCE; |
|
190 |
|
191 while((*nextCE & strengthMask[strength]) == CE |
|
192 && (*nextContCE & strengthMask[strength]) == contCE) |
|
193 { |
|
194 *nextCE = (*(CETable+3*(++iCE))); |
|
195 *nextContCE = (*(CETable+3*(iCE)+1)); |
|
196 } |
|
197 |
|
198 return iCE; |
|
199 } |
|
200 |
|
201 U_CFUNC int32_t U_EXPORT2 ucol_inv_getPrevCE(const UColTokenParser *src, |
|
202 uint32_t CE, uint32_t contCE, |
|
203 uint32_t *prevCE, uint32_t *prevContCE, |
|
204 uint32_t strength) |
|
205 { |
|
206 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); |
|
207 int32_t iCE; |
|
208 |
|
209 iCE = ucol_inv_findCE(src, CE, contCE); |
|
210 |
|
211 if(iCE<0) { |
|
212 *prevCE = UCOL_NOT_FOUND; |
|
213 return -1; |
|
214 } |
|
215 |
|
216 CE &= strengthMask[strength]; |
|
217 contCE &= strengthMask[strength]; |
|
218 |
|
219 *prevCE = CE; |
|
220 *prevContCE = contCE; |
|
221 |
|
222 while((*prevCE & strengthMask[strength]) == CE |
|
223 && (*prevContCE & strengthMask[strength])== contCE |
|
224 && iCE > 0) /* this condition should prevent falling off the edge of the world */ |
|
225 { |
|
226 /* here, we end up in a singularity - zero */ |
|
227 *prevCE = (*(CETable+3*(--iCE))); |
|
228 *prevContCE = (*(CETable+3*(iCE)+1)); |
|
229 } |
|
230 |
|
231 return iCE; |
|
232 } |
|
233 |
|
234 U_CFUNC uint32_t U_EXPORT2 ucol_getCEStrengthDifference(uint32_t CE, uint32_t contCE, |
|
235 uint32_t prevCE, uint32_t prevContCE) |
|
236 { |
|
237 if(prevCE == CE && prevContCE == contCE) { |
|
238 return UCOL_IDENTICAL; |
|
239 } |
|
240 if((prevCE & strengthMask[UCOL_PRIMARY]) != (CE & strengthMask[UCOL_PRIMARY]) |
|
241 || (prevContCE & strengthMask[UCOL_PRIMARY]) != (contCE & strengthMask[UCOL_PRIMARY])) |
|
242 { |
|
243 return UCOL_PRIMARY; |
|
244 } |
|
245 if((prevCE & strengthMask[UCOL_SECONDARY]) != (CE & strengthMask[UCOL_SECONDARY]) |
|
246 || (prevContCE & strengthMask[UCOL_SECONDARY]) != (contCE & strengthMask[UCOL_SECONDARY])) |
|
247 { |
|
248 return UCOL_SECONDARY; |
|
249 } |
|
250 return UCOL_TERTIARY; |
|
251 } |
|
252 |
|
253 |
|
254 /*static |
|
255 inline int32_t ucol_inv_getPrevious(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) { |
|
256 |
|
257 uint32_t CE = lh->baseCE; |
|
258 uint32_t SecondCE = lh->baseContCE; |
|
259 |
|
260 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); |
|
261 uint32_t previousCE, previousContCE; |
|
262 int32_t iCE; |
|
263 |
|
264 iCE = ucol_inv_findCE(src, CE, SecondCE); |
|
265 |
|
266 if(iCE<0) { |
|
267 return -1; |
|
268 } |
|
269 |
|
270 CE &= strengthMask[strength]; |
|
271 SecondCE &= strengthMask[strength]; |
|
272 |
|
273 previousCE = CE; |
|
274 previousContCE = SecondCE; |
|
275 |
|
276 while((previousCE & strengthMask[strength]) == CE && (previousContCE & strengthMask[strength])== SecondCE) { |
|
277 previousCE = (*(CETable+3*(--iCE))); |
|
278 previousContCE = (*(CETable+3*(iCE)+1)); |
|
279 } |
|
280 lh->previousCE = previousCE; |
|
281 lh->previousContCE = previousContCE; |
|
282 |
|
283 return iCE; |
|
284 }*/ |
|
285 |
|
286 static |
|
287 inline int32_t ucol_inv_getNext(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) { |
|
288 uint32_t CE = lh->baseCE; |
|
289 uint32_t SecondCE = lh->baseContCE; |
|
290 |
|
291 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); |
|
292 uint32_t nextCE, nextContCE; |
|
293 int32_t iCE; |
|
294 |
|
295 iCE = ucol_inv_findCE(src, CE, SecondCE); |
|
296 |
|
297 if(iCE<0) { |
|
298 return -1; |
|
299 } |
|
300 |
|
301 CE &= strengthMask[strength]; |
|
302 SecondCE &= strengthMask[strength]; |
|
303 |
|
304 nextCE = CE; |
|
305 nextContCE = SecondCE; |
|
306 |
|
307 while((nextCE & strengthMask[strength]) == CE |
|
308 && (nextContCE & strengthMask[strength]) == SecondCE) |
|
309 { |
|
310 nextCE = (*(CETable+3*(++iCE))); |
|
311 nextContCE = (*(CETable+3*(iCE)+1)); |
|
312 } |
|
313 |
|
314 lh->nextCE = nextCE; |
|
315 lh->nextContCE = nextContCE; |
|
316 |
|
317 return iCE; |
|
318 } |
|
319 |
|
320 static void ucol_inv_getGapPositions(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) { |
|
321 /* reset all the gaps */ |
|
322 int32_t i = 0; |
|
323 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); |
|
324 uint32_t st = 0; |
|
325 uint32_t t1, t2; |
|
326 int32_t pos; |
|
327 |
|
328 UColToken *tok = lh->first; |
|
329 uint32_t tokStrength = tok->strength; |
|
330 |
|
331 for(i = 0; i<3; i++) { |
|
332 lh->gapsHi[3*i] = 0; |
|
333 lh->gapsHi[3*i+1] = 0; |
|
334 lh->gapsHi[3*i+2] = 0; |
|
335 lh->gapsLo[3*i] = 0; |
|
336 lh->gapsLo[3*i+1] = 0; |
|
337 lh->gapsLo[3*i+2] = 0; |
|
338 lh->numStr[i] = 0; |
|
339 lh->fStrToken[i] = NULL; |
|
340 lh->lStrToken[i] = NULL; |
|
341 lh->pos[i] = -1; |
|
342 } |
|
343 |
|
344 UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts); |
|
345 |
|
346 if((lh->baseCE & 0xFF000000)>= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (lh->baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */ |
|
347 //if(lh->baseCE >= PRIMARY_IMPLICIT_MIN && lh->baseCE < PRIMARY_IMPLICIT_MAX ) { /* implicits - */ |
|
348 lh->pos[0] = 0; |
|
349 t1 = lh->baseCE; |
|
350 t2 = lh->baseContCE & UCOL_REMOVE_CONTINUATION; |
|
351 lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16; |
|
352 lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8; |
|
353 lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16; |
|
354 uint32_t primaryCE = (t1 & UCOL_PRIMARYMASK) | ((t2 & UCOL_PRIMARYMASK) >> 16); |
|
355 primaryCE = uprv_uca_getImplicitFromRaw(uprv_uca_getRawFromImplicit(primaryCE)+1); |
|
356 |
|
357 t1 = (primaryCE & UCOL_PRIMARYMASK) | 0x0505; |
|
358 t2 = (primaryCE << 16) & UCOL_PRIMARYMASK; // | UCOL_CONTINUATION_MARKER; |
|
359 |
|
360 lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16; |
|
361 lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8; |
|
362 lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16; |
|
363 } else if(lh->indirect == TRUE && lh->nextCE != 0) { |
|
364 //} else if(lh->baseCE == UCOL_RESET_TOP_VALUE && lh->baseContCE == 0) { |
|
365 lh->pos[0] = 0; |
|
366 t1 = lh->baseCE; |
|
367 t2 = lh->baseContCE&UCOL_REMOVE_CONTINUATION; |
|
368 lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16; |
|
369 lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8; |
|
370 lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16; |
|
371 t1 = lh->nextCE; |
|
372 t2 = lh->nextContCE&UCOL_REMOVE_CONTINUATION; |
|
373 lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16; |
|
374 lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8; |
|
375 lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16; |
|
376 } else { |
|
377 for(;;) { |
|
378 if(tokStrength < UCOL_CE_STRENGTH_LIMIT) { |
|
379 if((lh->pos[tokStrength] = ucol_inv_getNext(src, lh, tokStrength)) >= 0) { |
|
380 lh->fStrToken[tokStrength] = tok; |
|
381 } else { /* The CE must be implicit, since it's not in the table */ |
|
382 /* Error */ |
|
383 *status = U_INTERNAL_PROGRAM_ERROR; |
|
384 } |
|
385 } |
|
386 |
|
387 while(tok != NULL && tok->strength >= tokStrength) { |
|
388 if(tokStrength < UCOL_CE_STRENGTH_LIMIT) { |
|
389 lh->lStrToken[tokStrength] = tok; |
|
390 } |
|
391 tok = tok->next; |
|
392 } |
|
393 if(tokStrength < UCOL_CE_STRENGTH_LIMIT-1) { |
|
394 /* check if previous interval is the same and merge the intervals if it is so */ |
|
395 if(lh->pos[tokStrength] == lh->pos[tokStrength+1]) { |
|
396 lh->fStrToken[tokStrength] = lh->fStrToken[tokStrength+1]; |
|
397 lh->fStrToken[tokStrength+1] = NULL; |
|
398 lh->lStrToken[tokStrength+1] = NULL; |
|
399 lh->pos[tokStrength+1] = -1; |
|
400 } |
|
401 } |
|
402 if(tok != NULL) { |
|
403 tokStrength = tok->strength; |
|
404 } else { |
|
405 break; |
|
406 } |
|
407 } |
|
408 for(st = 0; st < 3; st++) { |
|
409 if((pos = lh->pos[st]) >= 0) { |
|
410 t1 = *(CETable+3*(pos)); |
|
411 t2 = *(CETable+3*(pos)+1); |
|
412 lh->gapsHi[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16; |
|
413 lh->gapsHi[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8; |
|
414 //lh->gapsHi[3*st+2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16; |
|
415 lh->gapsHi[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16; |
|
416 //pos--; |
|
417 //t1 = *(CETable+3*(pos)); |
|
418 //t2 = *(CETable+3*(pos)+1); |
|
419 t1 = lh->baseCE; |
|
420 t2 = lh->baseContCE; |
|
421 lh->gapsLo[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16; |
|
422 lh->gapsLo[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8; |
|
423 lh->gapsLo[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16; |
|
424 } |
|
425 } |
|
426 } |
|
427 } |
|
428 |
|
429 |
|
/*
 * Stores in noOfBytes how many bytes of the 32-bit weight `value` are in use,
 * counted from the most significant byte down to the lowest non-zero byte:
 * 0 for 0, 1 for 0x12000000, 2 for 0x00120000, 4 for 0x00000001.
 *
 * The body is wrapped in do { } while(0) so that the macro is a single
 * statement (safe in if/else), `value` is evaluated only once, and the
 * locals carry a macro-specific name so that they cannot capture a caller
 * variable (for instance one called `mask`).
 * Use it as a statement, followed by a semicolon.
 */
#define ucol_countBytes(value, noOfBytes)                                   \
do {                                                                        \
    uint32_t ucolCountBytesMask_ = 0xFFFFFFFF;                              \
    (noOfBytes) = 0;                                                        \
    const uint32_t ucolCountBytesValue_ = (uint32_t)(value);                \
    while(ucolCountBytesMask_ != 0) {                                       \
        if((ucolCountBytesValue_ & ucolCountBytesMask_) != 0) {             \
            (noOfBytes)++;                                                  \
        }                                                                   \
        ucolCountBytesMask_ >>= 8;                                          \
    }                                                                       \
} while(0)
|
441 |
|
442 static uint32_t ucol_getNextGenerated(ucolCEGenerator *g, UErrorCode *status) { |
|
443 if(U_SUCCESS(*status)) { |
|
444 g->current = ucol_nextWeight(g->ranges, &g->noOfRanges); |
|
445 } |
|
446 return g->current; |
|
447 } |
|
448 |
|
449 static uint32_t ucol_getSimpleCEGenerator(ucolCEGenerator *g, UColToken *tok, uint32_t strength, UErrorCode *status) { |
|
450 /* TODO: rename to enum names */ |
|
451 uint32_t high, low, count=1; |
|
452 uint32_t maxByte = (strength == UCOL_TERTIARY)?0x3F:0xFF; |
|
453 |
|
454 if(strength == UCOL_SECONDARY) { |
|
455 low = UCOL_COMMON_TOP2<<24; |
|
456 high = 0xFFFFFFFF; |
|
457 count = 0xFF - UCOL_COMMON_TOP2; |
|
458 } else { |
|
459 low = UCOL_BYTE_COMMON << 24; //0x05000000; |
|
460 high = 0x40000000; |
|
461 count = 0x40 - UCOL_BYTE_COMMON; |
|
462 } |
|
463 |
|
464 if(tok->next != NULL && tok->next->strength == strength) { |
|
465 count = tok->next->toInsert; |
|
466 } |
|
467 |
|
468 g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges); |
|
469 g->current = UCOL_BYTE_COMMON<<24; |
|
470 |
|
471 if(g->noOfRanges == 0) { |
|
472 *status = U_INTERNAL_PROGRAM_ERROR; |
|
473 } |
|
474 return g->current; |
|
475 } |
|
476 |
|
477 static uint32_t ucol_getCEGenerator(ucolCEGenerator *g, uint32_t* lows, uint32_t* highs, UColToken *tok, uint32_t fStrength, UErrorCode *status) { |
|
478 uint32_t strength = tok->strength; |
|
479 uint32_t low = lows[fStrength*3+strength]; |
|
480 uint32_t high = highs[fStrength*3+strength]; |
|
481 uint32_t maxByte = 0; |
|
482 if(strength == UCOL_TERTIARY) { |
|
483 maxByte = 0x3F; |
|
484 } else if(strength == UCOL_PRIMARY) { |
|
485 maxByte = 0xFE; |
|
486 } else { |
|
487 maxByte = 0xFF; |
|
488 } |
|
489 |
|
490 uint32_t count = tok->toInsert; |
|
491 |
|
492 if(low >= high && strength > UCOL_PRIMARY) { |
|
493 int32_t s = strength; |
|
494 for(;;) { |
|
495 s--; |
|
496 if(lows[fStrength*3+s] != highs[fStrength*3+s]) { |
|
497 if(strength == UCOL_SECONDARY) { |
|
498 if (low < UCOL_COMMON_TOP2<<24 ) { |
|
499 // Override if low range is less than UCOL_COMMON_TOP2. |
|
500 low = UCOL_COMMON_TOP2<<24; |
|
501 } |
|
502 high = 0xFFFFFFFF; |
|
503 } else { |
|
504 // Override if low range is less than UCOL_COMMON_BOT3. |
|
505 if ( low < UCOL_COMMON_BOT3<<24 ) { |
|
506 low = UCOL_COMMON_BOT3<<24; |
|
507 } |
|
508 high = 0x40000000; |
|
509 } |
|
510 break; |
|
511 } |
|
512 if(s<0) { |
|
513 *status = U_INTERNAL_PROGRAM_ERROR; |
|
514 return 0; |
|
515 } |
|
516 } |
|
517 } |
|
518 |
|
519 if(low < 0x02000000) { |
|
520 // We must not use CE weight byte 02, so we set it as the minimum lower bound. |
|
521 // See http://site.icu-project.org/design/collation/bytes |
|
522 low = 0x02000000; |
|
523 } |
|
524 |
|
525 if(strength == UCOL_SECONDARY) { /* similar as simple */ |
|
526 if(low >= (UCOL_COMMON_BOT2<<24) && low < (uint32_t)(UCOL_COMMON_TOP2<<24)) { |
|
527 low = UCOL_COMMON_TOP2<<24; |
|
528 } |
|
529 if(high > (UCOL_COMMON_BOT2<<24) && high < (uint32_t)(UCOL_COMMON_TOP2<<24)) { |
|
530 high = UCOL_COMMON_TOP2<<24; |
|
531 } |
|
532 if(low < (UCOL_COMMON_BOT2<<24)) { |
|
533 g->noOfRanges = ucol_allocWeights(UCOL_BYTE_UNSHIFTED_MIN<<24, high, count, maxByte, g->ranges); |
|
534 g->current = ucol_nextWeight(g->ranges, &g->noOfRanges); |
|
535 //g->current = UCOL_COMMON_BOT2<<24; |
|
536 return g->current; |
|
537 } |
|
538 } |
|
539 |
|
540 g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges); |
|
541 if(g->noOfRanges == 0) { |
|
542 *status = U_INTERNAL_PROGRAM_ERROR; |
|
543 } |
|
544 g->current = ucol_nextWeight(g->ranges, &g->noOfRanges); |
|
545 return g->current; |
|
546 } |
|
547 |
|
548 static |
|
549 uint32_t u_toLargeKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) { |
|
550 uint32_t i = 0; |
|
551 UChar c; |
|
552 |
|
553 if(U_FAILURE(*status)) { |
|
554 return 0; |
|
555 } |
|
556 |
|
557 if(sourceLen > resLen) { |
|
558 *status = U_MEMORY_ALLOCATION_ERROR; |
|
559 return 0; |
|
560 } |
|
561 |
|
562 for(i = 0; i < sourceLen; i++) { |
|
563 c = source[i]; |
|
564 if(0x3041 <= c && c <= 0x30FA) { /* Kana range */ |
|
565 switch(c - 0x3000) { |
|
566 case 0x41: case 0x43: case 0x45: case 0x47: case 0x49: case 0x63: case 0x83: case 0x85: case 0x8E: |
|
567 case 0xA1: case 0xA3: case 0xA5: case 0xA7: case 0xA9: case 0xC3: case 0xE3: case 0xE5: case 0xEE: |
|
568 c++; |
|
569 break; |
|
570 case 0xF5: |
|
571 c = 0x30AB; |
|
572 break; |
|
573 case 0xF6: |
|
574 c = 0x30B1; |
|
575 break; |
|
576 } |
|
577 } |
|
578 resBuf[i] = c; |
|
579 } |
|
580 return sourceLen; |
|
581 } |
|
582 |
|
583 static |
|
584 uint32_t u_toSmallKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) { |
|
585 uint32_t i = 0; |
|
586 UChar c; |
|
587 |
|
588 if(U_FAILURE(*status)) { |
|
589 return 0; |
|
590 } |
|
591 |
|
592 if(sourceLen > resLen) { |
|
593 *status = U_MEMORY_ALLOCATION_ERROR; |
|
594 return 0; |
|
595 } |
|
596 |
|
597 for(i = 0; i < sourceLen; i++) { |
|
598 c = source[i]; |
|
599 if(0x3041 <= c && c <= 0x30FA) { /* Kana range */ |
|
600 switch(c - 0x3000) { |
|
601 case 0x42: case 0x44: case 0x46: case 0x48: case 0x4A: case 0x64: case 0x84: case 0x86: case 0x8F: |
|
602 case 0xA2: case 0xA4: case 0xA6: case 0xA8: case 0xAA: case 0xC4: case 0xE4: case 0xE6: case 0xEF: |
|
603 c--; |
|
604 break; |
|
605 case 0xAB: |
|
606 c = 0x30F5; |
|
607 break; |
|
608 case 0xB1: |
|
609 c = 0x30F6; |
|
610 break; |
|
611 } |
|
612 } |
|
613 resBuf[i] = c; |
|
614 } |
|
615 return sourceLen; |
|
616 } |
|
617 |
|
618 U_NAMESPACE_BEGIN |
|
619 |
|
620 static |
|
621 uint8_t ucol_uprv_getCaseBits(const UCollator *UCA, const UChar *src, uint32_t len, UErrorCode *status) { |
|
622 uint32_t i = 0; |
|
623 UChar n[128]; |
|
624 uint32_t nLen = 0; |
|
625 uint32_t uCount = 0, lCount = 0; |
|
626 |
|
627 collIterate s; |
|
628 uint32_t order = 0; |
|
629 |
|
630 if(U_FAILURE(*status)) { |
|
631 return UCOL_LOWER_CASE; |
|
632 } |
|
633 |
|
634 nLen = unorm_normalize(src, len, UNORM_NFKD, 0, n, 128, status); |
|
635 if(U_SUCCESS(*status)) { |
|
636 for(i = 0; i < nLen; i++) { |
|
637 uprv_init_collIterate(UCA, &n[i], 1, &s, status); |
|
638 order = ucol_getNextCE(UCA, &s, status); |
|
639 if(isContinuation(order)) { |
|
640 *status = U_INTERNAL_PROGRAM_ERROR; |
|
641 return UCOL_LOWER_CASE; |
|
642 } |
|
643 if((order&UCOL_CASE_BIT_MASK)== UCOL_UPPER_CASE) { |
|
644 uCount++; |
|
645 } else { |
|
646 if(u_islower(n[i])) { |
|
647 lCount++; |
|
648 } else if(U_SUCCESS(*status)) { |
|
649 UChar sk[1], lk[1]; |
|
650 u_toSmallKana(&n[i], 1, sk, 1, status); |
|
651 u_toLargeKana(&n[i], 1, lk, 1, status); |
|
652 if(sk[0] == n[i] && lk[0] != n[i]) { |
|
653 lCount++; |
|
654 } |
|
655 } |
|
656 } |
|
657 } |
|
658 } |
|
659 |
|
660 if(uCount != 0 && lCount != 0) { |
|
661 return UCOL_MIXED_CASE; |
|
662 } else if(uCount != 0) { |
|
663 return UCOL_UPPER_CASE; |
|
664 } else { |
|
665 return UCOL_LOWER_CASE; |
|
666 } |
|
667 } |
|
668 |
|
669 |
|
670 U_CFUNC void ucol_doCE(UColTokenParser *src, uint32_t *CEparts, UColToken *tok, UErrorCode *status) { |
|
671 /* this one makes the table and stuff */ |
|
672 uint32_t noOfBytes[3]; |
|
673 uint32_t i; |
|
674 |
|
675 for(i = 0; i<3; i++) { |
|
676 ucol_countBytes(CEparts[i], noOfBytes[i]); |
|
677 } |
|
678 |
|
679 /* Here we have to pack CEs from parts */ |
|
680 |
|
681 uint32_t CEi = 0; |
|
682 uint32_t value = 0; |
|
683 |
|
684 while(2*CEi<noOfBytes[0] || CEi<noOfBytes[1] || CEi<noOfBytes[2]) { |
|
685 if(CEi > 0) { |
|
686 value = UCOL_CONTINUATION_MARKER; /* Continuation marker */ |
|
687 } else { |
|
688 value = 0; |
|
689 } |
|
690 |
|
691 if(2*CEi<noOfBytes[0]) { |
|
692 value |= ((CEparts[0]>>(32-16*(CEi+1))) & 0xFFFF) << 16; |
|
693 } |
|
694 if(CEi<noOfBytes[1]) { |
|
695 value |= ((CEparts[1]>>(32-8*(CEi+1))) & 0xFF) << 8; |
|
696 } |
|
697 if(CEi<noOfBytes[2]) { |
|
698 value |= ((CEparts[2]>>(32-8*(CEi+1))) & 0x3F); |
|
699 } |
|
700 tok->CEs[CEi] = value; |
|
701 CEi++; |
|
702 } |
|
703 if(CEi == 0) { /* totally ignorable */ |
|
704 tok->noOfCEs = 1; |
|
705 tok->CEs[0] = 0; |
|
706 } else { /* there is at least something */ |
|
707 tok->noOfCEs = CEi; |
|
708 } |
|
709 |
|
710 |
|
711 // we want to set case bits here and now, not later. |
|
712 // Case bits handling |
|
713 if(tok->CEs[0] != 0) { // case bits should be set only for non-ignorables |
|
714 tok->CEs[0] &= 0xFFFFFF3F; // Clean the case bits field |
|
715 int32_t cSize = (tok->source & 0xFF000000) >> 24; |
|
716 UChar *cPoints = (tok->source & 0x00FFFFFF) + src->source; |
|
717 |
|
718 if(cSize > 1) { |
|
719 // Do it manually |
|
720 tok->CEs[0] |= ucol_uprv_getCaseBits(src->UCA, cPoints, cSize, status); |
|
721 } else { |
|
722 // Copy it from the UCA |
|
723 uint32_t caseCE = ucol_getFirstCE(src->UCA, cPoints[0], status); |
|
724 tok->CEs[0] |= (caseCE & 0xC0); |
|
725 } |
|
726 } |
|
727 |
|
728 #if UCOL_DEBUG==2 |
|
729 fprintf(stderr, "%04X str: %i, [%08X, %08X, %08X]: tok: ", tok->debugSource, tok->strength, CEparts[0] >> (32-8*noOfBytes[0]), CEparts[1] >> (32-8*noOfBytes[1]), CEparts[2]>> (32-8*noOfBytes[2])); |
|
730 for(i = 0; i<tok->noOfCEs; i++) { |
|
731 fprintf(stderr, "%08X ", tok->CEs[i]); |
|
732 } |
|
733 fprintf(stderr, "\n"); |
|
734 #endif |
|
735 } |
|
736 |
|
737 U_CFUNC void ucol_initBuffers(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) { |
|
738 ucolCEGenerator Gens[UCOL_CE_STRENGTH_LIMIT]; |
|
739 uint32_t CEparts[UCOL_CE_STRENGTH_LIMIT]; |
|
740 |
|
741 UColToken *tok = lh->last; |
|
742 uint32_t t[UCOL_STRENGTH_LIMIT]; |
|
743 |
|
744 uprv_memset(t, 0, UCOL_STRENGTH_LIMIT*sizeof(uint32_t)); |
|
745 |
|
746 /* must initialize ranges to avoid memory check warnings */ |
|
747 for (int i = 0; i < UCOL_CE_STRENGTH_LIMIT; i++) { |
|
748 uprv_memset(Gens[i].ranges, 0, sizeof(Gens[i].ranges)); |
|
749 } |
|
750 |
|
751 tok->toInsert = 1; |
|
752 t[tok->strength] = 1; |
|
753 |
|
754 while(tok->previous != NULL) { |
|
755 if(tok->previous->strength < tok->strength) { /* going up */ |
|
756 t[tok->strength] = 0; |
|
757 t[tok->previous->strength]++; |
|
758 } else if(tok->previous->strength > tok->strength) { /* going down */ |
|
759 t[tok->previous->strength] = 1; |
|
760 } else { |
|
761 t[tok->strength]++; |
|
762 } |
|
763 tok=tok->previous; |
|
764 tok->toInsert = t[tok->strength]; |
|
765 } |
|
766 |
|
767 tok->toInsert = t[tok->strength]; |
|
768 ucol_inv_getGapPositions(src, lh, status); |
|
769 |
|
770 #if UCOL_DEBUG |
|
771 fprintf(stderr, "BaseCE: %08X %08X\n", lh->baseCE, lh->baseContCE); |
|
772 int32_t j = 2; |
|
773 for(j = 2; j >= 0; j--) { |
|
774 fprintf(stderr, "gapsLo[%i] [%08X %08X %08X]\n", j, lh->gapsLo[j*3], lh->gapsLo[j*3+1], lh->gapsLo[j*3+2]); |
|
775 fprintf(stderr, "gapsHi[%i] [%08X %08X %08X]\n", j, lh->gapsHi[j*3], lh->gapsHi[j*3+1], lh->gapsHi[j*3+2]); |
|
776 } |
|
777 tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE]; |
|
778 |
|
779 do { |
|
780 fprintf(stderr,"%i", tok->strength); |
|
781 tok = tok->next; |
|
782 } while(tok != NULL); |
|
783 fprintf(stderr, "\n"); |
|
784 |
|
785 tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE]; |
|
786 |
|
787 do { |
|
788 fprintf(stderr,"%i", tok->toInsert); |
|
789 tok = tok->next; |
|
790 } while(tok != NULL); |
|
791 #endif |
|
792 |
|
793 tok = lh->first; |
|
794 uint32_t fStrength = UCOL_IDENTICAL; |
|
795 uint32_t initStrength = UCOL_IDENTICAL; |
|
796 |
|
797 |
|
798 CEparts[UCOL_PRIMARY] = (lh->baseCE & UCOL_PRIMARYMASK) | (lh->baseContCE & UCOL_PRIMARYMASK) >> 16; |
|
799 CEparts[UCOL_SECONDARY] = (lh->baseCE & UCOL_SECONDARYMASK) << 16 | (lh->baseContCE & UCOL_SECONDARYMASK) << 8; |
|
800 CEparts[UCOL_TERTIARY] = (UCOL_TERTIARYORDER(lh->baseCE)) << 24 | (UCOL_TERTIARYORDER(lh->baseContCE)) << 16; |
|
801 |
|
802 while (tok != NULL && U_SUCCESS(*status)) { |
|
803 fStrength = tok->strength; |
|
804 if(fStrength < initStrength) { |
|
805 initStrength = fStrength; |
|
806 if(lh->pos[fStrength] == -1) { |
|
807 while(lh->pos[fStrength] == -1 && fStrength > 0) { |
|
808 fStrength--; |
|
809 } |
|
810 if(lh->pos[fStrength] == -1) { |
|
811 *status = U_INTERNAL_PROGRAM_ERROR; |
|
812 return; |
|
813 } |
|
814 } |
|
815 if(initStrength == UCOL_TERTIARY) { /* starting with tertiary */ |
|
816 CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3]; |
|
817 CEparts[UCOL_SECONDARY] = lh->gapsLo[fStrength*3+1]; |
|
818 /*CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[2], lh->gapsLo[fStrength*3+2], lh->gapsHi[fStrength*3+2], tok, UCOL_TERTIARY); */ |
|
819 CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[UCOL_TERTIARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status); |
|
820 } else if(initStrength == UCOL_SECONDARY) { /* secondaries */ |
|
821 CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3]; |
|
822 /*CEparts[1] = ucol_getCEGenerator(&Gens[1], lh->gapsLo[fStrength*3+1], lh->gapsHi[fStrength*3+1], tok, 1);*/ |
|
823 CEparts[UCOL_SECONDARY] = ucol_getCEGenerator(&Gens[UCOL_SECONDARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status); |
|
824 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status); |
|
825 } else { /* primaries */ |
|
826 /*CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[0], lh->gapsLo[0], lh->gapsHi[0], tok, UCOL_PRIMARY);*/ |
|
827 CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[UCOL_PRIMARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status); |
|
828 CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status); |
|
829 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status); |
|
830 } |
|
831 } else { |
|
832 if(tok->strength == UCOL_TERTIARY) { |
|
833 CEparts[UCOL_TERTIARY] = ucol_getNextGenerated(&Gens[UCOL_TERTIARY], status); |
|
834 } else if(tok->strength == UCOL_SECONDARY) { |
|
835 CEparts[UCOL_SECONDARY] = ucol_getNextGenerated(&Gens[UCOL_SECONDARY], status); |
|
836 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status); |
|
837 } else if(tok->strength == UCOL_PRIMARY) { |
|
838 CEparts[UCOL_PRIMARY] = ucol_getNextGenerated(&Gens[UCOL_PRIMARY], status); |
|
839 CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status); |
|
840 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status); |
|
841 } |
|
842 } |
|
843 ucol_doCE(src, CEparts, tok, status); |
|
844 tok = tok->next; |
|
845 } |
|
846 } |
|
847 |
|
848 U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokListHeader *lh, UErrorCode *status) { |
|
849 UCAElements el; |
|
850 UColToken *tok = lh->first; |
|
851 UColToken *expt = NULL; |
|
852 uint32_t i = 0, j = 0; |
|
853 const Normalizer2Impl *nfcImpl = Normalizer2Factory::getNFCImpl(*status); |
|
854 |
|
855 while(tok != NULL && U_SUCCESS(*status)) { |
|
856 /* first, check if there are any expansions */ |
|
857 /* if there are expansions, we need to do a little bit more processing */ |
|
858 /* since parts of expansion can be tailored, while others are not */ |
|
859 if(tok->expansion != 0) { |
|
860 uint32_t len = tok->expansion >> 24; |
|
861 uint32_t currentSequenceLen = len; |
|
862 uint32_t expOffset = tok->expansion & 0x00FFFFFF; |
|
863 //uint32_t exp = currentSequenceLen | expOffset; |
|
864 UColToken exp; |
|
865 exp.source = currentSequenceLen | expOffset; |
|
866 exp.rulesToParseHdl = &(src->source); |
|
867 |
|
868 while(len > 0) { |
|
869 currentSequenceLen = len; |
|
870 while(currentSequenceLen > 0) { |
|
871 exp.source = (currentSequenceLen << 24) | expOffset; |
|
872 if((expt = (UColToken *)uhash_get(src->tailored, &exp)) != NULL && expt->strength != UCOL_TOK_RESET) { /* expansion is tailored */ |
|
873 uint32_t noOfCEsToCopy = expt->noOfCEs; |
|
874 for(j = 0; j<noOfCEsToCopy; j++) { |
|
875 tok->expCEs[tok->noOfExpCEs + j] = expt->CEs[j]; |
|
876 } |
|
877 tok->noOfExpCEs += noOfCEsToCopy; |
|
878 // Smart people never try to add codepoints and CEs. |
|
879 // For some odd reason, it won't work. |
|
880 expOffset += currentSequenceLen; //noOfCEsToCopy; |
|
881 len -= currentSequenceLen; //noOfCEsToCopy; |
|
882 break; |
|
883 } else { |
|
884 currentSequenceLen--; |
|
885 } |
|
886 } |
|
887 if(currentSequenceLen == 0) { /* couldn't find any tailored subsequence */ |
|
888 /* will have to get one from UCA */ |
|
889 /* first, get the UChars from the rules */ |
|
890 /* then pick CEs out until there is no more and stuff them into expansion */ |
|
891 collIterate s; |
|
892 uint32_t order = 0; |
|
893 uprv_init_collIterate(src->UCA, expOffset + src->source, 1, &s, status); |
|
894 |
|
895 for(;;) { |
|
896 order = ucol_getNextCE(src->UCA, &s, status); |
|
897 if(order == UCOL_NO_MORE_CES) { |
|
898 break; |
|
899 } |
|
900 tok->expCEs[tok->noOfExpCEs++] = order; |
|
901 } |
|
902 expOffset++; |
|
903 len--; |
|
904 } |
|
905 } |
|
906 } else { |
|
907 tok->noOfExpCEs = 0; |
|
908 } |
|
909 |
|
910 /* set the ucaelement with obtained values */ |
|
911 el.noOfCEs = tok->noOfCEs + tok->noOfExpCEs; |
|
912 /* copy CEs */ |
|
913 for(i = 0; i<tok->noOfCEs; i++) { |
|
914 el.CEs[i] = tok->CEs[i]; |
|
915 } |
|
916 for(i = 0; i<tok->noOfExpCEs; i++) { |
|
917 el.CEs[i+tok->noOfCEs] = tok->expCEs[i]; |
|
918 } |
|
919 |
|
920 /* copy UChars */ |
|
921 // We kept prefix and source kind of together, as it is a kind of a contraction. |
|
922 // However, now we have to slice the prefix off the main thing - |
|
923 el.prefix = el.prefixChars; |
|
924 el.cPoints = el.uchars; |
|
925 if(tok->prefix != 0) { // we will just copy the prefix here, and adjust accordingly in the |
|
926 // addPrefix function in ucol_elm. The reason is that we need to add both composed AND |
|
927 // decomposed elements to the unsaf table. |
|
928 el.prefixSize = tok->prefix>>24; |
|
929 uprv_memcpy(el.prefix, src->source + (tok->prefix & 0x00FFFFFF), el.prefixSize*sizeof(UChar)); |
|
930 |
|
931 el.cSize = (tok->source >> 24)-(tok->prefix>>24); |
|
932 uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF)+(tok->prefix>>24) + src->source, el.cSize*sizeof(UChar)); |
|
933 } else { |
|
934 el.prefixSize = 0; |
|
935 *el.prefix = 0; |
|
936 |
|
937 el.cSize = (tok->source >> 24); |
|
938 uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF) + src->source, el.cSize*sizeof(UChar)); |
|
939 } |
|
940 if(src->UCA != NULL) { |
|
941 for(i = 0; i<el.cSize; i++) { |
|
942 if(UCOL_ISJAMO(el.cPoints[i])) { |
|
943 t->image->jamoSpecial = TRUE; |
|
944 } |
|
945 } |
|
946 if (!src->buildCCTabFlag && el.cSize > 0) { |
|
947 // Check the trailing canonical combining class (tccc) of the last character. |
|
948 const UChar *s = el.cPoints + el.cSize; |
|
949 uint16_t fcd = nfcImpl->previousFCD16(el.cPoints, s); |
|
950 if ((fcd & 0xff) != 0) { |
|
951 src->buildCCTabFlag = TRUE; |
|
952 } |
|
953 } |
|
954 } |
|
955 |
|
956 /* and then, add it */ |
|
957 #if UCOL_DEBUG==2 |
|
958 fprintf(stderr, "Adding: %04X with %08X\n", el.cPoints[0], el.CEs[0]); |
|
959 #endif |
|
960 uprv_uca_addAnElement(t, &el, status); |
|
961 |
|
962 #if UCOL_DEBUG_DUPLICATES |
|
963 if(*status != U_ZERO_ERROR) { |
|
964 fprintf(stderr, "replaced CE for %04X with CE for %04X\n", el.cPoints[0], tok->debugSource); |
|
965 *status = U_ZERO_ERROR; |
|
966 } |
|
967 #endif |
|
968 |
|
969 tok = tok->next; |
|
970 } |
|
971 } |
|
972 |
|
973 U_CDECL_BEGIN |
|
974 static UBool U_CALLCONV |
|
975 _processUCACompleteIgnorables(const void *context, UChar32 start, UChar32 limit, uint32_t value) { |
|
976 UErrorCode status = U_ZERO_ERROR; |
|
977 tempUCATable *t = (tempUCATable *)context; |
|
978 if(value == 0) { |
|
979 while(start < limit) { |
|
980 uint32_t CE = utrie_get32(t->mapping, start, NULL); |
|
981 if(CE == UCOL_NOT_FOUND) { |
|
982 UCAElements el; |
|
983 el.isThai = FALSE; |
|
984 el.prefixSize = 0; |
|
985 el.prefixChars[0] = 0; |
|
986 el.prefix = el.prefixChars; |
|
987 el.cPoints = el.uchars; |
|
988 |
|
989 el.cSize = 0; |
|
990 U16_APPEND_UNSAFE(el.uchars, el.cSize, start); |
|
991 |
|
992 el.noOfCEs = 1; |
|
993 el.CEs[0] = 0; |
|
994 uprv_uca_addAnElement(t, &el, &status); |
|
995 |
|
996 } |
|
997 start++; |
|
998 } |
|
999 } |
|
1000 if(U_FAILURE(status)) { |
|
1001 return FALSE; |
|
1002 } else { |
|
1003 return TRUE; |
|
1004 } |
|
1005 } |
|
1006 U_CDECL_END |
|
1007 |
|
1008 static void |
|
1009 ucol_uprv_bld_copyRangeFromUCA(UColTokenParser *src, tempUCATable *t, |
|
1010 UChar32 start, UChar32 end, |
|
1011 UErrorCode *status) |
|
1012 { |
|
1013 //UChar decomp[256]; |
|
1014 uint32_t CE = UCOL_NOT_FOUND; |
|
1015 UChar32 u = 0; |
|
1016 UCAElements el; |
|
1017 el.isThai = FALSE; |
|
1018 el.prefixSize = 0; |
|
1019 el.prefixChars[0] = 0; |
|
1020 collIterate colIt; |
|
1021 |
|
1022 if(U_SUCCESS(*status)) { |
|
1023 for(u = start; u<=end; u++) { |
|
1024 if((CE = utrie_get32(t->mapping, u, NULL)) == UCOL_NOT_FOUND |
|
1025 /* this test is for contractions that are missing the starting element. */ |
|
1026 || ((isCntTableElement(CE)) && |
|
1027 (uprv_cnttab_getCE(t->contractions, CE, 0, status) == UCOL_NOT_FOUND)) |
|
1028 ) |
|
1029 { |
|
1030 el.cSize = 0; |
|
1031 U16_APPEND_UNSAFE(el.uchars, el.cSize, u); |
|
1032 //decomp[0] = (UChar)u; |
|
1033 //el.uchars[0] = (UChar)u; |
|
1034 el.cPoints = el.uchars; |
|
1035 //el.cSize = 1; |
|
1036 el.noOfCEs = 0; |
|
1037 el.prefix = el.prefixChars; |
|
1038 el.prefixSize = 0; |
|
1039 //uprv_init_collIterate(src->UCA, decomp, 1, &colIt); |
|
1040 // We actually want to check whether this element is a special |
|
1041 // If it is an implicit element (hangul, CJK - we want to copy the |
|
1042 // special, not the resolved CEs) - for hangul, copying resolved |
|
1043 // would just make things the same (there is an expansion and it |
|
1044 // takes approximately the same amount of time to resolve as |
|
1045 // falling back to the UCA). |
|
1046 /* |
|
1047 UTRIE_GET32(src->UCA->mapping, u, CE); |
|
1048 tag = getCETag(CE); |
|
1049 if(tag == HANGUL_SYLLABLE_TAG || tag == CJK_IMPLICIT_TAG |
|
1050 || tag == IMPLICIT_TAG || tag == TRAIL_SURROGATE_TAG |
|
1051 || tag == LEAD_SURROGATE_TAG) { |
|
1052 el.CEs[el.noOfCEs++] = CE; |
|
1053 } else { |
|
1054 */ |
|
1055 // It turns out that it does not make sense to keep implicits |
|
1056 // unresolved. The cost of resolving them is big enough so that |
|
1057 // it doesn't make any difference whether we have to go to the UCA |
|
1058 // or not. |
|
1059 { |
|
1060 uprv_init_collIterate(src->UCA, el.uchars, el.cSize, &colIt, status); |
|
1061 while(CE != UCOL_NO_MORE_CES) { |
|
1062 CE = ucol_getNextCE(src->UCA, &colIt, status); |
|
1063 if(CE != UCOL_NO_MORE_CES) { |
|
1064 el.CEs[el.noOfCEs++] = CE; |
|
1065 } |
|
1066 } |
|
1067 } |
|
1068 uprv_uca_addAnElement(t, &el, status); |
|
1069 } |
|
1070 } |
|
1071 } |
|
1072 } |
|
1073 |
|
1074 U_NAMESPACE_END |
|
1075 |
|
1076 U_CFUNC UCATableHeader * |
|
1077 ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *status) { |
|
1078 U_NAMESPACE_USE |
|
1079 |
|
1080 uint32_t i = 0; |
|
1081 if(U_FAILURE(*status)) { |
|
1082 return NULL; |
|
1083 } |
|
1084 /* |
|
1085 2. Eliminate the negative lists by doing the following for each non-null negative list: |
|
1086 o if previousCE(baseCE, strongestN) != some ListHeader X's baseCE, |
|
1087 create new ListHeader X |
|
1088 o reverse the list, add to the end of X's positive list. Reset the strength of the |
|
1089 first item you add, based on the stronger strength levels of the two lists. |
|
1090 */ |
|
1091 /* |
|
1092 3. For each ListHeader with a non-null positive list: |
|
1093 */ |
|
1094 /* |
|
1095 o Find all character strings with CEs between the baseCE and the |
|
1096 next/previous CE, at the strength of the first token. Add these to the |
|
1097 tailoring. |
|
1098 ? That is, if UCA has ... x <<< X << x' <<< X' < y ..., and the |
|
1099 tailoring has & x < z... |
|
1100 ? Then we change the tailoring to & x <<< X << x' <<< X' < z ... |
|
1101 */ |
|
1102 /* It is possible that this part should be done even while constructing list */ |
|
1103 /* The problem is that it is unknown what is going to be the strongest weight */ |
|
1104 /* So we might as well do it here */ |
|
1105 |
|
1106 /* |
|
1107 o Allocate CEs for each token in the list, based on the total number N of the |
|
1108 largest level difference, and the gap G between baseCE and nextCE at that |
|
1109 level. The relation * between the last item and nextCE is the same as the |
|
1110 strongest strength. |
|
1111 o Example: baseCE < a << b <<< q << c < d < e * nextCE(X,1) |
|
1112 ? There are 3 primary items: a, d, e. Fit them into the primary gap. |
|
1113 Then fit b and c into the secondary gap between a and d, then fit q |
|
1114 into the tertiary gap between b and c. |
|
1115 |
|
1116 o Example: baseCE << b <<< q << c * nextCE(X,2) |
|
1117 ? There are 2 secondary items: b, c. Fit them into the secondary gap. |
|
1118 Then fit q into the tertiary gap between b and c. |
|
1119 o When incrementing primary values, we will not cross high byte |
|
1120 boundaries except where there is only a single-byte primary. That is to |
|
1121 ensure that the script reordering will continue to work. |
|
1122 */ |
|
1123 UCATableHeader *image = (UCATableHeader *)uprv_malloc(sizeof(UCATableHeader)); |
|
1124 /* test for NULL */ |
|
1125 if (image == NULL) { |
|
1126 *status = U_MEMORY_ALLOCATION_ERROR; |
|
1127 return NULL; |
|
1128 } |
|
1129 uprv_memcpy(image, src->UCA->image, sizeof(UCATableHeader)); |
|
1130 |
|
1131 for(i = 0; i<src->resultLen; i++) { |
|
1132 /* now we need to generate the CEs */ |
|
1133 /* We stuff the initial value in the buffers, and increase the appropriate buffer */ |
|
1134 /* According to strength */ |
|
1135 if(U_SUCCESS(*status)) { |
|
1136 if(src->lh[i].first) { // if there are any elements |
|
1137 // due to the way parser works, subsequent tailorings |
|
1138 // may remove all the elements from a sequence, therefore |
|
1139 // leaving an empty tailoring sequence. |
|
1140 ucol_initBuffers(src, &src->lh[i], status); |
|
1141 } |
|
1142 } |
|
1143 if(U_FAILURE(*status)) { |
|
1144 uprv_free(image); |
|
1145 return NULL; |
|
1146 } |
|
1147 } |
|
1148 |
|
1149 if(src->varTop != NULL) { /* stuff the variable top value */ |
|
1150 src->opts->variableTopValue = (*(src->varTop->CEs))>>16; |
|
1151 /* remove it from the list */ |
|
1152 if(src->varTop->listHeader->first == src->varTop) { /* first in list */ |
|
1153 src->varTop->listHeader->first = src->varTop->next; |
|
1154 } |
|
1155 if(src->varTop->listHeader->last == src->varTop) { /* first in list */ |
|
1156 src->varTop->listHeader->last = src->varTop->previous; |
|
1157 } |
|
1158 if(src->varTop->next != NULL) { |
|
1159 src->varTop->next->previous = src->varTop->previous; |
|
1160 } |
|
1161 if(src->varTop->previous != NULL) { |
|
1162 src->varTop->previous->next = src->varTop->next; |
|
1163 } |
|
1164 } |
|
1165 |
|
1166 |
|
1167 tempUCATable *t = uprv_uca_initTempTable(image, src->opts, src->UCA, NOT_FOUND_TAG, NOT_FOUND_TAG, status); |
|
1168 if(U_FAILURE(*status)) { |
|
1169 uprv_free(image); |
|
1170 return NULL; |
|
1171 } |
|
1172 |
|
1173 |
|
1174 /* After this, we have assigned CE values to all regular CEs */ |
|
1175 /* now we will go through list once more and resolve expansions, */ |
|
1176 /* make UCAElements structs and add them to table */ |
|
1177 for(i = 0; i<src->resultLen; i++) { |
|
1178 /* now we need to generate the CEs */ |
|
1179 /* We stuff the initial value in the buffers, and increase the appropriate buffer */ |
|
1180 /* According to strength */ |
|
1181 if(U_SUCCESS(*status)) { |
|
1182 ucol_createElements(src, t, &src->lh[i], status); |
|
1183 } |
|
1184 } |
|
1185 |
|
1186 UCAElements el; |
|
1187 el.isThai = FALSE; |
|
1188 el.prefixSize = 0; |
|
1189 el.prefixChars[0] = 0; |
|
1190 |
|
1191 /* add latin-1 stuff */ |
|
1192 ucol_uprv_bld_copyRangeFromUCA(src, t, 0, 0xFF, status); |
|
1193 |
|
1194 /* add stuff for copying */ |
|
1195 if(src->copySet != NULL) { |
|
1196 int32_t i = 0; |
|
1197 UnicodeSet *set = (UnicodeSet *)src->copySet; |
|
1198 for(i = 0; i < set->getRangeCount(); i++) { |
|
1199 ucol_uprv_bld_copyRangeFromUCA(src, t, set->getRangeStart(i), set->getRangeEnd(i), status); |
|
1200 } |
|
1201 } |
|
1202 |
|
1203 if(U_SUCCESS(*status)) { |
|
1204 /* copy contractions from the UCA - this is felt mostly for cyrillic*/ |
|
1205 |
|
1206 uint32_t tailoredCE = UCOL_NOT_FOUND; |
|
1207 UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->contractionUCACombos); |
|
1208 int32_t maxUCAContractionLength = src->UCA->image->contractionUCACombosWidth; |
|
1209 UCollationElements *ucaEl = ucol_openElements(src->UCA, NULL, 0, status); |
|
1210 // Check for null pointer |
|
1211 if (ucaEl == NULL) { |
|
1212 *status = U_MEMORY_ALLOCATION_ERROR; |
|
1213 return NULL; |
|
1214 } |
|
1215 while(*conts != 0) { |
|
1216 // A continuation is NUL-terminated and NUL-padded |
|
1217 // except if it has the maximum length. |
|
1218 int32_t contractionLength = maxUCAContractionLength; |
|
1219 while(contractionLength > 0 && conts[contractionLength - 1] == 0) { |
|
1220 --contractionLength; |
|
1221 } |
|
1222 UChar32 first; |
|
1223 int32_t firstLength = 0; |
|
1224 U16_NEXT(conts, firstLength, contractionLength, first); |
|
1225 tailoredCE = utrie_get32(t->mapping, first, NULL); |
|
1226 if(tailoredCE != UCOL_NOT_FOUND) { |
|
1227 UBool needToAdd = TRUE; |
|
1228 if(isCntTableElement(tailoredCE)) { |
|
1229 if(uprv_cnttab_isTailored(t->contractions, tailoredCE, conts+firstLength, status) == TRUE) { |
|
1230 needToAdd = FALSE; |
|
1231 } |
|
1232 } |
|
1233 if (!needToAdd && isPrefix(tailoredCE) && *(conts+1)==0) { |
|
1234 UCAElements elm; |
|
1235 elm.cPoints = el.uchars; |
|
1236 elm.noOfCEs = 0; |
|
1237 elm.uchars[0] = *conts; |
|
1238 elm.uchars[1] = 0; |
|
1239 elm.cSize = 1; |
|
1240 elm.prefixChars[0] = *(conts+2); |
|
1241 elm.isThai = FALSE; |
|
1242 elm.prefix = elm.prefixChars; |
|
1243 elm.prefixSize = 1; |
|
1244 UCAElements *prefixEnt=(UCAElements *)uhash_get(t->prefixLookup, &elm); |
|
1245 if ((prefixEnt==NULL) || *(prefixEnt->prefix)!=*(conts+2)) { |
|
1246 needToAdd = TRUE; |
|
1247 } |
|
1248 } |
|
1249 if(src->removeSet != NULL && uset_contains(src->removeSet, first)) { |
|
1250 needToAdd = FALSE; |
|
1251 } |
|
1252 |
|
1253 if(needToAdd == TRUE) { // we need to add if this contraction is not tailored. |
|
1254 if (*(conts+1) != 0) { // contractions |
|
1255 el.prefix = el.prefixChars; |
|
1256 el.prefixSize = 0; |
|
1257 el.cPoints = el.uchars; |
|
1258 el.noOfCEs = 0; |
|
1259 u_memcpy(el.uchars, conts, contractionLength); |
|
1260 el.cSize = contractionLength; |
|
1261 ucol_setText(ucaEl, el.uchars, el.cSize, status); |
|
1262 } |
|
1263 else { // pre-context character |
|
1264 UChar str[4] = { 0 }; |
|
1265 int32_t len=0; |
|
1266 int32_t preKeyLen=0; |
|
1267 |
|
1268 el.cPoints = el.uchars; |
|
1269 el.noOfCEs = 0; |
|
1270 el.uchars[0] = *conts; |
|
1271 el.uchars[1] = 0; |
|
1272 el.cSize = 1; |
|
1273 el.prefixChars[0] = *(conts+2); |
|
1274 el.prefix = el.prefixChars; |
|
1275 el.prefixSize = 1; |
|
1276 if (el.prefixChars[0]!=0) { |
|
1277 // get CE of prefix character first |
|
1278 str[0]=el.prefixChars[0]; |
|
1279 str[1]=0; |
|
1280 ucol_setText(ucaEl, str, 1, status); |
|
1281 while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) |
|
1282 != UCOL_NULLORDER) { |
|
1283 preKeyLen++; // count number of keys for prefix character |
|
1284 } |
|
1285 str[len++] = el.prefixChars[0]; |
|
1286 } |
|
1287 |
|
1288 str[len++] = el.uchars[0]; |
|
1289 str[len]=0; |
|
1290 ucol_setText(ucaEl, str, len, status); |
|
1291 // Skip the keys for prefix character, then copy the rest to el. |
|
1292 while ((preKeyLen-->0) && |
|
1293 (int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) { |
|
1294 continue; |
|
1295 } |
|
1296 |
|
1297 } |
|
1298 while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) { |
|
1299 el.noOfCEs++; |
|
1300 } |
|
1301 uprv_uca_addAnElement(t, &el, status); |
|
1302 } |
|
1303 |
|
1304 } else if(src->removeSet != NULL && uset_contains(src->removeSet, first)) { |
|
1305 ucol_uprv_bld_copyRangeFromUCA(src, t, first, first, status); |
|
1306 } |
|
1307 conts+=maxUCAContractionLength; |
|
1308 } |
|
1309 ucol_closeElements(ucaEl); |
|
1310 } |
|
1311 |
|
1312 // Add completely ignorable elements |
|
1313 utrie_enum(&t->UCA->mapping, NULL, _processUCACompleteIgnorables, t); |
|
1314 |
|
1315 // add tailoring characters related canonical closures |
|
1316 uprv_uca_canonicalClosure(t, src, NULL, status); |
|
1317 |
|
1318 /* still need to produce compatibility closure */ |
|
1319 |
|
1320 UCATableHeader *myData = uprv_uca_assembleTable(t, status); |
|
1321 |
|
1322 uprv_uca_closeTempTable(t); |
|
1323 uprv_free(image); |
|
1324 |
|
1325 return myData; |
|
1326 } |
|
1327 |
|
1328 U_CDECL_BEGIN |
|
1329 static UBool U_CALLCONV |
|
1330 ucol_bld_cleanup(void) |
|
1331 { |
|
1332 udata_close(invUCA_DATA_MEM); |
|
1333 invUCA_DATA_MEM = NULL; |
|
1334 _staticInvUCA = NULL; |
|
1335 gStaticInvUCAInitOnce.reset(); |
|
1336 return TRUE; |
|
1337 } |
|
1338 U_CDECL_END |
|
1339 |
|
1340 static void U_CALLCONV initInverseUCA(UErrorCode &status) { |
|
1341 U_ASSERT(invUCA_DATA_MEM == NULL); |
|
1342 U_ASSERT(_staticInvUCA == NULL); |
|
1343 ucln_i18n_registerCleanup(UCLN_I18N_UCOL_BLD, ucol_bld_cleanup); |
|
1344 InverseUCATableHeader *newInvUCA = NULL; |
|
1345 UDataMemory *result = udata_openChoice(U_ICUDATA_COLL, INVC_DATA_TYPE, INVC_DATA_NAME, isAcceptableInvUCA, NULL, &status); |
|
1346 |
|
1347 if(U_FAILURE(status)) { |
|
1348 if (result) { |
|
1349 udata_close(result); |
|
1350 } |
|
1351 // This is not needed, as we are talking about |
|
1352 // memory we got from UData |
|
1353 //uprv_free(newInvUCA); |
|
1354 return; |
|
1355 } |
|
1356 |
|
1357 if(result != NULL) { /* It looks like sometimes we can fail to find the data file */ |
|
1358 newInvUCA = (InverseUCATableHeader *)udata_getMemory(result); |
|
1359 UCollator *UCA = ucol_initUCA(&status); |
|
1360 // UCA versions of UCA and inverse UCA should match |
|
1361 if(uprv_memcmp(newInvUCA->UCAVersion, UCA->image->UCAVersion, sizeof(UVersionInfo)) != 0) { |
|
1362 status = U_INVALID_FORMAT_ERROR; |
|
1363 udata_close(result); |
|
1364 return; |
|
1365 } |
|
1366 |
|
1367 invUCA_DATA_MEM = result; |
|
1368 _staticInvUCA = newInvUCA; |
|
1369 } |
|
1370 } |
|
1371 |
|
1372 |
|
1373 U_CAPI const InverseUCATableHeader * U_EXPORT2 |
|
1374 ucol_initInverseUCA(UErrorCode *status) |
|
1375 { |
|
1376 umtx_initOnce(gStaticInvUCAInitOnce, &initInverseUCA, *status); |
|
1377 return _staticInvUCA; |
|
1378 } |
|
1379 |
|
/*
 * Names of the non-script reordering codes. The entries _must_ stay in the
 * order in which they are applied as defaults, and in sync with the
 * UColReorderCode enum: ucol_findReorderingEntry() turns an index into this
 * array into a code by adding UCOL_REORDER_CODE_FIRST.
 */
static const char * const ReorderingTokenNames[] = {
    "SPACE",    /* index 0 */
    "PUNCT",    /* index 1 */
    "SYMBOL",   /* index 2 */
    "CURRENCY", /* index 3 */
    "DIGIT"     /* index 4 */
};
|
1390 |
|
1391 static void toUpper(const char* src, char* dst, uint32_t length) { |
|
1392 for (uint32_t i = 0; *src != '\0' && i < length - 1; ++src, ++dst, ++i) { |
|
1393 *dst = uprv_toupper(*src); |
|
1394 } |
|
1395 *dst = '\0'; |
|
1396 } |
|
1397 |
|
1398 U_INTERNAL int32_t U_EXPORT2 |
|
1399 ucol_findReorderingEntry(const char* name) { |
|
1400 char buffer[32]; |
|
1401 toUpper(name, buffer, 32); |
|
1402 for (uint32_t entry = 0; entry < LENGTHOF(ReorderingTokenNames); entry++) { |
|
1403 if (uprv_strcmp(buffer, ReorderingTokenNames[entry]) == 0) { |
|
1404 return entry + UCOL_REORDER_CODE_FIRST; |
|
1405 } |
|
1406 } |
|
1407 return USCRIPT_INVALID_CODE; |
|
1408 } |
|
1409 |
|
1410 #endif /* #if !UCONFIG_NO_COLLATION */ |