|
1 /* |
|
2 ****************************************************************************** |
|
3 * |
|
4 * Copyright (C) 2001-2011, International Business Machines |
|
5 * Corporation and others. All Rights Reserved. |
|
6 * |
|
7 ****************************************************************************** |
|
8 * file name: utrie2_builder.cpp |
|
9 * encoding: US-ASCII |
|
10 * tab size: 8 (not used) |
|
11 * indentation:4 |
|
12 * |
|
13 * created on: 2008sep26 (split off from utrie2.c) |
|
14 * created by: Markus W. Scherer |
|
15 * |
|
16 * This is a common implementation of a Unicode trie. |
|
17 * It is a kind of compressed, serializable table of 16- or 32-bit values associated with |
|
18 * Unicode code points (0..0x10ffff). |
|
19 * This is the second common version of a Unicode trie (hence the name UTrie2). |
|
20 * See utrie2.h for a comparison. |
|
21 * |
|
22 * This file contains only the builder code. |
|
23 * See utrie2.c for the runtime and enumeration code. |
|
24 */ |
|
25 #ifdef UTRIE2_DEBUG |
|
26 # include <stdio.h> |
|
27 #endif |
|
28 |
|
29 #include "unicode/utypes.h" |
|
30 #include "cmemory.h" |
|
31 #include "utrie2.h" |
|
32 #include "utrie2_impl.h" |
|
33 |
|
34 #include "utrie.h" /* for utrie2_fromUTrie() and utrie_swap() */ |
|
35 |
|
36 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) |
|
37 |
|
38 /* Implementation notes ----------------------------------------------------- */ |
|
39 |
|
40 /* |
|
41 * The UTRIE2_SHIFT_1, UTRIE2_SHIFT_2, UTRIE2_INDEX_SHIFT and other values |
|
42 * have been chosen to minimize trie sizes overall. |
|
43 * Most of the code is flexible enough to work with a range of values, |
|
44 * within certain limits. |
|
45 * |
|
46 * Exception: Support for separate values for lead surrogate code _units_ |
|
47 * vs. code _points_ was added after the constants were fixed, |
|
48 * and has not been tested nor particularly designed for different constant values. |
|
49 * (Especially the utrie2_enum() code that jumps to the special LSCP index-2 |
|
50 * part and back.) |
|
51 * |
|
52 * Requires UTRIE2_SHIFT_2<=6. Otherwise 0xc0 which is the top of the ASCII-linear data |
|
53 * including the bad-UTF-8-data block is not a multiple of UTRIE2_DATA_BLOCK_LENGTH |
|
54 * and map[block>>UTRIE2_SHIFT_2] (used in reference counting and compaction |
|
55 * remapping) stops working. |
|
56 * |
|
57 * Requires UTRIE2_SHIFT_1>=10 because utrie2_enumForLeadSurrogate() |
|
58 * assumes that a single index-2 block is used for 0x400 code points |
|
59 * corresponding to one lead surrogate. |
|
60 * |
|
61 * Requires UTRIE2_SHIFT_1<=16. Otherwise one single index-2 block contains |
|
62 * more than one Unicode plane, and the split of the index-2 table into a BMP |
|
63 * part and a supplementary part, with a gap in between, would not work. |
|
64 * |
|
65 * Requires UTRIE2_INDEX_SHIFT>=1 not because of the code but because |
|
66 * there is data with more than 64k distinct values, |
|
67 * for example for Unihan collation with a separate collation weight per |
|
68 * Han character. |
|
69 */ |
|
70 |
|
71 /* Building a trie ----------------------------------------------------------*/ |
|
72 |
|
73 enum { |
|
74 /** The null index-2 block, following the gap in the index-2 table. */ |
|
75 UNEWTRIE2_INDEX_2_NULL_OFFSET=UNEWTRIE2_INDEX_GAP_OFFSET+UNEWTRIE2_INDEX_GAP_LENGTH, |
|
76 |
|
77 /** The start of allocated index-2 blocks. */ |
|
78 UNEWTRIE2_INDEX_2_START_OFFSET=UNEWTRIE2_INDEX_2_NULL_OFFSET+UTRIE2_INDEX_2_BLOCK_LENGTH, |
|
79 |
|
80 /** |
|
81 * The null data block. |
|
82 * Length 64=0x40 even if UTRIE2_DATA_BLOCK_LENGTH is smaller, |
|
83 * to work with 6-bit trail bytes from 2-byte UTF-8. |
|
84 */ |
|
85 UNEWTRIE2_DATA_NULL_OFFSET=UTRIE2_DATA_START_OFFSET, |
|
86 |
|
87 /** The start of allocated data blocks. */ |
|
88 UNEWTRIE2_DATA_START_OFFSET=UNEWTRIE2_DATA_NULL_OFFSET+0x40, |
|
89 |
|
90 /** |
|
91 * The start of data blocks for U+0800 and above. |
|
92 * Below, compaction uses a block length of 64 for 2-byte UTF-8. |
|
93 * From here on, compaction uses UTRIE2_DATA_BLOCK_LENGTH. |
|
94 * Data values for 0x780 code points beyond ASCII. |
|
95 */ |
|
96 UNEWTRIE2_DATA_0800_OFFSET=UNEWTRIE2_DATA_START_OFFSET+0x780 |
|
97 }; |
|
98 |
|
99 /* Start with allocation of 16k data entries. */ |
|
100 #define UNEWTRIE2_INITIAL_DATA_LENGTH ((int32_t)1<<14) |
|
101 |
|
102 /* Grow about 8x each time. */ |
|
103 #define UNEWTRIE2_MEDIUM_DATA_LENGTH ((int32_t)1<<17) |
|
104 |
|
105 static int32_t |
|
106 allocIndex2Block(UNewTrie2 *trie); |
|
107 |
|
108 U_CAPI UTrie2 * U_EXPORT2 |
|
109 utrie2_open(uint32_t initialValue, uint32_t errorValue, UErrorCode *pErrorCode) { |
|
110 UTrie2 *trie; |
|
111 UNewTrie2 *newTrie; |
|
112 uint32_t *data; |
|
113 int32_t i, j; |
|
114 |
|
115 if(U_FAILURE(*pErrorCode)) { |
|
116 return NULL; |
|
117 } |
|
118 |
|
119 trie=(UTrie2 *)uprv_malloc(sizeof(UTrie2)); |
|
120 newTrie=(UNewTrie2 *)uprv_malloc(sizeof(UNewTrie2)); |
|
121 data=(uint32_t *)uprv_malloc(UNEWTRIE2_INITIAL_DATA_LENGTH*4); |
|
122 if(trie==NULL || newTrie==NULL || data==NULL) { |
|
123 uprv_free(trie); |
|
124 uprv_free(newTrie); |
|
125 uprv_free(data); |
|
126 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; |
|
127 return 0; |
|
128 } |
|
129 |
|
130 uprv_memset(trie, 0, sizeof(UTrie2)); |
|
131 trie->initialValue=initialValue; |
|
132 trie->errorValue=errorValue; |
|
133 trie->highStart=0x110000; |
|
134 trie->newTrie=newTrie; |
|
135 |
|
136 newTrie->data=data; |
|
137 newTrie->dataCapacity=UNEWTRIE2_INITIAL_DATA_LENGTH; |
|
138 newTrie->initialValue=initialValue; |
|
139 newTrie->errorValue=errorValue; |
|
140 newTrie->highStart=0x110000; |
|
141 newTrie->firstFreeBlock=0; /* no free block in the list */ |
|
142 newTrie->isCompacted=FALSE; |
|
143 |
|
144 /* |
|
145 * preallocate and reset |
|
146 * - ASCII |
|
147 * - the bad-UTF-8-data block |
|
148 * - the null data block |
|
149 */ |
|
150 for(i=0; i<0x80; ++i) { |
|
151 newTrie->data[i]=initialValue; |
|
152 } |
|
153 for(; i<0xc0; ++i) { |
|
154 newTrie->data[i]=errorValue; |
|
155 } |
|
156 for(i=UNEWTRIE2_DATA_NULL_OFFSET; i<UNEWTRIE2_DATA_START_OFFSET; ++i) { |
|
157 newTrie->data[i]=initialValue; |
|
158 } |
|
159 newTrie->dataNullOffset=UNEWTRIE2_DATA_NULL_OFFSET; |
|
160 newTrie->dataLength=UNEWTRIE2_DATA_START_OFFSET; |
|
161 |
|
162 /* set the index-2 indexes for the 2=0x80>>UTRIE2_SHIFT_2 ASCII data blocks */ |
|
163 for(i=0, j=0; j<0x80; ++i, j+=UTRIE2_DATA_BLOCK_LENGTH) { |
|
164 newTrie->index2[i]=j; |
|
165 newTrie->map[i]=1; |
|
166 } |
|
167 /* reference counts for the bad-UTF-8-data block */ |
|
168 for(; j<0xc0; ++i, j+=UTRIE2_DATA_BLOCK_LENGTH) { |
|
169 newTrie->map[i]=0; |
|
170 } |
|
171 /* |
|
172 * Reference counts for the null data block: all blocks except for the ASCII blocks. |
|
173 * Plus 1 so that we don't drop this block during compaction. |
|
174 * Plus as many as needed for lead surrogate code points. |
|
175 */ |
|
176 /* i==newTrie->dataNullOffset */ |
|
177 newTrie->map[i++]= |
|
178 (0x110000>>UTRIE2_SHIFT_2)- |
|
179 (0x80>>UTRIE2_SHIFT_2)+ |
|
180 1+ |
|
181 UTRIE2_LSCP_INDEX_2_LENGTH; |
|
182 j+=UTRIE2_DATA_BLOCK_LENGTH; |
|
183 for(; j<UNEWTRIE2_DATA_START_OFFSET; ++i, j+=UTRIE2_DATA_BLOCK_LENGTH) { |
|
184 newTrie->map[i]=0; |
|
185 } |
|
186 |
|
187 /* |
|
188 * set the remaining indexes in the BMP index-2 block |
|
189 * to the null data block |
|
190 */ |
|
191 for(i=0x80>>UTRIE2_SHIFT_2; i<UTRIE2_INDEX_2_BMP_LENGTH; ++i) { |
|
192 newTrie->index2[i]=UNEWTRIE2_DATA_NULL_OFFSET; |
|
193 } |
|
194 |
|
195 /* |
|
196 * Fill the index gap with impossible values so that compaction |
|
197 * does not overlap other index-2 blocks with the gap. |
|
198 */ |
|
199 for(i=0; i<UNEWTRIE2_INDEX_GAP_LENGTH; ++i) { |
|
200 newTrie->index2[UNEWTRIE2_INDEX_GAP_OFFSET+i]=-1; |
|
201 } |
|
202 |
|
203 /* set the indexes in the null index-2 block */ |
|
204 for(i=0; i<UTRIE2_INDEX_2_BLOCK_LENGTH; ++i) { |
|
205 newTrie->index2[UNEWTRIE2_INDEX_2_NULL_OFFSET+i]=UNEWTRIE2_DATA_NULL_OFFSET; |
|
206 } |
|
207 newTrie->index2NullOffset=UNEWTRIE2_INDEX_2_NULL_OFFSET; |
|
208 newTrie->index2Length=UNEWTRIE2_INDEX_2_START_OFFSET; |
|
209 |
|
210 /* set the index-1 indexes for the linear index-2 block */ |
|
211 for(i=0, j=0; |
|
212 i<UTRIE2_OMITTED_BMP_INDEX_1_LENGTH; |
|
213 ++i, j+=UTRIE2_INDEX_2_BLOCK_LENGTH |
|
214 ) { |
|
215 newTrie->index1[i]=j; |
|
216 } |
|
217 |
|
218 /* set the remaining index-1 indexes to the null index-2 block */ |
|
219 for(; i<UNEWTRIE2_INDEX_1_LENGTH; ++i) { |
|
220 newTrie->index1[i]=UNEWTRIE2_INDEX_2_NULL_OFFSET; |
|
221 } |
|
222 |
|
223 /* |
|
224 * Preallocate and reset data for U+0080..U+07ff, |
|
225 * for 2-byte UTF-8 which will be compacted in 64-blocks |
|
226 * even if UTRIE2_DATA_BLOCK_LENGTH is smaller. |
|
227 */ |
|
228 for(i=0x80; i<0x800; i+=UTRIE2_DATA_BLOCK_LENGTH) { |
|
229 utrie2_set32(trie, i, initialValue, pErrorCode); |
|
230 } |
|
231 |
|
232 return trie; |
|
233 } |
|
234 |
|
235 static UNewTrie2 * |
|
236 cloneBuilder(const UNewTrie2 *other) { |
|
237 UNewTrie2 *trie; |
|
238 |
|
239 trie=(UNewTrie2 *)uprv_malloc(sizeof(UNewTrie2)); |
|
240 if(trie==NULL) { |
|
241 return NULL; |
|
242 } |
|
243 |
|
244 trie->data=(uint32_t *)uprv_malloc(other->dataCapacity*4); |
|
245 if(trie->data==NULL) { |
|
246 uprv_free(trie); |
|
247 return NULL; |
|
248 } |
|
249 trie->dataCapacity=other->dataCapacity; |
|
250 |
|
251 /* clone data */ |
|
252 uprv_memcpy(trie->index1, other->index1, sizeof(trie->index1)); |
|
253 uprv_memcpy(trie->index2, other->index2, other->index2Length*4); |
|
254 trie->index2NullOffset=other->index2NullOffset; |
|
255 trie->index2Length=other->index2Length; |
|
256 |
|
257 uprv_memcpy(trie->data, other->data, other->dataLength*4); |
|
258 trie->dataNullOffset=other->dataNullOffset; |
|
259 trie->dataLength=other->dataLength; |
|
260 |
|
261 /* reference counters */ |
|
262 if(other->isCompacted) { |
|
263 trie->firstFreeBlock=0; |
|
264 } else { |
|
265 uprv_memcpy(trie->map, other->map, (other->dataLength>>UTRIE2_SHIFT_2)*4); |
|
266 trie->firstFreeBlock=other->firstFreeBlock; |
|
267 } |
|
268 |
|
269 trie->initialValue=other->initialValue; |
|
270 trie->errorValue=other->errorValue; |
|
271 trie->highStart=other->highStart; |
|
272 trie->isCompacted=other->isCompacted; |
|
273 |
|
274 return trie; |
|
275 } |
|
276 |
|
277 U_CAPI UTrie2 * U_EXPORT2 |
|
278 utrie2_clone(const UTrie2 *other, UErrorCode *pErrorCode) { |
|
279 UTrie2 *trie; |
|
280 |
|
281 if(U_FAILURE(*pErrorCode)) { |
|
282 return NULL; |
|
283 } |
|
284 if(other==NULL || (other->memory==NULL && other->newTrie==NULL)) { |
|
285 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
|
286 return NULL; |
|
287 } |
|
288 |
|
289 trie=(UTrie2 *)uprv_malloc(sizeof(UTrie2)); |
|
290 if(trie==NULL) { |
|
291 return NULL; |
|
292 } |
|
293 uprv_memcpy(trie, other, sizeof(UTrie2)); |
|
294 |
|
295 if(other->memory!=NULL) { |
|
296 trie->memory=uprv_malloc(other->length); |
|
297 if(trie->memory!=NULL) { |
|
298 trie->isMemoryOwned=TRUE; |
|
299 uprv_memcpy(trie->memory, other->memory, other->length); |
|
300 |
|
301 /* make the clone's pointers point to its own memory */ |
|
302 trie->index=(uint16_t *)trie->memory+(other->index-(uint16_t *)other->memory); |
|
303 if(other->data16!=NULL) { |
|
304 trie->data16=(uint16_t *)trie->memory+(other->data16-(uint16_t *)other->memory); |
|
305 } |
|
306 if(other->data32!=NULL) { |
|
307 trie->data32=(uint32_t *)trie->memory+(other->data32-(uint32_t *)other->memory); |
|
308 } |
|
309 } |
|
310 } else /* other->newTrie!=NULL */ { |
|
311 trie->newTrie=cloneBuilder(other->newTrie); |
|
312 } |
|
313 |
|
314 if(trie->memory==NULL && trie->newTrie==NULL) { |
|
315 uprv_free(trie); |
|
316 trie=NULL; |
|
317 } |
|
318 return trie; |
|
319 } |
|
320 |
|
321 typedef struct NewTrieAndStatus { |
|
322 UTrie2 *trie; |
|
323 UErrorCode errorCode; |
|
324 UBool exclusiveLimit; /* rather than inclusive range end */ |
|
325 } NewTrieAndStatus; |
|
326 |
|
327 static UBool U_CALLCONV |
|
328 copyEnumRange(const void *context, UChar32 start, UChar32 end, uint32_t value) { |
|
329 NewTrieAndStatus *nt=(NewTrieAndStatus *)context; |
|
330 if(value!=nt->trie->initialValue) { |
|
331 if(nt->exclusiveLimit) { |
|
332 --end; |
|
333 } |
|
334 if(start==end) { |
|
335 utrie2_set32(nt->trie, start, value, &nt->errorCode); |
|
336 } else { |
|
337 utrie2_setRange32(nt->trie, start, end, value, TRUE, &nt->errorCode); |
|
338 } |
|
339 return U_SUCCESS(nt->errorCode); |
|
340 } else { |
|
341 return TRUE; |
|
342 } |
|
343 } |
|
344 |
|
345 #ifdef UTRIE2_DEBUG |
|
346 static void |
|
347 utrie_printLengths(const UTrie *trie) { |
|
348 long indexLength=trie->indexLength; |
|
349 long dataLength=(long)trie->dataLength; |
|
350 long totalLength=(long)sizeof(UTrieHeader)+indexLength*2+dataLength*(trie->data32!=NULL ? 4 : 2); |
|
351 printf("**UTrieLengths** index:%6ld data:%6ld serialized:%6ld\n", |
|
352 indexLength, dataLength, totalLength); |
|
353 } |
|
354 |
|
355 static void |
|
356 utrie2_printLengths(const UTrie2 *trie, const char *which) { |
|
357 long indexLength=trie->indexLength; |
|
358 long dataLength=(long)trie->dataLength; |
|
359 long totalLength=(long)sizeof(UTrie2Header)+indexLength*2+dataLength*(trie->data32!=NULL ? 4 : 2); |
|
360 printf("**UTrie2Lengths(%s)** index:%6ld data:%6ld serialized:%6ld\n", |
|
361 which, indexLength, dataLength, totalLength); |
|
362 } |
|
363 #endif |
|
364 |
|
365 U_CAPI UTrie2 * U_EXPORT2 |
|
366 utrie2_cloneAsThawed(const UTrie2 *other, UErrorCode *pErrorCode) { |
|
367 NewTrieAndStatus context; |
|
368 UChar lead; |
|
369 |
|
370 if(U_FAILURE(*pErrorCode)) { |
|
371 return NULL; |
|
372 } |
|
373 if(other==NULL || (other->memory==NULL && other->newTrie==NULL)) { |
|
374 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
|
375 return NULL; |
|
376 } |
|
377 if(other->newTrie!=NULL && !other->newTrie->isCompacted) { |
|
378 return utrie2_clone(other, pErrorCode); /* clone an unfrozen trie */ |
|
379 } |
|
380 |
|
381 /* Clone the frozen trie by enumerating it and building a new one. */ |
|
382 context.trie=utrie2_open(other->initialValue, other->errorValue, pErrorCode); |
|
383 if(U_FAILURE(*pErrorCode)) { |
|
384 return NULL; |
|
385 } |
|
386 context.exclusiveLimit=FALSE; |
|
387 context.errorCode=*pErrorCode; |
|
388 utrie2_enum(other, NULL, copyEnumRange, &context); |
|
389 *pErrorCode=context.errorCode; |
|
390 for(lead=0xd800; lead<0xdc00; ++lead) { |
|
391 uint32_t value; |
|
392 if(other->data32==NULL) { |
|
393 value=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(other, lead); |
|
394 } else { |
|
395 value=UTRIE2_GET32_FROM_U16_SINGLE_LEAD(other, lead); |
|
396 } |
|
397 if(value!=other->initialValue) { |
|
398 utrie2_set32ForLeadSurrogateCodeUnit(context.trie, lead, value, pErrorCode); |
|
399 } |
|
400 } |
|
401 if(U_FAILURE(*pErrorCode)) { |
|
402 utrie2_close(context.trie); |
|
403 context.trie=NULL; |
|
404 } |
|
405 return context.trie; |
|
406 } |
|
407 |
|
408 /* Almost the same as utrie2_cloneAsThawed() but copies a UTrie and freezes the clone. */ |
|
409 U_CAPI UTrie2 * U_EXPORT2 |
|
410 utrie2_fromUTrie(const UTrie *trie1, uint32_t errorValue, UErrorCode *pErrorCode) { |
|
411 NewTrieAndStatus context; |
|
412 UChar lead; |
|
413 |
|
414 if(U_FAILURE(*pErrorCode)) { |
|
415 return NULL; |
|
416 } |
|
417 if(trie1==NULL) { |
|
418 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
|
419 return NULL; |
|
420 } |
|
421 context.trie=utrie2_open(trie1->initialValue, errorValue, pErrorCode); |
|
422 if(U_FAILURE(*pErrorCode)) { |
|
423 return NULL; |
|
424 } |
|
425 context.exclusiveLimit=TRUE; |
|
426 context.errorCode=*pErrorCode; |
|
427 utrie_enum(trie1, NULL, copyEnumRange, &context); |
|
428 *pErrorCode=context.errorCode; |
|
429 for(lead=0xd800; lead<0xdc00; ++lead) { |
|
430 uint32_t value; |
|
431 if(trie1->data32==NULL) { |
|
432 value=UTRIE_GET16_FROM_LEAD(trie1, lead); |
|
433 } else { |
|
434 value=UTRIE_GET32_FROM_LEAD(trie1, lead); |
|
435 } |
|
436 if(value!=trie1->initialValue) { |
|
437 utrie2_set32ForLeadSurrogateCodeUnit(context.trie, lead, value, pErrorCode); |
|
438 } |
|
439 } |
|
440 if(U_SUCCESS(*pErrorCode)) { |
|
441 utrie2_freeze(context.trie, |
|
442 trie1->data32!=NULL ? UTRIE2_32_VALUE_BITS : UTRIE2_16_VALUE_BITS, |
|
443 pErrorCode); |
|
444 } |
|
445 #ifdef UTRIE2_DEBUG |
|
446 if(U_SUCCESS(*pErrorCode)) { |
|
447 utrie_printLengths(trie1); |
|
448 utrie2_printLengths(context.trie, "fromUTrie"); |
|
449 } |
|
450 #endif |
|
451 if(U_FAILURE(*pErrorCode)) { |
|
452 utrie2_close(context.trie); |
|
453 context.trie=NULL; |
|
454 } |
|
455 return context.trie; |
|
456 } |
|
457 |
|
458 static inline UBool |
|
459 isInNullBlock(UNewTrie2 *trie, UChar32 c, UBool forLSCP) { |
|
460 int32_t i2, block; |
|
461 |
|
462 if(U_IS_LEAD(c) && forLSCP) { |
|
463 i2=(UTRIE2_LSCP_INDEX_2_OFFSET-(0xd800>>UTRIE2_SHIFT_2))+ |
|
464 (c>>UTRIE2_SHIFT_2); |
|
465 } else { |
|
466 i2=trie->index1[c>>UTRIE2_SHIFT_1]+ |
|
467 ((c>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK); |
|
468 } |
|
469 block=trie->index2[i2]; |
|
470 return (UBool)(block==trie->dataNullOffset); |
|
471 } |
|
472 |
|
473 static int32_t |
|
474 allocIndex2Block(UNewTrie2 *trie) { |
|
475 int32_t newBlock, newTop; |
|
476 |
|
477 newBlock=trie->index2Length; |
|
478 newTop=newBlock+UTRIE2_INDEX_2_BLOCK_LENGTH; |
|
479 if(newTop>LENGTHOF(trie->index2)) { |
|
480 /* |
|
481 * Should never occur. |
|
482 * Either UTRIE2_MAX_BUILD_TIME_INDEX_LENGTH is incorrect, |
|
483 * or the code writes more values than should be possible. |
|
484 */ |
|
485 return -1; |
|
486 } |
|
487 trie->index2Length=newTop; |
|
488 uprv_memcpy(trie->index2+newBlock, trie->index2+trie->index2NullOffset, UTRIE2_INDEX_2_BLOCK_LENGTH*4); |
|
489 return newBlock; |
|
490 } |
|
491 |
|
492 static int32_t |
|
493 getIndex2Block(UNewTrie2 *trie, UChar32 c, UBool forLSCP) { |
|
494 int32_t i1, i2; |
|
495 |
|
496 if(U_IS_LEAD(c) && forLSCP) { |
|
497 return UTRIE2_LSCP_INDEX_2_OFFSET; |
|
498 } |
|
499 |
|
500 i1=c>>UTRIE2_SHIFT_1; |
|
501 i2=trie->index1[i1]; |
|
502 if(i2==trie->index2NullOffset) { |
|
503 i2=allocIndex2Block(trie); |
|
504 if(i2<0) { |
|
505 return -1; /* program error */ |
|
506 } |
|
507 trie->index1[i1]=i2; |
|
508 } |
|
509 return i2; |
|
510 } |
|
511 |
|
512 static int32_t |
|
513 allocDataBlock(UNewTrie2 *trie, int32_t copyBlock) { |
|
514 int32_t newBlock, newTop; |
|
515 |
|
516 if(trie->firstFreeBlock!=0) { |
|
517 /* get the first free block */ |
|
518 newBlock=trie->firstFreeBlock; |
|
519 trie->firstFreeBlock=-trie->map[newBlock>>UTRIE2_SHIFT_2]; |
|
520 } else { |
|
521 /* get a new block from the high end */ |
|
522 newBlock=trie->dataLength; |
|
523 newTop=newBlock+UTRIE2_DATA_BLOCK_LENGTH; |
|
524 if(newTop>trie->dataCapacity) { |
|
525 /* out of memory in the data array */ |
|
526 int32_t capacity; |
|
527 uint32_t *data; |
|
528 |
|
529 if(trie->dataCapacity<UNEWTRIE2_MEDIUM_DATA_LENGTH) { |
|
530 capacity=UNEWTRIE2_MEDIUM_DATA_LENGTH; |
|
531 } else if(trie->dataCapacity<UNEWTRIE2_MAX_DATA_LENGTH) { |
|
532 capacity=UNEWTRIE2_MAX_DATA_LENGTH; |
|
533 } else { |
|
534 /* |
|
535 * Should never occur. |
|
536 * Either UNEWTRIE2_MAX_DATA_LENGTH is incorrect, |
|
537 * or the code writes more values than should be possible. |
|
538 */ |
|
539 return -1; |
|
540 } |
|
541 data=(uint32_t *)uprv_malloc(capacity*4); |
|
542 if(data==NULL) { |
|
543 return -1; |
|
544 } |
|
545 uprv_memcpy(data, trie->data, trie->dataLength*4); |
|
546 uprv_free(trie->data); |
|
547 trie->data=data; |
|
548 trie->dataCapacity=capacity; |
|
549 } |
|
550 trie->dataLength=newTop; |
|
551 } |
|
552 uprv_memcpy(trie->data+newBlock, trie->data+copyBlock, UTRIE2_DATA_BLOCK_LENGTH*4); |
|
553 trie->map[newBlock>>UTRIE2_SHIFT_2]=0; |
|
554 return newBlock; |
|
555 } |
|
556 |
|
557 /* call when the block's reference counter reaches 0 */ |
|
558 static void |
|
559 releaseDataBlock(UNewTrie2 *trie, int32_t block) { |
|
560 /* put this block at the front of the free-block chain */ |
|
561 trie->map[block>>UTRIE2_SHIFT_2]=-trie->firstFreeBlock; |
|
562 trie->firstFreeBlock=block; |
|
563 } |
|
564 |
|
565 static inline UBool |
|
566 isWritableBlock(UNewTrie2 *trie, int32_t block) { |
|
567 return (UBool)(block!=trie->dataNullOffset && 1==trie->map[block>>UTRIE2_SHIFT_2]); |
|
568 } |
|
569 |
|
570 static inline void |
|
571 setIndex2Entry(UNewTrie2 *trie, int32_t i2, int32_t block) { |
|
572 int32_t oldBlock; |
|
573 ++trie->map[block>>UTRIE2_SHIFT_2]; /* increment first, in case block==oldBlock! */ |
|
574 oldBlock=trie->index2[i2]; |
|
575 if(0 == --trie->map[oldBlock>>UTRIE2_SHIFT_2]) { |
|
576 releaseDataBlock(trie, oldBlock); |
|
577 } |
|
578 trie->index2[i2]=block; |
|
579 } |
|
580 |
|
581 /** |
|
582 * No error checking for illegal arguments. |
|
583 * |
|
584 * @return -1 if no new data block available (out of memory in data array) |
|
585 * @internal |
|
586 */ |
|
587 static int32_t |
|
588 getDataBlock(UNewTrie2 *trie, UChar32 c, UBool forLSCP) { |
|
589 int32_t i2, oldBlock, newBlock; |
|
590 |
|
591 i2=getIndex2Block(trie, c, forLSCP); |
|
592 if(i2<0) { |
|
593 return -1; /* program error */ |
|
594 } |
|
595 |
|
596 i2+=(c>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK; |
|
597 oldBlock=trie->index2[i2]; |
|
598 if(isWritableBlock(trie, oldBlock)) { |
|
599 return oldBlock; |
|
600 } |
|
601 |
|
602 /* allocate a new data block */ |
|
603 newBlock=allocDataBlock(trie, oldBlock); |
|
604 if(newBlock<0) { |
|
605 /* out of memory in the data array */ |
|
606 return -1; |
|
607 } |
|
608 setIndex2Entry(trie, i2, newBlock); |
|
609 return newBlock; |
|
610 } |
|
611 |
|
612 /** |
|
613 * @return TRUE if the value was successfully set |
|
614 */ |
|
615 static void |
|
616 set32(UNewTrie2 *trie, |
|
617 UChar32 c, UBool forLSCP, uint32_t value, |
|
618 UErrorCode *pErrorCode) { |
|
619 int32_t block; |
|
620 |
|
621 if(trie==NULL || trie->isCompacted) { |
|
622 *pErrorCode=U_NO_WRITE_PERMISSION; |
|
623 return; |
|
624 } |
|
625 |
|
626 block=getDataBlock(trie, c, forLSCP); |
|
627 if(block<0) { |
|
628 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; |
|
629 return; |
|
630 } |
|
631 |
|
632 trie->data[block+(c&UTRIE2_DATA_MASK)]=value; |
|
633 } |
|
634 |
|
635 U_CAPI void U_EXPORT2 |
|
636 utrie2_set32(UTrie2 *trie, UChar32 c, uint32_t value, UErrorCode *pErrorCode) { |
|
637 if(U_FAILURE(*pErrorCode)) { |
|
638 return; |
|
639 } |
|
640 if((uint32_t)c>0x10ffff) { |
|
641 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
|
642 return; |
|
643 } |
|
644 set32(trie->newTrie, c, TRUE, value, pErrorCode); |
|
645 } |
|
646 |
|
647 U_CAPI void U_EXPORT2 |
|
648 utrie2_set32ForLeadSurrogateCodeUnit(UTrie2 *trie, |
|
649 UChar32 c, uint32_t value, |
|
650 UErrorCode *pErrorCode) { |
|
651 if(U_FAILURE(*pErrorCode)) { |
|
652 return; |
|
653 } |
|
654 if(!U_IS_LEAD(c)) { |
|
655 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
|
656 return; |
|
657 } |
|
658 set32(trie->newTrie, c, FALSE, value, pErrorCode); |
|
659 } |
|
660 |
|
661 static void |
|
662 writeBlock(uint32_t *block, uint32_t value) { |
|
663 uint32_t *limit=block+UTRIE2_DATA_BLOCK_LENGTH; |
|
664 while(block<limit) { |
|
665 *block++=value; |
|
666 } |
|
667 } |
|
668 |
|
669 /** |
|
670 * initialValue is ignored if overwrite=TRUE |
|
671 * @internal |
|
672 */ |
|
673 static void |
|
674 fillBlock(uint32_t *block, UChar32 start, UChar32 limit, |
|
675 uint32_t value, uint32_t initialValue, UBool overwrite) { |
|
676 uint32_t *pLimit; |
|
677 |
|
678 pLimit=block+limit; |
|
679 block+=start; |
|
680 if(overwrite) { |
|
681 while(block<pLimit) { |
|
682 *block++=value; |
|
683 } |
|
684 } else { |
|
685 while(block<pLimit) { |
|
686 if(*block==initialValue) { |
|
687 *block=value; |
|
688 } |
|
689 ++block; |
|
690 } |
|
691 } |
|
692 } |
|
693 |
|
694 U_CAPI void U_EXPORT2 |
|
695 utrie2_setRange32(UTrie2 *trie, |
|
696 UChar32 start, UChar32 end, |
|
697 uint32_t value, UBool overwrite, |
|
698 UErrorCode *pErrorCode) { |
|
699 /* |
|
700 * repeat value in [start..end] |
|
701 * mark index values for repeat-data blocks by setting bit 31 of the index values |
|
702 * fill around existing values if any, if(overwrite) |
|
703 */ |
|
704 UNewTrie2 *newTrie; |
|
705 int32_t block, rest, repeatBlock; |
|
706 UChar32 limit; |
|
707 |
|
708 if(U_FAILURE(*pErrorCode)) { |
|
709 return; |
|
710 } |
|
711 if((uint32_t)start>0x10ffff || (uint32_t)end>0x10ffff || start>end) { |
|
712 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
|
713 return; |
|
714 } |
|
715 newTrie=trie->newTrie; |
|
716 if(newTrie==NULL || newTrie->isCompacted) { |
|
717 *pErrorCode=U_NO_WRITE_PERMISSION; |
|
718 return; |
|
719 } |
|
720 if(!overwrite && value==newTrie->initialValue) { |
|
721 return; /* nothing to do */ |
|
722 } |
|
723 |
|
724 limit=end+1; |
|
725 if(start&UTRIE2_DATA_MASK) { |
|
726 UChar32 nextStart; |
|
727 |
|
728 /* set partial block at [start..following block boundary[ */ |
|
729 block=getDataBlock(newTrie, start, TRUE); |
|
730 if(block<0) { |
|
731 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; |
|
732 return; |
|
733 } |
|
734 |
|
735 nextStart=(start+UTRIE2_DATA_BLOCK_LENGTH)&~UTRIE2_DATA_MASK; |
|
736 if(nextStart<=limit) { |
|
737 fillBlock(newTrie->data+block, start&UTRIE2_DATA_MASK, UTRIE2_DATA_BLOCK_LENGTH, |
|
738 value, newTrie->initialValue, overwrite); |
|
739 start=nextStart; |
|
740 } else { |
|
741 fillBlock(newTrie->data+block, start&UTRIE2_DATA_MASK, limit&UTRIE2_DATA_MASK, |
|
742 value, newTrie->initialValue, overwrite); |
|
743 return; |
|
744 } |
|
745 } |
|
746 |
|
747 /* number of positions in the last, partial block */ |
|
748 rest=limit&UTRIE2_DATA_MASK; |
|
749 |
|
750 /* round down limit to a block boundary */ |
|
751 limit&=~UTRIE2_DATA_MASK; |
|
752 |
|
753 /* iterate over all-value blocks */ |
|
754 if(value==newTrie->initialValue) { |
|
755 repeatBlock=newTrie->dataNullOffset; |
|
756 } else { |
|
757 repeatBlock=-1; |
|
758 } |
|
759 |
|
760 while(start<limit) { |
|
761 int32_t i2; |
|
762 UBool setRepeatBlock=FALSE; |
|
763 |
|
764 if(value==newTrie->initialValue && isInNullBlock(newTrie, start, TRUE)) { |
|
765 start+=UTRIE2_DATA_BLOCK_LENGTH; /* nothing to do */ |
|
766 continue; |
|
767 } |
|
768 |
|
769 /* get index value */ |
|
770 i2=getIndex2Block(newTrie, start, TRUE); |
|
771 if(i2<0) { |
|
772 *pErrorCode=U_INTERNAL_PROGRAM_ERROR; |
|
773 return; |
|
774 } |
|
775 i2+=(start>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK; |
|
776 block=newTrie->index2[i2]; |
|
777 if(isWritableBlock(newTrie, block)) { |
|
778 /* already allocated */ |
|
779 if(overwrite && block>=UNEWTRIE2_DATA_0800_OFFSET) { |
|
780 /* |
|
781 * We overwrite all values, and it's not a |
|
782 * protected (ASCII-linear or 2-byte UTF-8) block: |
|
783 * replace with the repeatBlock. |
|
784 */ |
|
785 setRepeatBlock=TRUE; |
|
786 } else { |
|
787 /* !overwrite, or protected block: just write the values into this block */ |
|
788 fillBlock(newTrie->data+block, |
|
789 0, UTRIE2_DATA_BLOCK_LENGTH, |
|
790 value, newTrie->initialValue, overwrite); |
|
791 } |
|
792 } else if(newTrie->data[block]!=value && (overwrite || block==newTrie->dataNullOffset)) { |
|
793 /* |
|
794 * Set the repeatBlock instead of the null block or previous repeat block: |
|
795 * |
|
796 * If !isWritableBlock() then all entries in the block have the same value |
|
797 * because it's the null block or a range block (the repeatBlock from a previous |
|
798 * call to utrie2_setRange32()). |
|
799 * No other blocks are used multiple times before compacting. |
|
800 * |
|
801 * The null block is the only non-writable block with the initialValue because |
|
802 * of the repeatBlock initialization above. (If value==initialValue, then |
|
803 * the repeatBlock will be the null data block.) |
|
804 * |
|
805 * We set our repeatBlock if the desired value differs from the block's value, |
|
806 * and if we overwrite any data or if the data is all initial values |
|
807 * (which is the same as the block being the null block, see above). |
|
808 */ |
|
809 setRepeatBlock=TRUE; |
|
810 } |
|
811 if(setRepeatBlock) { |
|
812 if(repeatBlock>=0) { |
|
813 setIndex2Entry(newTrie, i2, repeatBlock); |
|
814 } else { |
|
815 /* create and set and fill the repeatBlock */ |
|
816 repeatBlock=getDataBlock(newTrie, start, TRUE); |
|
817 if(repeatBlock<0) { |
|
818 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; |
|
819 return; |
|
820 } |
|
821 writeBlock(newTrie->data+repeatBlock, value); |
|
822 } |
|
823 } |
|
824 |
|
825 start+=UTRIE2_DATA_BLOCK_LENGTH; |
|
826 } |
|
827 |
|
828 if(rest>0) { |
|
829 /* set partial block at [last block boundary..limit[ */ |
|
830 block=getDataBlock(newTrie, start, TRUE); |
|
831 if(block<0) { |
|
832 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; |
|
833 return; |
|
834 } |
|
835 |
|
836 fillBlock(newTrie->data+block, 0, rest, value, newTrie->initialValue, overwrite); |
|
837 } |
|
838 |
|
839 return; |
|
840 } |
|
841 |
|
842 /* compaction --------------------------------------------------------------- */ |
|
843 |
|
844 static inline UBool |
|
845 equal_int32(const int32_t *s, const int32_t *t, int32_t length) { |
|
846 while(length>0 && *s==*t) { |
|
847 ++s; |
|
848 ++t; |
|
849 --length; |
|
850 } |
|
851 return (UBool)(length==0); |
|
852 } |
|
853 |
|
854 static inline UBool |
|
855 equal_uint32(const uint32_t *s, const uint32_t *t, int32_t length) { |
|
856 while(length>0 && *s==*t) { |
|
857 ++s; |
|
858 ++t; |
|
859 --length; |
|
860 } |
|
861 return (UBool)(length==0); |
|
862 } |
|
863 |
|
864 static int32_t |
|
865 findSameIndex2Block(const int32_t *idx, int32_t index2Length, int32_t otherBlock) { |
|
866 int32_t block; |
|
867 |
|
868 /* ensure that we do not even partially get past index2Length */ |
|
869 index2Length-=UTRIE2_INDEX_2_BLOCK_LENGTH; |
|
870 |
|
871 for(block=0; block<=index2Length; ++block) { |
|
872 if(equal_int32(idx+block, idx+otherBlock, UTRIE2_INDEX_2_BLOCK_LENGTH)) { |
|
873 return block; |
|
874 } |
|
875 } |
|
876 return -1; |
|
877 } |
|
878 |
|
879 static int32_t |
|
880 findSameDataBlock(const uint32_t *data, int32_t dataLength, int32_t otherBlock, int32_t blockLength) { |
|
881 int32_t block; |
|
882 |
|
883 /* ensure that we do not even partially get past dataLength */ |
|
884 dataLength-=blockLength; |
|
885 |
|
886 for(block=0; block<=dataLength; block+=UTRIE2_DATA_GRANULARITY) { |
|
887 if(equal_uint32(data+block, data+otherBlock, blockLength)) { |
|
888 return block; |
|
889 } |
|
890 } |
|
891 return -1; |
|
892 } |
|
893 |
|
894 /* |
|
895 * Find the start of the last range in the trie by enumerating backward. |
|
896 * Indexes for supplementary code points higher than this will be omitted. |
|
897 */ |
|
898 static UChar32 |
|
899 findHighStart(UNewTrie2 *trie, uint32_t highValue) { |
|
900 const uint32_t *data32; |
|
901 |
|
902 uint32_t value, initialValue; |
|
903 UChar32 c, prev; |
|
904 int32_t i1, i2, j, i2Block, prevI2Block, index2NullOffset, block, prevBlock, nullBlock; |
|
905 |
|
906 data32=trie->data; |
|
907 initialValue=trie->initialValue; |
|
908 |
|
909 index2NullOffset=trie->index2NullOffset; |
|
910 nullBlock=trie->dataNullOffset; |
|
911 |
|
912 /* set variables for previous range */ |
|
913 if(highValue==initialValue) { |
|
914 prevI2Block=index2NullOffset; |
|
915 prevBlock=nullBlock; |
|
916 } else { |
|
917 prevI2Block=-1; |
|
918 prevBlock=-1; |
|
919 } |
|
920 prev=0x110000; |
|
921 |
|
922 /* enumerate index-2 blocks */ |
|
923 i1=UNEWTRIE2_INDEX_1_LENGTH; |
|
924 c=prev; |
|
925 while(c>0) { |
|
926 i2Block=trie->index1[--i1]; |
|
927 if(i2Block==prevI2Block) { |
|
928 /* the index-2 block is the same as the previous one, and filled with highValue */ |
|
929 c-=UTRIE2_CP_PER_INDEX_1_ENTRY; |
|
930 continue; |
|
931 } |
|
932 prevI2Block=i2Block; |
|
933 if(i2Block==index2NullOffset) { |
|
934 /* this is the null index-2 block */ |
|
935 if(highValue!=initialValue) { |
|
936 return c; |
|
937 } |
|
938 c-=UTRIE2_CP_PER_INDEX_1_ENTRY; |
|
939 } else { |
|
940 /* enumerate data blocks for one index-2 block */ |
|
941 for(i2=UTRIE2_INDEX_2_BLOCK_LENGTH; i2>0;) { |
|
942 block=trie->index2[i2Block+ --i2]; |
|
943 if(block==prevBlock) { |
|
944 /* the block is the same as the previous one, and filled with highValue */ |
|
945 c-=UTRIE2_DATA_BLOCK_LENGTH; |
|
946 continue; |
|
947 } |
|
948 prevBlock=block; |
|
949 if(block==nullBlock) { |
|
950 /* this is the null data block */ |
|
951 if(highValue!=initialValue) { |
|
952 return c; |
|
953 } |
|
954 c-=UTRIE2_DATA_BLOCK_LENGTH; |
|
955 } else { |
|
956 for(j=UTRIE2_DATA_BLOCK_LENGTH; j>0;) { |
|
957 value=data32[block+ --j]; |
|
958 if(value!=highValue) { |
|
959 return c; |
|
960 } |
|
961 --c; |
|
962 } |
|
963 } |
|
964 } |
|
965 } |
|
966 } |
|
967 |
|
968 /* deliver last range */ |
|
969 return 0; |
|
970 } |
|
971 |
|
972 /* |
|
973 * Compact a build-time trie. |
|
974 * |
|
975 * The compaction |
|
976 * - removes blocks that are identical with earlier ones |
|
977 * - overlaps adjacent blocks as much as possible (if overlap==TRUE) |
|
978 * - moves blocks in steps of the data granularity |
|
979 * - moves and overlaps blocks that overlap with multiple values in the overlap region |
|
980 * |
|
981 * It does not |
|
982 * - try to move and overlap blocks that are not already adjacent |
|
983 */ |
|
984 static void |
|
985 compactData(UNewTrie2 *trie) { |
|
986 int32_t start, newStart, movedStart; |
|
987 int32_t blockLength, overlap; |
|
988 int32_t i, mapIndex, blockCount; |
|
989 |
|
990 /* do not compact linear-ASCII data */ |
|
991 newStart=UTRIE2_DATA_START_OFFSET; |
|
992 for(start=0, i=0; start<newStart; start+=UTRIE2_DATA_BLOCK_LENGTH, ++i) { |
|
993 trie->map[i]=start; |
|
994 } |
|
995 |
|
996 /* |
|
997 * Start with a block length of 64 for 2-byte UTF-8, |
|
998 * then switch to UTRIE2_DATA_BLOCK_LENGTH. |
|
999 */ |
|
1000 blockLength=64; |
|
1001 blockCount=blockLength>>UTRIE2_SHIFT_2; |
|
1002 for(start=newStart; start<trie->dataLength;) { |
|
1003 /* |
|
1004 * start: index of first entry of current block |
|
1005 * newStart: index where the current block is to be moved |
|
1006 * (right after current end of already-compacted data) |
|
1007 */ |
|
1008 if(start==UNEWTRIE2_DATA_0800_OFFSET) { |
|
1009 blockLength=UTRIE2_DATA_BLOCK_LENGTH; |
|
1010 blockCount=1; |
|
1011 } |
|
1012 |
|
1013 /* skip blocks that are not used */ |
|
1014 if(trie->map[start>>UTRIE2_SHIFT_2]<=0) { |
|
1015 /* advance start to the next block */ |
|
1016 start+=blockLength; |
|
1017 |
|
1018 /* leave newStart with the previous block! */ |
|
1019 continue; |
|
1020 } |
|
1021 |
|
1022 /* search for an identical block */ |
|
1023 if( (movedStart=findSameDataBlock(trie->data, newStart, start, blockLength)) |
|
1024 >=0 |
|
1025 ) { |
|
1026 /* found an identical block, set the other block's index value for the current block */ |
|
1027 for(i=blockCount, mapIndex=start>>UTRIE2_SHIFT_2; i>0; --i) { |
|
1028 trie->map[mapIndex++]=movedStart; |
|
1029 movedStart+=UTRIE2_DATA_BLOCK_LENGTH; |
|
1030 } |
|
1031 |
|
1032 /* advance start to the next block */ |
|
1033 start+=blockLength; |
|
1034 |
|
1035 /* leave newStart with the previous block! */ |
|
1036 continue; |
|
1037 } |
|
1038 |
|
1039 /* see if the beginning of this block can be overlapped with the end of the previous block */ |
|
1040 /* look for maximum overlap (modulo granularity) with the previous, adjacent block */ |
|
1041 for(overlap=blockLength-UTRIE2_DATA_GRANULARITY; |
|
1042 overlap>0 && !equal_uint32(trie->data+(newStart-overlap), trie->data+start, overlap); |
|
1043 overlap-=UTRIE2_DATA_GRANULARITY) {} |
|
1044 |
|
1045 if(overlap>0 || newStart<start) { |
|
1046 /* some overlap, or just move the whole block */ |
|
1047 movedStart=newStart-overlap; |
|
1048 for(i=blockCount, mapIndex=start>>UTRIE2_SHIFT_2; i>0; --i) { |
|
1049 trie->map[mapIndex++]=movedStart; |
|
1050 movedStart+=UTRIE2_DATA_BLOCK_LENGTH; |
|
1051 } |
|
1052 |
|
1053 /* move the non-overlapping indexes to their new positions */ |
|
1054 start+=overlap; |
|
1055 for(i=blockLength-overlap; i>0; --i) { |
|
1056 trie->data[newStart++]=trie->data[start++]; |
|
1057 } |
|
1058 } else /* no overlap && newStart==start */ { |
|
1059 for(i=blockCount, mapIndex=start>>UTRIE2_SHIFT_2; i>0; --i) { |
|
1060 trie->map[mapIndex++]=start; |
|
1061 start+=UTRIE2_DATA_BLOCK_LENGTH; |
|
1062 } |
|
1063 newStart=start; |
|
1064 } |
|
1065 } |
|
1066 |
|
1067 /* now adjust the index-2 table */ |
|
1068 for(i=0; i<trie->index2Length; ++i) { |
|
1069 if(i==UNEWTRIE2_INDEX_GAP_OFFSET) { |
|
1070 /* Gap indexes are invalid (-1). Skip over the gap. */ |
|
1071 i+=UNEWTRIE2_INDEX_GAP_LENGTH; |
|
1072 } |
|
1073 trie->index2[i]=trie->map[trie->index2[i]>>UTRIE2_SHIFT_2]; |
|
1074 } |
|
1075 trie->dataNullOffset=trie->map[trie->dataNullOffset>>UTRIE2_SHIFT_2]; |
|
1076 |
|
1077 /* ensure dataLength alignment */ |
|
1078 while((newStart&(UTRIE2_DATA_GRANULARITY-1))!=0) { |
|
1079 trie->data[newStart++]=trie->initialValue; |
|
1080 } |
|
1081 |
|
1082 #ifdef UTRIE2_DEBUG |
|
1083 /* we saved some space */ |
|
1084 printf("compacting UTrie2: count of 32-bit data words %lu->%lu\n", |
|
1085 (long)trie->dataLength, (long)newStart); |
|
1086 #endif |
|
1087 |
|
1088 trie->dataLength=newStart; |
|
1089 } |
|
1090 |
|
1091 static void |
|
1092 compactIndex2(UNewTrie2 *trie) { |
|
1093 int32_t i, start, newStart, movedStart, overlap; |
|
1094 |
|
1095 /* do not compact linear-BMP index-2 blocks */ |
|
1096 newStart=UTRIE2_INDEX_2_BMP_LENGTH; |
|
1097 for(start=0, i=0; start<newStart; start+=UTRIE2_INDEX_2_BLOCK_LENGTH, ++i) { |
|
1098 trie->map[i]=start; |
|
1099 } |
|
1100 |
|
1101 /* Reduce the index table gap to what will be needed at runtime. */ |
|
1102 newStart+=UTRIE2_UTF8_2B_INDEX_2_LENGTH+((trie->highStart-0x10000)>>UTRIE2_SHIFT_1); |
|
1103 |
|
1104 for(start=UNEWTRIE2_INDEX_2_NULL_OFFSET; start<trie->index2Length;) { |
|
1105 /* |
|
1106 * start: index of first entry of current block |
|
1107 * newStart: index where the current block is to be moved |
|
1108 * (right after current end of already-compacted data) |
|
1109 */ |
|
1110 |
|
1111 /* search for an identical block */ |
|
1112 if( (movedStart=findSameIndex2Block(trie->index2, newStart, start)) |
|
1113 >=0 |
|
1114 ) { |
|
1115 /* found an identical block, set the other block's index value for the current block */ |
|
1116 trie->map[start>>UTRIE2_SHIFT_1_2]=movedStart; |
|
1117 |
|
1118 /* advance start to the next block */ |
|
1119 start+=UTRIE2_INDEX_2_BLOCK_LENGTH; |
|
1120 |
|
1121 /* leave newStart with the previous block! */ |
|
1122 continue; |
|
1123 } |
|
1124 |
|
1125 /* see if the beginning of this block can be overlapped with the end of the previous block */ |
|
1126 /* look for maximum overlap with the previous, adjacent block */ |
|
1127 for(overlap=UTRIE2_INDEX_2_BLOCK_LENGTH-1; |
|
1128 overlap>0 && !equal_int32(trie->index2+(newStart-overlap), trie->index2+start, overlap); |
|
1129 --overlap) {} |
|
1130 |
|
1131 if(overlap>0 || newStart<start) { |
|
1132 /* some overlap, or just move the whole block */ |
|
1133 trie->map[start>>UTRIE2_SHIFT_1_2]=newStart-overlap; |
|
1134 |
|
1135 /* move the non-overlapping indexes to their new positions */ |
|
1136 start+=overlap; |
|
1137 for(i=UTRIE2_INDEX_2_BLOCK_LENGTH-overlap; i>0; --i) { |
|
1138 trie->index2[newStart++]=trie->index2[start++]; |
|
1139 } |
|
1140 } else /* no overlap && newStart==start */ { |
|
1141 trie->map[start>>UTRIE2_SHIFT_1_2]=start; |
|
1142 start+=UTRIE2_INDEX_2_BLOCK_LENGTH; |
|
1143 newStart=start; |
|
1144 } |
|
1145 } |
|
1146 |
|
1147 /* now adjust the index-1 table */ |
|
1148 for(i=0; i<UNEWTRIE2_INDEX_1_LENGTH; ++i) { |
|
1149 trie->index1[i]=trie->map[trie->index1[i]>>UTRIE2_SHIFT_1_2]; |
|
1150 } |
|
1151 trie->index2NullOffset=trie->map[trie->index2NullOffset>>UTRIE2_SHIFT_1_2]; |
|
1152 |
|
1153 /* |
|
1154 * Ensure data table alignment: |
|
1155 * Needs to be granularity-aligned for 16-bit trie |
|
1156 * (so that dataMove will be down-shiftable), |
|
1157 * and 2-aligned for uint32_t data. |
|
1158 */ |
|
1159 while((newStart&((UTRIE2_DATA_GRANULARITY-1)|1))!=0) { |
|
1160 /* Arbitrary value: 0x3fffc not possible for real data. */ |
|
1161 trie->index2[newStart++]=(int32_t)0xffff<<UTRIE2_INDEX_SHIFT; |
|
1162 } |
|
1163 |
|
1164 #ifdef UTRIE2_DEBUG |
|
1165 /* we saved some space */ |
|
1166 printf("compacting UTrie2: count of 16-bit index-2 words %lu->%lu\n", |
|
1167 (long)trie->index2Length, (long)newStart); |
|
1168 #endif |
|
1169 |
|
1170 trie->index2Length=newStart; |
|
1171 } |
|
1172 |
|
1173 static void |
|
1174 compactTrie(UTrie2 *trie, UErrorCode *pErrorCode) { |
|
1175 UNewTrie2 *newTrie; |
|
1176 UChar32 highStart, suppHighStart; |
|
1177 uint32_t highValue; |
|
1178 |
|
1179 newTrie=trie->newTrie; |
|
1180 |
|
1181 /* find highStart and round it up */ |
|
1182 highValue=utrie2_get32(trie, 0x10ffff); |
|
1183 highStart=findHighStart(newTrie, highValue); |
|
1184 highStart=(highStart+(UTRIE2_CP_PER_INDEX_1_ENTRY-1))&~(UTRIE2_CP_PER_INDEX_1_ENTRY-1); |
|
1185 if(highStart==0x110000) { |
|
1186 highValue=trie->errorValue; |
|
1187 } |
|
1188 |
|
1189 /* |
|
1190 * Set trie->highStart only after utrie2_get32(trie, highStart). |
|
1191 * Otherwise utrie2_get32(trie, highStart) would try to read the highValue. |
|
1192 */ |
|
1193 trie->highStart=newTrie->highStart=highStart; |
|
1194 |
|
1195 #ifdef UTRIE2_DEBUG |
|
1196 printf("UTrie2: highStart U+%04lx highValue 0x%lx initialValue 0x%lx\n", |
|
1197 (long)highStart, (long)highValue, (long)trie->initialValue); |
|
1198 #endif |
|
1199 |
|
1200 if(highStart<0x110000) { |
|
1201 /* Blank out [highStart..10ffff] to release associated data blocks. */ |
|
1202 suppHighStart= highStart<=0x10000 ? 0x10000 : highStart; |
|
1203 utrie2_setRange32(trie, suppHighStart, 0x10ffff, trie->initialValue, TRUE, pErrorCode); |
|
1204 if(U_FAILURE(*pErrorCode)) { |
|
1205 return; |
|
1206 } |
|
1207 } |
|
1208 |
|
1209 compactData(newTrie); |
|
1210 if(highStart>0x10000) { |
|
1211 compactIndex2(newTrie); |
|
1212 #ifdef UTRIE2_DEBUG |
|
1213 } else { |
|
1214 printf("UTrie2: highStart U+%04lx count of 16-bit index-2 words %lu->%lu\n", |
|
1215 (long)highStart, (long)trie->newTrie->index2Length, (long)UTRIE2_INDEX_1_OFFSET); |
|
1216 #endif |
|
1217 } |
|
1218 |
|
1219 /* |
|
1220 * Store the highValue in the data array and round up the dataLength. |
|
1221 * Must be done after compactData() because that assumes that dataLength |
|
1222 * is a multiple of UTRIE2_DATA_BLOCK_LENGTH. |
|
1223 */ |
|
1224 newTrie->data[newTrie->dataLength++]=highValue; |
|
1225 while((newTrie->dataLength&(UTRIE2_DATA_GRANULARITY-1))!=0) { |
|
1226 newTrie->data[newTrie->dataLength++]=trie->initialValue; |
|
1227 } |
|
1228 |
|
1229 newTrie->isCompacted=TRUE; |
|
1230 } |
|
1231 |
|
1232 /* serialization ------------------------------------------------------------ */ |
|
1233 |
|
1234 /** |
|
1235 * Maximum length of the runtime index array. |
|
1236 * Limited by its own 16-bit index values, and by uint16_t UTrie2Header.indexLength. |
|
1237 * (The actual maximum length is lower, |
|
1238 * (0x110000>>UTRIE2_SHIFT_2)+UTRIE2_UTF8_2B_INDEX_2_LENGTH+UTRIE2_MAX_INDEX_1_LENGTH.) |
|
1239 */ |
|
1240 #define UTRIE2_MAX_INDEX_LENGTH 0xffff |
|
1241 |
|
1242 /** |
|
1243 * Maximum length of the runtime data array. |
|
1244 * Limited by 16-bit index values that are left-shifted by UTRIE2_INDEX_SHIFT, |
|
1245 * and by uint16_t UTrie2Header.shiftedDataLength. |
|
1246 */ |
|
1247 #define UTRIE2_MAX_DATA_LENGTH (0xffff<<UTRIE2_INDEX_SHIFT) |
|
1248 |
|
1249 /* Compact and internally serialize the trie. */ |
|
1250 U_CAPI void U_EXPORT2 |
|
1251 utrie2_freeze(UTrie2 *trie, UTrie2ValueBits valueBits, UErrorCode *pErrorCode) { |
|
1252 UNewTrie2 *newTrie; |
|
1253 UTrie2Header *header; |
|
1254 uint32_t *p; |
|
1255 uint16_t *dest16; |
|
1256 int32_t i, length; |
|
1257 int32_t allIndexesLength; |
|
1258 int32_t dataMove; /* >0 if the data is moved to the end of the index array */ |
|
1259 UChar32 highStart; |
|
1260 |
|
1261 /* argument check */ |
|
1262 if(U_FAILURE(*pErrorCode)) { |
|
1263 return; |
|
1264 } |
|
1265 if( trie==NULL || |
|
1266 valueBits<0 || UTRIE2_COUNT_VALUE_BITS<=valueBits |
|
1267 ) { |
|
1268 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
|
1269 return; |
|
1270 } |
|
1271 newTrie=trie->newTrie; |
|
1272 if(newTrie==NULL) { |
|
1273 /* already frozen */ |
|
1274 UTrie2ValueBits frozenValueBits= |
|
1275 trie->data16!=NULL ? UTRIE2_16_VALUE_BITS : UTRIE2_32_VALUE_BITS; |
|
1276 if(valueBits!=frozenValueBits) { |
|
1277 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
|
1278 } |
|
1279 return; |
|
1280 } |
|
1281 |
|
1282 /* compact if necessary */ |
|
1283 if(!newTrie->isCompacted) { |
|
1284 compactTrie(trie, pErrorCode); |
|
1285 if(U_FAILURE(*pErrorCode)) { |
|
1286 return; |
|
1287 } |
|
1288 } |
|
1289 highStart=trie->highStart; |
|
1290 |
|
1291 if(highStart<=0x10000) { |
|
1292 allIndexesLength=UTRIE2_INDEX_1_OFFSET; |
|
1293 } else { |
|
1294 allIndexesLength=newTrie->index2Length; |
|
1295 } |
|
1296 if(valueBits==UTRIE2_16_VALUE_BITS) { |
|
1297 dataMove=allIndexesLength; |
|
1298 } else { |
|
1299 dataMove=0; |
|
1300 } |
|
1301 |
|
1302 /* are indexLength and dataLength within limits? */ |
|
1303 if( /* for unshifted indexLength */ |
|
1304 allIndexesLength>UTRIE2_MAX_INDEX_LENGTH || |
|
1305 /* for unshifted dataNullOffset */ |
|
1306 (dataMove+newTrie->dataNullOffset)>0xffff || |
|
1307 /* for unshifted 2-byte UTF-8 index-2 values */ |
|
1308 (dataMove+UNEWTRIE2_DATA_0800_OFFSET)>0xffff || |
|
1309 /* for shiftedDataLength */ |
|
1310 (dataMove+newTrie->dataLength)>UTRIE2_MAX_DATA_LENGTH |
|
1311 ) { |
|
1312 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; |
|
1313 return; |
|
1314 } |
|
1315 |
|
1316 /* calculate the total serialized length */ |
|
1317 length=sizeof(UTrie2Header)+allIndexesLength*2; |
|
1318 if(valueBits==UTRIE2_16_VALUE_BITS) { |
|
1319 length+=newTrie->dataLength*2; |
|
1320 } else { |
|
1321 length+=newTrie->dataLength*4; |
|
1322 } |
|
1323 |
|
1324 trie->memory=uprv_malloc(length); |
|
1325 if(trie->memory==NULL) { |
|
1326 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; |
|
1327 return; |
|
1328 } |
|
1329 trie->length=length; |
|
1330 trie->isMemoryOwned=TRUE; |
|
1331 |
|
1332 trie->indexLength=allIndexesLength; |
|
1333 trie->dataLength=newTrie->dataLength; |
|
1334 if(highStart<=0x10000) { |
|
1335 trie->index2NullOffset=0xffff; |
|
1336 } else { |
|
1337 trie->index2NullOffset=UTRIE2_INDEX_2_OFFSET+newTrie->index2NullOffset; |
|
1338 } |
|
1339 trie->dataNullOffset=(uint16_t)(dataMove+newTrie->dataNullOffset); |
|
1340 trie->highValueIndex=dataMove+trie->dataLength-UTRIE2_DATA_GRANULARITY; |
|
1341 |
|
1342 /* set the header fields */ |
|
1343 header=(UTrie2Header *)trie->memory; |
|
1344 |
|
1345 header->signature=UTRIE2_SIG; /* "Tri2" */ |
|
1346 header->options=(uint16_t)valueBits; |
|
1347 |
|
1348 header->indexLength=(uint16_t)trie->indexLength; |
|
1349 header->shiftedDataLength=(uint16_t)(trie->dataLength>>UTRIE2_INDEX_SHIFT); |
|
1350 header->index2NullOffset=trie->index2NullOffset; |
|
1351 header->dataNullOffset=trie->dataNullOffset; |
|
1352 header->shiftedHighStart=(uint16_t)(highStart>>UTRIE2_SHIFT_1); |
|
1353 |
|
1354 /* fill the index and data arrays */ |
|
1355 dest16=(uint16_t *)(header+1); |
|
1356 trie->index=dest16; |
|
1357 |
|
1358 /* write the index-2 array values shifted right by UTRIE2_INDEX_SHIFT, after adding dataMove */ |
|
1359 p=(uint32_t *)newTrie->index2; |
|
1360 for(i=UTRIE2_INDEX_2_BMP_LENGTH; i>0; --i) { |
|
1361 *dest16++=(uint16_t)((dataMove + *p++)>>UTRIE2_INDEX_SHIFT); |
|
1362 } |
|
1363 |
|
1364 /* write UTF-8 2-byte index-2 values, not right-shifted */ |
|
1365 for(i=0; i<(0xc2-0xc0); ++i) { /* C0..C1 */ |
|
1366 *dest16++=(uint16_t)(dataMove+UTRIE2_BAD_UTF8_DATA_OFFSET); |
|
1367 } |
|
1368 for(; i<(0xe0-0xc0); ++i) { /* C2..DF */ |
|
1369 *dest16++=(uint16_t)(dataMove+newTrie->index2[i<<(6-UTRIE2_SHIFT_2)]); |
|
1370 } |
|
1371 |
|
1372 if(highStart>0x10000) { |
|
1373 int32_t index1Length=(highStart-0x10000)>>UTRIE2_SHIFT_1; |
|
1374 int32_t index2Offset=UTRIE2_INDEX_2_BMP_LENGTH+UTRIE2_UTF8_2B_INDEX_2_LENGTH+index1Length; |
|
1375 |
|
1376 /* write 16-bit index-1 values for supplementary code points */ |
|
1377 p=(uint32_t *)newTrie->index1+UTRIE2_OMITTED_BMP_INDEX_1_LENGTH; |
|
1378 for(i=index1Length; i>0; --i) { |
|
1379 *dest16++=(uint16_t)(UTRIE2_INDEX_2_OFFSET + *p++); |
|
1380 } |
|
1381 |
|
1382 /* |
|
1383 * write the index-2 array values for supplementary code points, |
|
1384 * shifted right by UTRIE2_INDEX_SHIFT, after adding dataMove |
|
1385 */ |
|
1386 p=(uint32_t *)newTrie->index2+index2Offset; |
|
1387 for(i=newTrie->index2Length-index2Offset; i>0; --i) { |
|
1388 *dest16++=(uint16_t)((dataMove + *p++)>>UTRIE2_INDEX_SHIFT); |
|
1389 } |
|
1390 } |
|
1391 |
|
1392 /* write the 16/32-bit data array */ |
|
1393 switch(valueBits) { |
|
1394 case UTRIE2_16_VALUE_BITS: |
|
1395 /* write 16-bit data values */ |
|
1396 trie->data16=dest16; |
|
1397 trie->data32=NULL; |
|
1398 p=newTrie->data; |
|
1399 for(i=newTrie->dataLength; i>0; --i) { |
|
1400 *dest16++=(uint16_t)*p++; |
|
1401 } |
|
1402 break; |
|
1403 case UTRIE2_32_VALUE_BITS: |
|
1404 /* write 32-bit data values */ |
|
1405 trie->data16=NULL; |
|
1406 trie->data32=(uint32_t *)dest16; |
|
1407 uprv_memcpy(dest16, newTrie->data, newTrie->dataLength*4); |
|
1408 break; |
|
1409 default: |
|
1410 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
|
1411 return; |
|
1412 } |
|
1413 |
|
1414 /* Delete the UNewTrie2. */ |
|
1415 uprv_free(newTrie->data); |
|
1416 uprv_free(newTrie); |
|
1417 trie->newTrie=NULL; |
|
1418 } |
|
1419 |
|
1420 U_CAPI UBool U_EXPORT2 |
|
1421 utrie2_isFrozen(const UTrie2 *trie) { |
|
1422 return (UBool)(trie->newTrie==NULL); |
|
1423 } |
|
1424 |
|
1425 U_CAPI int32_t U_EXPORT2 |
|
1426 utrie2_serialize(UTrie2 *trie, |
|
1427 void *data, int32_t capacity, |
|
1428 UErrorCode *pErrorCode) { |
|
1429 /* argument check */ |
|
1430 if(U_FAILURE(*pErrorCode)) { |
|
1431 return 0; |
|
1432 } |
|
1433 |
|
1434 if( trie==NULL || trie->memory==NULL || trie->newTrie!=NULL || |
|
1435 capacity<0 || (capacity>0 && (data==NULL || (U_POINTER_MASK_LSB(data, 3)!=0))) |
|
1436 ) { |
|
1437 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
|
1438 return 0; |
|
1439 } |
|
1440 |
|
1441 if(capacity>=trie->length) { |
|
1442 uprv_memcpy(data, trie->memory, trie->length); |
|
1443 } else { |
|
1444 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
|
1445 } |
|
1446 return trie->length; |
|
1447 } |
|
1448 |
|
1449 /* |
|
1450 * This is here to avoid a dependency from utrie2.cpp on utrie.c. |
|
1451 * This file already depends on utrie.c. |
|
1452 * Otherwise, this should be in utrie2.cpp right after utrie2_swap(). |
|
1453 */ |
|
1454 U_CAPI int32_t U_EXPORT2 |
|
1455 utrie2_swapAnyVersion(const UDataSwapper *ds, |
|
1456 const void *inData, int32_t length, void *outData, |
|
1457 UErrorCode *pErrorCode) { |
|
1458 if(U_SUCCESS(*pErrorCode)) { |
|
1459 switch(utrie2_getVersion(inData, length, TRUE)) { |
|
1460 case 1: |
|
1461 return utrie_swap(ds, inData, length, outData, pErrorCode); |
|
1462 case 2: |
|
1463 return utrie2_swap(ds, inData, length, outData, pErrorCode); |
|
1464 default: |
|
1465 *pErrorCode=U_INVALID_FORMAT_ERROR; |
|
1466 return 0; |
|
1467 } |
|
1468 } |
|
1469 return 0; |
|
1470 } |