Wed, 31 Dec 2014 07:22:50 +0100
Correct previous dual key logic pending first delivery installment.
michael@0 | 1 | /* |
michael@0 | 2 | ****************************************************************************** |
michael@0 | 3 | * |
michael@0 | 4 | * Copyright (C) 2001-2011, International Business Machines |
michael@0 | 5 | * Corporation and others. All Rights Reserved. |
michael@0 | 6 | * |
michael@0 | 7 | ****************************************************************************** |
michael@0 | 8 | * file name: utrie2_builder.cpp |
michael@0 | 9 | * encoding: US-ASCII |
michael@0 | 10 | * tab size: 8 (not used) |
michael@0 | 11 | * indentation:4 |
michael@0 | 12 | * |
michael@0 | 13 | * created on: 2008sep26 (split off from utrie2.c) |
michael@0 | 14 | * created by: Markus W. Scherer |
michael@0 | 15 | * |
michael@0 | 16 | * This is a common implementation of a Unicode trie. |
michael@0 | 17 | * It is a kind of compressed, serializable table of 16- or 32-bit values associated with |
michael@0 | 18 | * Unicode code points (0..0x10ffff). |
michael@0 | 19 | * This is the second common version of a Unicode trie (hence the name UTrie2). |
michael@0 | 20 | * See utrie2.h for a comparison. |
michael@0 | 21 | * |
michael@0 | 22 | * This file contains only the builder code. |
michael@0 | 23 | * See utrie2.c for the runtime and enumeration code. |
michael@0 | 24 | */ |
michael@0 | 25 | #ifdef UTRIE2_DEBUG |
michael@0 | 26 | # include <stdio.h> |
michael@0 | 27 | #endif |
michael@0 | 28 | |
michael@0 | 29 | #include "unicode/utypes.h" |
michael@0 | 30 | #include "cmemory.h" |
michael@0 | 31 | #include "utrie2.h" |
michael@0 | 32 | #include "utrie2_impl.h" |
michael@0 | 33 | |
michael@0 | 34 | #include "utrie.h" /* for utrie2_fromUTrie() and utrie_swap() */ |
michael@0 | 35 | |
michael@0 | 36 | #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) |
michael@0 | 37 | |
michael@0 | 38 | /* Implementation notes ----------------------------------------------------- */ |
michael@0 | 39 | |
michael@0 | 40 | /* |
michael@0 | 41 | * The UTRIE2_SHIFT_1, UTRIE2_SHIFT_2, UTRIE2_INDEX_SHIFT and other values |
michael@0 | 42 | * have been chosen to minimize trie sizes overall. |
michael@0 | 43 | * Most of the code is flexible enough to work with a range of values, |
michael@0 | 44 | * within certain limits. |
michael@0 | 45 | * |
michael@0 | 46 | * Exception: Support for separate values for lead surrogate code _units_ |
michael@0 | 47 | * vs. code _points_ was added after the constants were fixed, |
michael@0 | 48 | * and has not been tested nor particularly designed for different constant values. |
michael@0 | 49 | * (Especially the utrie2_enum() code that jumps to the special LSCP index-2 |
michael@0 | 50 | * part and back.) |
michael@0 | 51 | * |
michael@0 | 52 | * Requires UTRIE2_SHIFT_2<=6. Otherwise 0xc0 which is the top of the ASCII-linear data |
michael@0 | 53 | * including the bad-UTF-8-data block is not a multiple of UTRIE2_DATA_BLOCK_LENGTH |
michael@0 | 54 | * and map[block>>UTRIE2_SHIFT_2] (used in reference counting and compaction |
michael@0 | 55 | * remapping) stops working. |
michael@0 | 56 | * |
michael@0 | 57 | * Requires UTRIE2_SHIFT_1>=10 because utrie2_enumForLeadSurrogate() |
michael@0 | 58 | * assumes that a single index-2 block is used for 0x400 code points |
michael@0 | 59 | * corresponding to one lead surrogate. |
michael@0 | 60 | * |
michael@0 | 61 | * Requires UTRIE2_SHIFT_1<=16. Otherwise one single index-2 block contains |
michael@0 | 62 | * more than one Unicode plane, and the split of the index-2 table into a BMP |
michael@0 | 63 | * part and a supplementary part, with a gap in between, would not work. |
michael@0 | 64 | * |
michael@0 | 65 | * Requires UTRIE2_INDEX_SHIFT>=1 not because of the code but because |
michael@0 | 66 | * there is data with more than 64k distinct values, |
michael@0 | 67 | * for example for Unihan collation with a separate collation weight per |
michael@0 | 68 | * Han character. |
michael@0 | 69 | */ |
michael@0 | 70 | |
michael@0 | 71 | /* Building a trie ----------------------------------------------------------*/ |
michael@0 | 72 | |
michael@0 | 73 | enum { |
michael@0 | 74 | /** The null index-2 block, following the gap in the index-2 table. */ |
michael@0 | 75 | UNEWTRIE2_INDEX_2_NULL_OFFSET=UNEWTRIE2_INDEX_GAP_OFFSET+UNEWTRIE2_INDEX_GAP_LENGTH, |
michael@0 | 76 | |
michael@0 | 77 | /** The start of allocated index-2 blocks. */ |
michael@0 | 78 | UNEWTRIE2_INDEX_2_START_OFFSET=UNEWTRIE2_INDEX_2_NULL_OFFSET+UTRIE2_INDEX_2_BLOCK_LENGTH, |
michael@0 | 79 | |
michael@0 | 80 | /** |
michael@0 | 81 | * The null data block. |
michael@0 | 82 | * Length 64=0x40 even if UTRIE2_DATA_BLOCK_LENGTH is smaller, |
michael@0 | 83 | * to work with 6-bit trail bytes from 2-byte UTF-8. |
michael@0 | 84 | */ |
michael@0 | 85 | UNEWTRIE2_DATA_NULL_OFFSET=UTRIE2_DATA_START_OFFSET, |
michael@0 | 86 | |
michael@0 | 87 | /** The start of allocated data blocks. */ |
michael@0 | 88 | UNEWTRIE2_DATA_START_OFFSET=UNEWTRIE2_DATA_NULL_OFFSET+0x40, |
michael@0 | 89 | |
michael@0 | 90 | /** |
michael@0 | 91 | * The start of data blocks for U+0800 and above. |
michael@0 | 92 | * Below, compaction uses a block length of 64 for 2-byte UTF-8. |
michael@0 | 93 | * From here on, compaction uses UTRIE2_DATA_BLOCK_LENGTH. |
michael@0 | 94 | * Data values for 0x780 code points beyond ASCII. |
michael@0 | 95 | */ |
michael@0 | 96 | UNEWTRIE2_DATA_0800_OFFSET=UNEWTRIE2_DATA_START_OFFSET+0x780 |
michael@0 | 97 | }; |
michael@0 | 98 | |
michael@0 | 99 | /* Start with allocation of 16k data entries. */ |
michael@0 | 100 | #define UNEWTRIE2_INITIAL_DATA_LENGTH ((int32_t)1<<14) |
michael@0 | 101 | |
michael@0 | 102 | /* Grow about 8x each time. */ |
michael@0 | 103 | #define UNEWTRIE2_MEDIUM_DATA_LENGTH ((int32_t)1<<17) |
michael@0 | 104 | |
michael@0 | 105 | static int32_t |
michael@0 | 106 | allocIndex2Block(UNewTrie2 *trie); |
michael@0 | 107 | |
michael@0 | 108 | U_CAPI UTrie2 * U_EXPORT2 |
michael@0 | 109 | utrie2_open(uint32_t initialValue, uint32_t errorValue, UErrorCode *pErrorCode) { |
michael@0 | 110 | UTrie2 *trie; |
michael@0 | 111 | UNewTrie2 *newTrie; |
michael@0 | 112 | uint32_t *data; |
michael@0 | 113 | int32_t i, j; |
michael@0 | 114 | |
michael@0 | 115 | if(U_FAILURE(*pErrorCode)) { |
michael@0 | 116 | return NULL; |
michael@0 | 117 | } |
michael@0 | 118 | |
michael@0 | 119 | trie=(UTrie2 *)uprv_malloc(sizeof(UTrie2)); |
michael@0 | 120 | newTrie=(UNewTrie2 *)uprv_malloc(sizeof(UNewTrie2)); |
michael@0 | 121 | data=(uint32_t *)uprv_malloc(UNEWTRIE2_INITIAL_DATA_LENGTH*4); |
michael@0 | 122 | if(trie==NULL || newTrie==NULL || data==NULL) { |
michael@0 | 123 | uprv_free(trie); |
michael@0 | 124 | uprv_free(newTrie); |
michael@0 | 125 | uprv_free(data); |
michael@0 | 126 | *pErrorCode=U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 127 | return 0; |
michael@0 | 128 | } |
michael@0 | 129 | |
michael@0 | 130 | uprv_memset(trie, 0, sizeof(UTrie2)); |
michael@0 | 131 | trie->initialValue=initialValue; |
michael@0 | 132 | trie->errorValue=errorValue; |
michael@0 | 133 | trie->highStart=0x110000; |
michael@0 | 134 | trie->newTrie=newTrie; |
michael@0 | 135 | |
michael@0 | 136 | newTrie->data=data; |
michael@0 | 137 | newTrie->dataCapacity=UNEWTRIE2_INITIAL_DATA_LENGTH; |
michael@0 | 138 | newTrie->initialValue=initialValue; |
michael@0 | 139 | newTrie->errorValue=errorValue; |
michael@0 | 140 | newTrie->highStart=0x110000; |
michael@0 | 141 | newTrie->firstFreeBlock=0; /* no free block in the list */ |
michael@0 | 142 | newTrie->isCompacted=FALSE; |
michael@0 | 143 | |
michael@0 | 144 | /* |
michael@0 | 145 | * preallocate and reset |
michael@0 | 146 | * - ASCII |
michael@0 | 147 | * - the bad-UTF-8-data block |
michael@0 | 148 | * - the null data block |
michael@0 | 149 | */ |
michael@0 | 150 | for(i=0; i<0x80; ++i) { |
michael@0 | 151 | newTrie->data[i]=initialValue; |
michael@0 | 152 | } |
michael@0 | 153 | for(; i<0xc0; ++i) { |
michael@0 | 154 | newTrie->data[i]=errorValue; |
michael@0 | 155 | } |
michael@0 | 156 | for(i=UNEWTRIE2_DATA_NULL_OFFSET; i<UNEWTRIE2_DATA_START_OFFSET; ++i) { |
michael@0 | 157 | newTrie->data[i]=initialValue; |
michael@0 | 158 | } |
michael@0 | 159 | newTrie->dataNullOffset=UNEWTRIE2_DATA_NULL_OFFSET; |
michael@0 | 160 | newTrie->dataLength=UNEWTRIE2_DATA_START_OFFSET; |
michael@0 | 161 | |
michael@0 | 162 | /* set the index-2 indexes for the 2=0x80>>UTRIE2_SHIFT_2 ASCII data blocks */ |
michael@0 | 163 | for(i=0, j=0; j<0x80; ++i, j+=UTRIE2_DATA_BLOCK_LENGTH) { |
michael@0 | 164 | newTrie->index2[i]=j; |
michael@0 | 165 | newTrie->map[i]=1; |
michael@0 | 166 | } |
michael@0 | 167 | /* reference counts for the bad-UTF-8-data block */ |
michael@0 | 168 | for(; j<0xc0; ++i, j+=UTRIE2_DATA_BLOCK_LENGTH) { |
michael@0 | 169 | newTrie->map[i]=0; |
michael@0 | 170 | } |
michael@0 | 171 | /* |
michael@0 | 172 | * Reference counts for the null data block: all blocks except for the ASCII blocks. |
michael@0 | 173 | * Plus 1 so that we don't drop this block during compaction. |
michael@0 | 174 | * Plus as many as needed for lead surrogate code points. |
michael@0 | 175 | */ |
michael@0 | 176 | /* i==newTrie->dataNullOffset */ |
michael@0 | 177 | newTrie->map[i++]= |
michael@0 | 178 | (0x110000>>UTRIE2_SHIFT_2)- |
michael@0 | 179 | (0x80>>UTRIE2_SHIFT_2)+ |
michael@0 | 180 | 1+ |
michael@0 | 181 | UTRIE2_LSCP_INDEX_2_LENGTH; |
michael@0 | 182 | j+=UTRIE2_DATA_BLOCK_LENGTH; |
michael@0 | 183 | for(; j<UNEWTRIE2_DATA_START_OFFSET; ++i, j+=UTRIE2_DATA_BLOCK_LENGTH) { |
michael@0 | 184 | newTrie->map[i]=0; |
michael@0 | 185 | } |
michael@0 | 186 | |
michael@0 | 187 | /* |
michael@0 | 188 | * set the remaining indexes in the BMP index-2 block |
michael@0 | 189 | * to the null data block |
michael@0 | 190 | */ |
michael@0 | 191 | for(i=0x80>>UTRIE2_SHIFT_2; i<UTRIE2_INDEX_2_BMP_LENGTH; ++i) { |
michael@0 | 192 | newTrie->index2[i]=UNEWTRIE2_DATA_NULL_OFFSET; |
michael@0 | 193 | } |
michael@0 | 194 | |
michael@0 | 195 | /* |
michael@0 | 196 | * Fill the index gap with impossible values so that compaction |
michael@0 | 197 | * does not overlap other index-2 blocks with the gap. |
michael@0 | 198 | */ |
michael@0 | 199 | for(i=0; i<UNEWTRIE2_INDEX_GAP_LENGTH; ++i) { |
michael@0 | 200 | newTrie->index2[UNEWTRIE2_INDEX_GAP_OFFSET+i]=-1; |
michael@0 | 201 | } |
michael@0 | 202 | |
michael@0 | 203 | /* set the indexes in the null index-2 block */ |
michael@0 | 204 | for(i=0; i<UTRIE2_INDEX_2_BLOCK_LENGTH; ++i) { |
michael@0 | 205 | newTrie->index2[UNEWTRIE2_INDEX_2_NULL_OFFSET+i]=UNEWTRIE2_DATA_NULL_OFFSET; |
michael@0 | 206 | } |
michael@0 | 207 | newTrie->index2NullOffset=UNEWTRIE2_INDEX_2_NULL_OFFSET; |
michael@0 | 208 | newTrie->index2Length=UNEWTRIE2_INDEX_2_START_OFFSET; |
michael@0 | 209 | |
michael@0 | 210 | /* set the index-1 indexes for the linear index-2 block */ |
michael@0 | 211 | for(i=0, j=0; |
michael@0 | 212 | i<UTRIE2_OMITTED_BMP_INDEX_1_LENGTH; |
michael@0 | 213 | ++i, j+=UTRIE2_INDEX_2_BLOCK_LENGTH |
michael@0 | 214 | ) { |
michael@0 | 215 | newTrie->index1[i]=j; |
michael@0 | 216 | } |
michael@0 | 217 | |
michael@0 | 218 | /* set the remaining index-1 indexes to the null index-2 block */ |
michael@0 | 219 | for(; i<UNEWTRIE2_INDEX_1_LENGTH; ++i) { |
michael@0 | 220 | newTrie->index1[i]=UNEWTRIE2_INDEX_2_NULL_OFFSET; |
michael@0 | 221 | } |
michael@0 | 222 | |
michael@0 | 223 | /* |
michael@0 | 224 | * Preallocate and reset data for U+0080..U+07ff, |
michael@0 | 225 | * for 2-byte UTF-8 which will be compacted in 64-blocks |
michael@0 | 226 | * even if UTRIE2_DATA_BLOCK_LENGTH is smaller. |
michael@0 | 227 | */ |
michael@0 | 228 | for(i=0x80; i<0x800; i+=UTRIE2_DATA_BLOCK_LENGTH) { |
michael@0 | 229 | utrie2_set32(trie, i, initialValue, pErrorCode); |
michael@0 | 230 | } |
michael@0 | 231 | |
michael@0 | 232 | return trie; |
michael@0 | 233 | } |
michael@0 | 234 | |
michael@0 | 235 | static UNewTrie2 * |
michael@0 | 236 | cloneBuilder(const UNewTrie2 *other) { |
michael@0 | 237 | UNewTrie2 *trie; |
michael@0 | 238 | |
michael@0 | 239 | trie=(UNewTrie2 *)uprv_malloc(sizeof(UNewTrie2)); |
michael@0 | 240 | if(trie==NULL) { |
michael@0 | 241 | return NULL; |
michael@0 | 242 | } |
michael@0 | 243 | |
michael@0 | 244 | trie->data=(uint32_t *)uprv_malloc(other->dataCapacity*4); |
michael@0 | 245 | if(trie->data==NULL) { |
michael@0 | 246 | uprv_free(trie); |
michael@0 | 247 | return NULL; |
michael@0 | 248 | } |
michael@0 | 249 | trie->dataCapacity=other->dataCapacity; |
michael@0 | 250 | |
michael@0 | 251 | /* clone data */ |
michael@0 | 252 | uprv_memcpy(trie->index1, other->index1, sizeof(trie->index1)); |
michael@0 | 253 | uprv_memcpy(trie->index2, other->index2, other->index2Length*4); |
michael@0 | 254 | trie->index2NullOffset=other->index2NullOffset; |
michael@0 | 255 | trie->index2Length=other->index2Length; |
michael@0 | 256 | |
michael@0 | 257 | uprv_memcpy(trie->data, other->data, other->dataLength*4); |
michael@0 | 258 | trie->dataNullOffset=other->dataNullOffset; |
michael@0 | 259 | trie->dataLength=other->dataLength; |
michael@0 | 260 | |
michael@0 | 261 | /* reference counters */ |
michael@0 | 262 | if(other->isCompacted) { |
michael@0 | 263 | trie->firstFreeBlock=0; |
michael@0 | 264 | } else { |
michael@0 | 265 | uprv_memcpy(trie->map, other->map, (other->dataLength>>UTRIE2_SHIFT_2)*4); |
michael@0 | 266 | trie->firstFreeBlock=other->firstFreeBlock; |
michael@0 | 267 | } |
michael@0 | 268 | |
michael@0 | 269 | trie->initialValue=other->initialValue; |
michael@0 | 270 | trie->errorValue=other->errorValue; |
michael@0 | 271 | trie->highStart=other->highStart; |
michael@0 | 272 | trie->isCompacted=other->isCompacted; |
michael@0 | 273 | |
michael@0 | 274 | return trie; |
michael@0 | 275 | } |
michael@0 | 276 | |
michael@0 | 277 | U_CAPI UTrie2 * U_EXPORT2 |
michael@0 | 278 | utrie2_clone(const UTrie2 *other, UErrorCode *pErrorCode) { |
michael@0 | 279 | UTrie2 *trie; |
michael@0 | 280 | |
michael@0 | 281 | if(U_FAILURE(*pErrorCode)) { |
michael@0 | 282 | return NULL; |
michael@0 | 283 | } |
michael@0 | 284 | if(other==NULL || (other->memory==NULL && other->newTrie==NULL)) { |
michael@0 | 285 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 286 | return NULL; |
michael@0 | 287 | } |
michael@0 | 288 | |
michael@0 | 289 | trie=(UTrie2 *)uprv_malloc(sizeof(UTrie2)); |
michael@0 | 290 | if(trie==NULL) { |
michael@0 | 291 | return NULL; |
michael@0 | 292 | } |
michael@0 | 293 | uprv_memcpy(trie, other, sizeof(UTrie2)); |
michael@0 | 294 | |
michael@0 | 295 | if(other->memory!=NULL) { |
michael@0 | 296 | trie->memory=uprv_malloc(other->length); |
michael@0 | 297 | if(trie->memory!=NULL) { |
michael@0 | 298 | trie->isMemoryOwned=TRUE; |
michael@0 | 299 | uprv_memcpy(trie->memory, other->memory, other->length); |
michael@0 | 300 | |
michael@0 | 301 | /* make the clone's pointers point to its own memory */ |
michael@0 | 302 | trie->index=(uint16_t *)trie->memory+(other->index-(uint16_t *)other->memory); |
michael@0 | 303 | if(other->data16!=NULL) { |
michael@0 | 304 | trie->data16=(uint16_t *)trie->memory+(other->data16-(uint16_t *)other->memory); |
michael@0 | 305 | } |
michael@0 | 306 | if(other->data32!=NULL) { |
michael@0 | 307 | trie->data32=(uint32_t *)trie->memory+(other->data32-(uint32_t *)other->memory); |
michael@0 | 308 | } |
michael@0 | 309 | } |
michael@0 | 310 | } else /* other->newTrie!=NULL */ { |
michael@0 | 311 | trie->newTrie=cloneBuilder(other->newTrie); |
michael@0 | 312 | } |
michael@0 | 313 | |
michael@0 | 314 | if(trie->memory==NULL && trie->newTrie==NULL) { |
michael@0 | 315 | uprv_free(trie); |
michael@0 | 316 | trie=NULL; |
michael@0 | 317 | } |
michael@0 | 318 | return trie; |
michael@0 | 319 | } |
michael@0 | 320 | |
michael@0 | 321 | typedef struct NewTrieAndStatus { |
michael@0 | 322 | UTrie2 *trie; |
michael@0 | 323 | UErrorCode errorCode; |
michael@0 | 324 | UBool exclusiveLimit; /* rather than inclusive range end */ |
michael@0 | 325 | } NewTrieAndStatus; |
michael@0 | 326 | |
michael@0 | 327 | static UBool U_CALLCONV |
michael@0 | 328 | copyEnumRange(const void *context, UChar32 start, UChar32 end, uint32_t value) { |
michael@0 | 329 | NewTrieAndStatus *nt=(NewTrieAndStatus *)context; |
michael@0 | 330 | if(value!=nt->trie->initialValue) { |
michael@0 | 331 | if(nt->exclusiveLimit) { |
michael@0 | 332 | --end; |
michael@0 | 333 | } |
michael@0 | 334 | if(start==end) { |
michael@0 | 335 | utrie2_set32(nt->trie, start, value, &nt->errorCode); |
michael@0 | 336 | } else { |
michael@0 | 337 | utrie2_setRange32(nt->trie, start, end, value, TRUE, &nt->errorCode); |
michael@0 | 338 | } |
michael@0 | 339 | return U_SUCCESS(nt->errorCode); |
michael@0 | 340 | } else { |
michael@0 | 341 | return TRUE; |
michael@0 | 342 | } |
michael@0 | 343 | } |
michael@0 | 344 | |
michael@0 | 345 | #ifdef UTRIE2_DEBUG |
michael@0 | 346 | static void |
michael@0 | 347 | utrie_printLengths(const UTrie *trie) { |
michael@0 | 348 | long indexLength=trie->indexLength; |
michael@0 | 349 | long dataLength=(long)trie->dataLength; |
michael@0 | 350 | long totalLength=(long)sizeof(UTrieHeader)+indexLength*2+dataLength*(trie->data32!=NULL ? 4 : 2); |
michael@0 | 351 | printf("**UTrieLengths** index:%6ld data:%6ld serialized:%6ld\n", |
michael@0 | 352 | indexLength, dataLength, totalLength); |
michael@0 | 353 | } |
michael@0 | 354 | |
michael@0 | 355 | static void |
michael@0 | 356 | utrie2_printLengths(const UTrie2 *trie, const char *which) { |
michael@0 | 357 | long indexLength=trie->indexLength; |
michael@0 | 358 | long dataLength=(long)trie->dataLength; |
michael@0 | 359 | long totalLength=(long)sizeof(UTrie2Header)+indexLength*2+dataLength*(trie->data32!=NULL ? 4 : 2); |
michael@0 | 360 | printf("**UTrie2Lengths(%s)** index:%6ld data:%6ld serialized:%6ld\n", |
michael@0 | 361 | which, indexLength, dataLength, totalLength); |
michael@0 | 362 | } |
michael@0 | 363 | #endif |
michael@0 | 364 | |
michael@0 | 365 | U_CAPI UTrie2 * U_EXPORT2 |
michael@0 | 366 | utrie2_cloneAsThawed(const UTrie2 *other, UErrorCode *pErrorCode) { |
michael@0 | 367 | NewTrieAndStatus context; |
michael@0 | 368 | UChar lead; |
michael@0 | 369 | |
michael@0 | 370 | if(U_FAILURE(*pErrorCode)) { |
michael@0 | 371 | return NULL; |
michael@0 | 372 | } |
michael@0 | 373 | if(other==NULL || (other->memory==NULL && other->newTrie==NULL)) { |
michael@0 | 374 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 375 | return NULL; |
michael@0 | 376 | } |
michael@0 | 377 | if(other->newTrie!=NULL && !other->newTrie->isCompacted) { |
michael@0 | 378 | return utrie2_clone(other, pErrorCode); /* clone an unfrozen trie */ |
michael@0 | 379 | } |
michael@0 | 380 | |
michael@0 | 381 | /* Clone the frozen trie by enumerating it and building a new one. */ |
michael@0 | 382 | context.trie=utrie2_open(other->initialValue, other->errorValue, pErrorCode); |
michael@0 | 383 | if(U_FAILURE(*pErrorCode)) { |
michael@0 | 384 | return NULL; |
michael@0 | 385 | } |
michael@0 | 386 | context.exclusiveLimit=FALSE; |
michael@0 | 387 | context.errorCode=*pErrorCode; |
michael@0 | 388 | utrie2_enum(other, NULL, copyEnumRange, &context); |
michael@0 | 389 | *pErrorCode=context.errorCode; |
michael@0 | 390 | for(lead=0xd800; lead<0xdc00; ++lead) { |
michael@0 | 391 | uint32_t value; |
michael@0 | 392 | if(other->data32==NULL) { |
michael@0 | 393 | value=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(other, lead); |
michael@0 | 394 | } else { |
michael@0 | 395 | value=UTRIE2_GET32_FROM_U16_SINGLE_LEAD(other, lead); |
michael@0 | 396 | } |
michael@0 | 397 | if(value!=other->initialValue) { |
michael@0 | 398 | utrie2_set32ForLeadSurrogateCodeUnit(context.trie, lead, value, pErrorCode); |
michael@0 | 399 | } |
michael@0 | 400 | } |
michael@0 | 401 | if(U_FAILURE(*pErrorCode)) { |
michael@0 | 402 | utrie2_close(context.trie); |
michael@0 | 403 | context.trie=NULL; |
michael@0 | 404 | } |
michael@0 | 405 | return context.trie; |
michael@0 | 406 | } |
michael@0 | 407 | |
michael@0 | 408 | /* Almost the same as utrie2_cloneAsThawed() but copies a UTrie and freezes the clone. */ |
michael@0 | 409 | U_CAPI UTrie2 * U_EXPORT2 |
michael@0 | 410 | utrie2_fromUTrie(const UTrie *trie1, uint32_t errorValue, UErrorCode *pErrorCode) { |
michael@0 | 411 | NewTrieAndStatus context; |
michael@0 | 412 | UChar lead; |
michael@0 | 413 | |
michael@0 | 414 | if(U_FAILURE(*pErrorCode)) { |
michael@0 | 415 | return NULL; |
michael@0 | 416 | } |
michael@0 | 417 | if(trie1==NULL) { |
michael@0 | 418 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 419 | return NULL; |
michael@0 | 420 | } |
michael@0 | 421 | context.trie=utrie2_open(trie1->initialValue, errorValue, pErrorCode); |
michael@0 | 422 | if(U_FAILURE(*pErrorCode)) { |
michael@0 | 423 | return NULL; |
michael@0 | 424 | } |
michael@0 | 425 | context.exclusiveLimit=TRUE; |
michael@0 | 426 | context.errorCode=*pErrorCode; |
michael@0 | 427 | utrie_enum(trie1, NULL, copyEnumRange, &context); |
michael@0 | 428 | *pErrorCode=context.errorCode; |
michael@0 | 429 | for(lead=0xd800; lead<0xdc00; ++lead) { |
michael@0 | 430 | uint32_t value; |
michael@0 | 431 | if(trie1->data32==NULL) { |
michael@0 | 432 | value=UTRIE_GET16_FROM_LEAD(trie1, lead); |
michael@0 | 433 | } else { |
michael@0 | 434 | value=UTRIE_GET32_FROM_LEAD(trie1, lead); |
michael@0 | 435 | } |
michael@0 | 436 | if(value!=trie1->initialValue) { |
michael@0 | 437 | utrie2_set32ForLeadSurrogateCodeUnit(context.trie, lead, value, pErrorCode); |
michael@0 | 438 | } |
michael@0 | 439 | } |
michael@0 | 440 | if(U_SUCCESS(*pErrorCode)) { |
michael@0 | 441 | utrie2_freeze(context.trie, |
michael@0 | 442 | trie1->data32!=NULL ? UTRIE2_32_VALUE_BITS : UTRIE2_16_VALUE_BITS, |
michael@0 | 443 | pErrorCode); |
michael@0 | 444 | } |
michael@0 | 445 | #ifdef UTRIE2_DEBUG |
michael@0 | 446 | if(U_SUCCESS(*pErrorCode)) { |
michael@0 | 447 | utrie_printLengths(trie1); |
michael@0 | 448 | utrie2_printLengths(context.trie, "fromUTrie"); |
michael@0 | 449 | } |
michael@0 | 450 | #endif |
michael@0 | 451 | if(U_FAILURE(*pErrorCode)) { |
michael@0 | 452 | utrie2_close(context.trie); |
michael@0 | 453 | context.trie=NULL; |
michael@0 | 454 | } |
michael@0 | 455 | return context.trie; |
michael@0 | 456 | } |
michael@0 | 457 | |
michael@0 | 458 | static inline UBool |
michael@0 | 459 | isInNullBlock(UNewTrie2 *trie, UChar32 c, UBool forLSCP) { |
michael@0 | 460 | int32_t i2, block; |
michael@0 | 461 | |
michael@0 | 462 | if(U_IS_LEAD(c) && forLSCP) { |
michael@0 | 463 | i2=(UTRIE2_LSCP_INDEX_2_OFFSET-(0xd800>>UTRIE2_SHIFT_2))+ |
michael@0 | 464 | (c>>UTRIE2_SHIFT_2); |
michael@0 | 465 | } else { |
michael@0 | 466 | i2=trie->index1[c>>UTRIE2_SHIFT_1]+ |
michael@0 | 467 | ((c>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK); |
michael@0 | 468 | } |
michael@0 | 469 | block=trie->index2[i2]; |
michael@0 | 470 | return (UBool)(block==trie->dataNullOffset); |
michael@0 | 471 | } |
michael@0 | 472 | |
michael@0 | 473 | static int32_t |
michael@0 | 474 | allocIndex2Block(UNewTrie2 *trie) { |
michael@0 | 475 | int32_t newBlock, newTop; |
michael@0 | 476 | |
michael@0 | 477 | newBlock=trie->index2Length; |
michael@0 | 478 | newTop=newBlock+UTRIE2_INDEX_2_BLOCK_LENGTH; |
michael@0 | 479 | if(newTop>LENGTHOF(trie->index2)) { |
michael@0 | 480 | /* |
michael@0 | 481 | * Should never occur. |
michael@0 | 482 | * Either UTRIE2_MAX_BUILD_TIME_INDEX_LENGTH is incorrect, |
michael@0 | 483 | * or the code writes more values than should be possible. |
michael@0 | 484 | */ |
michael@0 | 485 | return -1; |
michael@0 | 486 | } |
michael@0 | 487 | trie->index2Length=newTop; |
michael@0 | 488 | uprv_memcpy(trie->index2+newBlock, trie->index2+trie->index2NullOffset, UTRIE2_INDEX_2_BLOCK_LENGTH*4); |
michael@0 | 489 | return newBlock; |
michael@0 | 490 | } |
michael@0 | 491 | |
michael@0 | 492 | static int32_t |
michael@0 | 493 | getIndex2Block(UNewTrie2 *trie, UChar32 c, UBool forLSCP) { |
michael@0 | 494 | int32_t i1, i2; |
michael@0 | 495 | |
michael@0 | 496 | if(U_IS_LEAD(c) && forLSCP) { |
michael@0 | 497 | return UTRIE2_LSCP_INDEX_2_OFFSET; |
michael@0 | 498 | } |
michael@0 | 499 | |
michael@0 | 500 | i1=c>>UTRIE2_SHIFT_1; |
michael@0 | 501 | i2=trie->index1[i1]; |
michael@0 | 502 | if(i2==trie->index2NullOffset) { |
michael@0 | 503 | i2=allocIndex2Block(trie); |
michael@0 | 504 | if(i2<0) { |
michael@0 | 505 | return -1; /* program error */ |
michael@0 | 506 | } |
michael@0 | 507 | trie->index1[i1]=i2; |
michael@0 | 508 | } |
michael@0 | 509 | return i2; |
michael@0 | 510 | } |
michael@0 | 511 | |
michael@0 | 512 | static int32_t |
michael@0 | 513 | allocDataBlock(UNewTrie2 *trie, int32_t copyBlock) { |
michael@0 | 514 | int32_t newBlock, newTop; |
michael@0 | 515 | |
michael@0 | 516 | if(trie->firstFreeBlock!=0) { |
michael@0 | 517 | /* get the first free block */ |
michael@0 | 518 | newBlock=trie->firstFreeBlock; |
michael@0 | 519 | trie->firstFreeBlock=-trie->map[newBlock>>UTRIE2_SHIFT_2]; |
michael@0 | 520 | } else { |
michael@0 | 521 | /* get a new block from the high end */ |
michael@0 | 522 | newBlock=trie->dataLength; |
michael@0 | 523 | newTop=newBlock+UTRIE2_DATA_BLOCK_LENGTH; |
michael@0 | 524 | if(newTop>trie->dataCapacity) { |
michael@0 | 525 | /* out of memory in the data array */ |
michael@0 | 526 | int32_t capacity; |
michael@0 | 527 | uint32_t *data; |
michael@0 | 528 | |
michael@0 | 529 | if(trie->dataCapacity<UNEWTRIE2_MEDIUM_DATA_LENGTH) { |
michael@0 | 530 | capacity=UNEWTRIE2_MEDIUM_DATA_LENGTH; |
michael@0 | 531 | } else if(trie->dataCapacity<UNEWTRIE2_MAX_DATA_LENGTH) { |
michael@0 | 532 | capacity=UNEWTRIE2_MAX_DATA_LENGTH; |
michael@0 | 533 | } else { |
michael@0 | 534 | /* |
michael@0 | 535 | * Should never occur. |
michael@0 | 536 | * Either UNEWTRIE2_MAX_DATA_LENGTH is incorrect, |
michael@0 | 537 | * or the code writes more values than should be possible. |
michael@0 | 538 | */ |
michael@0 | 539 | return -1; |
michael@0 | 540 | } |
michael@0 | 541 | data=(uint32_t *)uprv_malloc(capacity*4); |
michael@0 | 542 | if(data==NULL) { |
michael@0 | 543 | return -1; |
michael@0 | 544 | } |
michael@0 | 545 | uprv_memcpy(data, trie->data, trie->dataLength*4); |
michael@0 | 546 | uprv_free(trie->data); |
michael@0 | 547 | trie->data=data; |
michael@0 | 548 | trie->dataCapacity=capacity; |
michael@0 | 549 | } |
michael@0 | 550 | trie->dataLength=newTop; |
michael@0 | 551 | } |
michael@0 | 552 | uprv_memcpy(trie->data+newBlock, trie->data+copyBlock, UTRIE2_DATA_BLOCK_LENGTH*4); |
michael@0 | 553 | trie->map[newBlock>>UTRIE2_SHIFT_2]=0; |
michael@0 | 554 | return newBlock; |
michael@0 | 555 | } |
michael@0 | 556 | |
michael@0 | 557 | /* call when the block's reference counter reaches 0 */ |
michael@0 | 558 | static void |
michael@0 | 559 | releaseDataBlock(UNewTrie2 *trie, int32_t block) { |
michael@0 | 560 | /* put this block at the front of the free-block chain */ |
michael@0 | 561 | trie->map[block>>UTRIE2_SHIFT_2]=-trie->firstFreeBlock; |
michael@0 | 562 | trie->firstFreeBlock=block; |
michael@0 | 563 | } |
michael@0 | 564 | |
michael@0 | 565 | static inline UBool |
michael@0 | 566 | isWritableBlock(UNewTrie2 *trie, int32_t block) { |
michael@0 | 567 | return (UBool)(block!=trie->dataNullOffset && 1==trie->map[block>>UTRIE2_SHIFT_2]); |
michael@0 | 568 | } |
michael@0 | 569 | |
michael@0 | 570 | static inline void |
michael@0 | 571 | setIndex2Entry(UNewTrie2 *trie, int32_t i2, int32_t block) { |
michael@0 | 572 | int32_t oldBlock; |
michael@0 | 573 | ++trie->map[block>>UTRIE2_SHIFT_2]; /* increment first, in case block==oldBlock! */ |
michael@0 | 574 | oldBlock=trie->index2[i2]; |
michael@0 | 575 | if(0 == --trie->map[oldBlock>>UTRIE2_SHIFT_2]) { |
michael@0 | 576 | releaseDataBlock(trie, oldBlock); |
michael@0 | 577 | } |
michael@0 | 578 | trie->index2[i2]=block; |
michael@0 | 579 | } |
michael@0 | 580 | |
michael@0 | 581 | /** |
michael@0 | 582 | * No error checking for illegal arguments. |
michael@0 | 583 | * |
michael@0 | 584 | * @return -1 if no new data block available (out of memory in data array) |
michael@0 | 585 | * @internal |
michael@0 | 586 | */ |
michael@0 | 587 | static int32_t |
michael@0 | 588 | getDataBlock(UNewTrie2 *trie, UChar32 c, UBool forLSCP) { |
michael@0 | 589 | int32_t i2, oldBlock, newBlock; |
michael@0 | 590 | |
michael@0 | 591 | i2=getIndex2Block(trie, c, forLSCP); |
michael@0 | 592 | if(i2<0) { |
michael@0 | 593 | return -1; /* program error */ |
michael@0 | 594 | } |
michael@0 | 595 | |
michael@0 | 596 | i2+=(c>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK; |
michael@0 | 597 | oldBlock=trie->index2[i2]; |
michael@0 | 598 | if(isWritableBlock(trie, oldBlock)) { |
michael@0 | 599 | return oldBlock; |
michael@0 | 600 | } |
michael@0 | 601 | |
michael@0 | 602 | /* allocate a new data block */ |
michael@0 | 603 | newBlock=allocDataBlock(trie, oldBlock); |
michael@0 | 604 | if(newBlock<0) { |
michael@0 | 605 | /* out of memory in the data array */ |
michael@0 | 606 | return -1; |
michael@0 | 607 | } |
michael@0 | 608 | setIndex2Entry(trie, i2, newBlock); |
michael@0 | 609 | return newBlock; |
michael@0 | 610 | } |
michael@0 | 611 | |
michael@0 | 612 | /** |
michael@0 | 613 | * @return TRUE if the value was successfully set |
michael@0 | 614 | */ |
michael@0 | 615 | static void |
michael@0 | 616 | set32(UNewTrie2 *trie, |
michael@0 | 617 | UChar32 c, UBool forLSCP, uint32_t value, |
michael@0 | 618 | UErrorCode *pErrorCode) { |
michael@0 | 619 | int32_t block; |
michael@0 | 620 | |
michael@0 | 621 | if(trie==NULL || trie->isCompacted) { |
michael@0 | 622 | *pErrorCode=U_NO_WRITE_PERMISSION; |
michael@0 | 623 | return; |
michael@0 | 624 | } |
michael@0 | 625 | |
michael@0 | 626 | block=getDataBlock(trie, c, forLSCP); |
michael@0 | 627 | if(block<0) { |
michael@0 | 628 | *pErrorCode=U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 629 | return; |
michael@0 | 630 | } |
michael@0 | 631 | |
michael@0 | 632 | trie->data[block+(c&UTRIE2_DATA_MASK)]=value; |
michael@0 | 633 | } |
michael@0 | 634 | |
michael@0 | 635 | U_CAPI void U_EXPORT2 |
michael@0 | 636 | utrie2_set32(UTrie2 *trie, UChar32 c, uint32_t value, UErrorCode *pErrorCode) { |
michael@0 | 637 | if(U_FAILURE(*pErrorCode)) { |
michael@0 | 638 | return; |
michael@0 | 639 | } |
michael@0 | 640 | if((uint32_t)c>0x10ffff) { |
michael@0 | 641 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 642 | return; |
michael@0 | 643 | } |
michael@0 | 644 | set32(trie->newTrie, c, TRUE, value, pErrorCode); |
michael@0 | 645 | } |
michael@0 | 646 | |
michael@0 | 647 | U_CAPI void U_EXPORT2 |
michael@0 | 648 | utrie2_set32ForLeadSurrogateCodeUnit(UTrie2 *trie, |
michael@0 | 649 | UChar32 c, uint32_t value, |
michael@0 | 650 | UErrorCode *pErrorCode) { |
michael@0 | 651 | if(U_FAILURE(*pErrorCode)) { |
michael@0 | 652 | return; |
michael@0 | 653 | } |
michael@0 | 654 | if(!U_IS_LEAD(c)) { |
michael@0 | 655 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 656 | return; |
michael@0 | 657 | } |
michael@0 | 658 | set32(trie->newTrie, c, FALSE, value, pErrorCode); |
michael@0 | 659 | } |
michael@0 | 660 | |
michael@0 | 661 | static void |
michael@0 | 662 | writeBlock(uint32_t *block, uint32_t value) { |
michael@0 | 663 | uint32_t *limit=block+UTRIE2_DATA_BLOCK_LENGTH; |
michael@0 | 664 | while(block<limit) { |
michael@0 | 665 | *block++=value; |
michael@0 | 666 | } |
michael@0 | 667 | } |
michael@0 | 668 | |
michael@0 | 669 | /** |
michael@0 | 670 | * initialValue is ignored if overwrite=TRUE |
michael@0 | 671 | * @internal |
michael@0 | 672 | */ |
michael@0 | 673 | static void |
michael@0 | 674 | fillBlock(uint32_t *block, UChar32 start, UChar32 limit, |
michael@0 | 675 | uint32_t value, uint32_t initialValue, UBool overwrite) { |
michael@0 | 676 | uint32_t *pLimit; |
michael@0 | 677 | |
michael@0 | 678 | pLimit=block+limit; |
michael@0 | 679 | block+=start; |
michael@0 | 680 | if(overwrite) { |
michael@0 | 681 | while(block<pLimit) { |
michael@0 | 682 | *block++=value; |
michael@0 | 683 | } |
michael@0 | 684 | } else { |
michael@0 | 685 | while(block<pLimit) { |
michael@0 | 686 | if(*block==initialValue) { |
michael@0 | 687 | *block=value; |
michael@0 | 688 | } |
michael@0 | 689 | ++block; |
michael@0 | 690 | } |
michael@0 | 691 | } |
michael@0 | 692 | } |
michael@0 | 693 | |
michael@0 | 694 | U_CAPI void U_EXPORT2 |
michael@0 | 695 | utrie2_setRange32(UTrie2 *trie, |
michael@0 | 696 | UChar32 start, UChar32 end, |
michael@0 | 697 | uint32_t value, UBool overwrite, |
michael@0 | 698 | UErrorCode *pErrorCode) { |
michael@0 | 699 | /* |
michael@0 | 700 | * repeat value in [start..end] |
michael@0 | 701 | * mark index values for repeat-data blocks by setting bit 31 of the index values |
michael@0 | 702 | * fill around existing values if any, if(overwrite) |
michael@0 | 703 | */ |
michael@0 | 704 | UNewTrie2 *newTrie; |
michael@0 | 705 | int32_t block, rest, repeatBlock; |
michael@0 | 706 | UChar32 limit; |
michael@0 | 707 | |
michael@0 | 708 | if(U_FAILURE(*pErrorCode)) { |
michael@0 | 709 | return; |
michael@0 | 710 | } |
michael@0 | 711 | if((uint32_t)start>0x10ffff || (uint32_t)end>0x10ffff || start>end) { |
michael@0 | 712 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 713 | return; |
michael@0 | 714 | } |
michael@0 | 715 | newTrie=trie->newTrie; |
michael@0 | 716 | if(newTrie==NULL || newTrie->isCompacted) { |
michael@0 | 717 | *pErrorCode=U_NO_WRITE_PERMISSION; |
michael@0 | 718 | return; |
michael@0 | 719 | } |
michael@0 | 720 | if(!overwrite && value==newTrie->initialValue) { |
michael@0 | 721 | return; /* nothing to do */ |
michael@0 | 722 | } |
michael@0 | 723 | |
michael@0 | 724 | limit=end+1; |
michael@0 | 725 | if(start&UTRIE2_DATA_MASK) { |
michael@0 | 726 | UChar32 nextStart; |
michael@0 | 727 | |
michael@0 | 728 | /* set partial block at [start..following block boundary[ */ |
michael@0 | 729 | block=getDataBlock(newTrie, start, TRUE); |
michael@0 | 730 | if(block<0) { |
michael@0 | 731 | *pErrorCode=U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 732 | return; |
michael@0 | 733 | } |
michael@0 | 734 | |
michael@0 | 735 | nextStart=(start+UTRIE2_DATA_BLOCK_LENGTH)&~UTRIE2_DATA_MASK; |
michael@0 | 736 | if(nextStart<=limit) { |
michael@0 | 737 | fillBlock(newTrie->data+block, start&UTRIE2_DATA_MASK, UTRIE2_DATA_BLOCK_LENGTH, |
michael@0 | 738 | value, newTrie->initialValue, overwrite); |
michael@0 | 739 | start=nextStart; |
michael@0 | 740 | } else { |
michael@0 | 741 | fillBlock(newTrie->data+block, start&UTRIE2_DATA_MASK, limit&UTRIE2_DATA_MASK, |
michael@0 | 742 | value, newTrie->initialValue, overwrite); |
michael@0 | 743 | return; |
michael@0 | 744 | } |
michael@0 | 745 | } |
michael@0 | 746 | |
michael@0 | 747 | /* number of positions in the last, partial block */ |
michael@0 | 748 | rest=limit&UTRIE2_DATA_MASK; |
michael@0 | 749 | |
michael@0 | 750 | /* round down limit to a block boundary */ |
michael@0 | 751 | limit&=~UTRIE2_DATA_MASK; |
michael@0 | 752 | |
michael@0 | 753 | /* iterate over all-value blocks */ |
michael@0 | 754 | if(value==newTrie->initialValue) { |
michael@0 | 755 | repeatBlock=newTrie->dataNullOffset; |
michael@0 | 756 | } else { |
michael@0 | 757 | repeatBlock=-1; |
michael@0 | 758 | } |
michael@0 | 759 | |
michael@0 | 760 | while(start<limit) { |
michael@0 | 761 | int32_t i2; |
michael@0 | 762 | UBool setRepeatBlock=FALSE; |
michael@0 | 763 | |
michael@0 | 764 | if(value==newTrie->initialValue && isInNullBlock(newTrie, start, TRUE)) { |
michael@0 | 765 | start+=UTRIE2_DATA_BLOCK_LENGTH; /* nothing to do */ |
michael@0 | 766 | continue; |
michael@0 | 767 | } |
michael@0 | 768 | |
michael@0 | 769 | /* get index value */ |
michael@0 | 770 | i2=getIndex2Block(newTrie, start, TRUE); |
michael@0 | 771 | if(i2<0) { |
michael@0 | 772 | *pErrorCode=U_INTERNAL_PROGRAM_ERROR; |
michael@0 | 773 | return; |
michael@0 | 774 | } |
michael@0 | 775 | i2+=(start>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK; |
michael@0 | 776 | block=newTrie->index2[i2]; |
michael@0 | 777 | if(isWritableBlock(newTrie, block)) { |
michael@0 | 778 | /* already allocated */ |
michael@0 | 779 | if(overwrite && block>=UNEWTRIE2_DATA_0800_OFFSET) { |
michael@0 | 780 | /* |
michael@0 | 781 | * We overwrite all values, and it's not a |
michael@0 | 782 | * protected (ASCII-linear or 2-byte UTF-8) block: |
michael@0 | 783 | * replace with the repeatBlock. |
michael@0 | 784 | */ |
michael@0 | 785 | setRepeatBlock=TRUE; |
michael@0 | 786 | } else { |
michael@0 | 787 | /* !overwrite, or protected block: just write the values into this block */ |
michael@0 | 788 | fillBlock(newTrie->data+block, |
michael@0 | 789 | 0, UTRIE2_DATA_BLOCK_LENGTH, |
michael@0 | 790 | value, newTrie->initialValue, overwrite); |
michael@0 | 791 | } |
michael@0 | 792 | } else if(newTrie->data[block]!=value && (overwrite || block==newTrie->dataNullOffset)) { |
michael@0 | 793 | /* |
michael@0 | 794 | * Set the repeatBlock instead of the null block or previous repeat block: |
michael@0 | 795 | * |
michael@0 | 796 | * If !isWritableBlock() then all entries in the block have the same value |
michael@0 | 797 | * because it's the null block or a range block (the repeatBlock from a previous |
michael@0 | 798 | * call to utrie2_setRange32()). |
michael@0 | 799 | * No other blocks are used multiple times before compacting. |
michael@0 | 800 | * |
michael@0 | 801 | * The null block is the only non-writable block with the initialValue because |
michael@0 | 802 | * of the repeatBlock initialization above. (If value==initialValue, then |
michael@0 | 803 | * the repeatBlock will be the null data block.) |
michael@0 | 804 | * |
michael@0 | 805 | * We set our repeatBlock if the desired value differs from the block's value, |
michael@0 | 806 | * and if we overwrite any data or if the data is all initial values |
michael@0 | 807 | * (which is the same as the block being the null block, see above). |
michael@0 | 808 | */ |
michael@0 | 809 | setRepeatBlock=TRUE; |
michael@0 | 810 | } |
michael@0 | 811 | if(setRepeatBlock) { |
michael@0 | 812 | if(repeatBlock>=0) { |
michael@0 | 813 | setIndex2Entry(newTrie, i2, repeatBlock); |
michael@0 | 814 | } else { |
michael@0 | 815 | /* create and set and fill the repeatBlock */ |
michael@0 | 816 | repeatBlock=getDataBlock(newTrie, start, TRUE); |
michael@0 | 817 | if(repeatBlock<0) { |
michael@0 | 818 | *pErrorCode=U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 819 | return; |
michael@0 | 820 | } |
michael@0 | 821 | writeBlock(newTrie->data+repeatBlock, value); |
michael@0 | 822 | } |
michael@0 | 823 | } |
michael@0 | 824 | |
michael@0 | 825 | start+=UTRIE2_DATA_BLOCK_LENGTH; |
michael@0 | 826 | } |
michael@0 | 827 | |
michael@0 | 828 | if(rest>0) { |
michael@0 | 829 | /* set partial block at [last block boundary..limit[ */ |
michael@0 | 830 | block=getDataBlock(newTrie, start, TRUE); |
michael@0 | 831 | if(block<0) { |
michael@0 | 832 | *pErrorCode=U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 833 | return; |
michael@0 | 834 | } |
michael@0 | 835 | |
michael@0 | 836 | fillBlock(newTrie->data+block, 0, rest, value, newTrie->initialValue, overwrite); |
michael@0 | 837 | } |
michael@0 | 838 | |
michael@0 | 839 | return; |
michael@0 | 840 | } |
michael@0 | 841 | |
michael@0 | 842 | /* compaction --------------------------------------------------------------- */ |
michael@0 | 843 | |
michael@0 | 844 | static inline UBool |
michael@0 | 845 | equal_int32(const int32_t *s, const int32_t *t, int32_t length) { |
michael@0 | 846 | while(length>0 && *s==*t) { |
michael@0 | 847 | ++s; |
michael@0 | 848 | ++t; |
michael@0 | 849 | --length; |
michael@0 | 850 | } |
michael@0 | 851 | return (UBool)(length==0); |
michael@0 | 852 | } |
michael@0 | 853 | |
michael@0 | 854 | static inline UBool |
michael@0 | 855 | equal_uint32(const uint32_t *s, const uint32_t *t, int32_t length) { |
michael@0 | 856 | while(length>0 && *s==*t) { |
michael@0 | 857 | ++s; |
michael@0 | 858 | ++t; |
michael@0 | 859 | --length; |
michael@0 | 860 | } |
michael@0 | 861 | return (UBool)(length==0); |
michael@0 | 862 | } |
michael@0 | 863 | |
michael@0 | 864 | static int32_t |
michael@0 | 865 | findSameIndex2Block(const int32_t *idx, int32_t index2Length, int32_t otherBlock) { |
michael@0 | 866 | int32_t block; |
michael@0 | 867 | |
michael@0 | 868 | /* ensure that we do not even partially get past index2Length */ |
michael@0 | 869 | index2Length-=UTRIE2_INDEX_2_BLOCK_LENGTH; |
michael@0 | 870 | |
michael@0 | 871 | for(block=0; block<=index2Length; ++block) { |
michael@0 | 872 | if(equal_int32(idx+block, idx+otherBlock, UTRIE2_INDEX_2_BLOCK_LENGTH)) { |
michael@0 | 873 | return block; |
michael@0 | 874 | } |
michael@0 | 875 | } |
michael@0 | 876 | return -1; |
michael@0 | 877 | } |
michael@0 | 878 | |
michael@0 | 879 | static int32_t |
michael@0 | 880 | findSameDataBlock(const uint32_t *data, int32_t dataLength, int32_t otherBlock, int32_t blockLength) { |
michael@0 | 881 | int32_t block; |
michael@0 | 882 | |
michael@0 | 883 | /* ensure that we do not even partially get past dataLength */ |
michael@0 | 884 | dataLength-=blockLength; |
michael@0 | 885 | |
michael@0 | 886 | for(block=0; block<=dataLength; block+=UTRIE2_DATA_GRANULARITY) { |
michael@0 | 887 | if(equal_uint32(data+block, data+otherBlock, blockLength)) { |
michael@0 | 888 | return block; |
michael@0 | 889 | } |
michael@0 | 890 | } |
michael@0 | 891 | return -1; |
michael@0 | 892 | } |
michael@0 | 893 | |
michael@0 | 894 | /* |
michael@0 | 895 | * Find the start of the last range in the trie by enumerating backward. |
michael@0 | 896 | * Indexes for supplementary code points higher than this will be omitted. |
michael@0 | 897 | */ |
michael@0 | 898 | static UChar32 |
michael@0 | 899 | findHighStart(UNewTrie2 *trie, uint32_t highValue) { |
michael@0 | 900 | const uint32_t *data32; |
michael@0 | 901 | |
michael@0 | 902 | uint32_t value, initialValue; |
michael@0 | 903 | UChar32 c, prev; |
michael@0 | 904 | int32_t i1, i2, j, i2Block, prevI2Block, index2NullOffset, block, prevBlock, nullBlock; |
michael@0 | 905 | |
michael@0 | 906 | data32=trie->data; |
michael@0 | 907 | initialValue=trie->initialValue; |
michael@0 | 908 | |
michael@0 | 909 | index2NullOffset=trie->index2NullOffset; |
michael@0 | 910 | nullBlock=trie->dataNullOffset; |
michael@0 | 911 | |
michael@0 | 912 | /* set variables for previous range */ |
michael@0 | 913 | if(highValue==initialValue) { |
michael@0 | 914 | prevI2Block=index2NullOffset; |
michael@0 | 915 | prevBlock=nullBlock; |
michael@0 | 916 | } else { |
michael@0 | 917 | prevI2Block=-1; |
michael@0 | 918 | prevBlock=-1; |
michael@0 | 919 | } |
michael@0 | 920 | prev=0x110000; |
michael@0 | 921 | |
michael@0 | 922 | /* enumerate index-2 blocks */ |
michael@0 | 923 | i1=UNEWTRIE2_INDEX_1_LENGTH; |
michael@0 | 924 | c=prev; |
michael@0 | 925 | while(c>0) { |
michael@0 | 926 | i2Block=trie->index1[--i1]; |
michael@0 | 927 | if(i2Block==prevI2Block) { |
michael@0 | 928 | /* the index-2 block is the same as the previous one, and filled with highValue */ |
michael@0 | 929 | c-=UTRIE2_CP_PER_INDEX_1_ENTRY; |
michael@0 | 930 | continue; |
michael@0 | 931 | } |
michael@0 | 932 | prevI2Block=i2Block; |
michael@0 | 933 | if(i2Block==index2NullOffset) { |
michael@0 | 934 | /* this is the null index-2 block */ |
michael@0 | 935 | if(highValue!=initialValue) { |
michael@0 | 936 | return c; |
michael@0 | 937 | } |
michael@0 | 938 | c-=UTRIE2_CP_PER_INDEX_1_ENTRY; |
michael@0 | 939 | } else { |
michael@0 | 940 | /* enumerate data blocks for one index-2 block */ |
michael@0 | 941 | for(i2=UTRIE2_INDEX_2_BLOCK_LENGTH; i2>0;) { |
michael@0 | 942 | block=trie->index2[i2Block+ --i2]; |
michael@0 | 943 | if(block==prevBlock) { |
michael@0 | 944 | /* the block is the same as the previous one, and filled with highValue */ |
michael@0 | 945 | c-=UTRIE2_DATA_BLOCK_LENGTH; |
michael@0 | 946 | continue; |
michael@0 | 947 | } |
michael@0 | 948 | prevBlock=block; |
michael@0 | 949 | if(block==nullBlock) { |
michael@0 | 950 | /* this is the null data block */ |
michael@0 | 951 | if(highValue!=initialValue) { |
michael@0 | 952 | return c; |
michael@0 | 953 | } |
michael@0 | 954 | c-=UTRIE2_DATA_BLOCK_LENGTH; |
michael@0 | 955 | } else { |
michael@0 | 956 | for(j=UTRIE2_DATA_BLOCK_LENGTH; j>0;) { |
michael@0 | 957 | value=data32[block+ --j]; |
michael@0 | 958 | if(value!=highValue) { |
michael@0 | 959 | return c; |
michael@0 | 960 | } |
michael@0 | 961 | --c; |
michael@0 | 962 | } |
michael@0 | 963 | } |
michael@0 | 964 | } |
michael@0 | 965 | } |
michael@0 | 966 | } |
michael@0 | 967 | |
michael@0 | 968 | /* deliver last range */ |
michael@0 | 969 | return 0; |
michael@0 | 970 | } |
michael@0 | 971 | |
michael@0 | 972 | /* |
michael@0 | 973 | * Compact a build-time trie. |
michael@0 | 974 | * |
michael@0 | 975 | * The compaction |
michael@0 | 976 | * - removes blocks that are identical with earlier ones |
michael@0 | 977 | * - overlaps adjacent blocks as much as possible (if overlap==TRUE) |
michael@0 | 978 | * - moves blocks in steps of the data granularity |
michael@0 | 979 | * - moves and overlaps blocks that overlap with multiple values in the overlap region |
michael@0 | 980 | * |
michael@0 | 981 | * It does not |
michael@0 | 982 | * - try to move and overlap blocks that are not already adjacent |
michael@0 | 983 | */ |
michael@0 | 984 | static void |
michael@0 | 985 | compactData(UNewTrie2 *trie) { |
michael@0 | 986 | int32_t start, newStart, movedStart; |
michael@0 | 987 | int32_t blockLength, overlap; |
michael@0 | 988 | int32_t i, mapIndex, blockCount; |
michael@0 | 989 | |
michael@0 | 990 | /* do not compact linear-ASCII data */ |
michael@0 | 991 | newStart=UTRIE2_DATA_START_OFFSET; |
michael@0 | 992 | for(start=0, i=0; start<newStart; start+=UTRIE2_DATA_BLOCK_LENGTH, ++i) { |
michael@0 | 993 | trie->map[i]=start; |
michael@0 | 994 | } |
michael@0 | 995 | |
michael@0 | 996 | /* |
michael@0 | 997 | * Start with a block length of 64 for 2-byte UTF-8, |
michael@0 | 998 | * then switch to UTRIE2_DATA_BLOCK_LENGTH. |
michael@0 | 999 | */ |
michael@0 | 1000 | blockLength=64; |
michael@0 | 1001 | blockCount=blockLength>>UTRIE2_SHIFT_2; |
michael@0 | 1002 | for(start=newStart; start<trie->dataLength;) { |
michael@0 | 1003 | /* |
michael@0 | 1004 | * start: index of first entry of current block |
michael@0 | 1005 | * newStart: index where the current block is to be moved |
michael@0 | 1006 | * (right after current end of already-compacted data) |
michael@0 | 1007 | */ |
michael@0 | 1008 | if(start==UNEWTRIE2_DATA_0800_OFFSET) { |
michael@0 | 1009 | blockLength=UTRIE2_DATA_BLOCK_LENGTH; |
michael@0 | 1010 | blockCount=1; |
michael@0 | 1011 | } |
michael@0 | 1012 | |
michael@0 | 1013 | /* skip blocks that are not used */ |
michael@0 | 1014 | if(trie->map[start>>UTRIE2_SHIFT_2]<=0) { |
michael@0 | 1015 | /* advance start to the next block */ |
michael@0 | 1016 | start+=blockLength; |
michael@0 | 1017 | |
michael@0 | 1018 | /* leave newStart with the previous block! */ |
michael@0 | 1019 | continue; |
michael@0 | 1020 | } |
michael@0 | 1021 | |
michael@0 | 1022 | /* search for an identical block */ |
michael@0 | 1023 | if( (movedStart=findSameDataBlock(trie->data, newStart, start, blockLength)) |
michael@0 | 1024 | >=0 |
michael@0 | 1025 | ) { |
michael@0 | 1026 | /* found an identical block, set the other block's index value for the current block */ |
michael@0 | 1027 | for(i=blockCount, mapIndex=start>>UTRIE2_SHIFT_2; i>0; --i) { |
michael@0 | 1028 | trie->map[mapIndex++]=movedStart; |
michael@0 | 1029 | movedStart+=UTRIE2_DATA_BLOCK_LENGTH; |
michael@0 | 1030 | } |
michael@0 | 1031 | |
michael@0 | 1032 | /* advance start to the next block */ |
michael@0 | 1033 | start+=blockLength; |
michael@0 | 1034 | |
michael@0 | 1035 | /* leave newStart with the previous block! */ |
michael@0 | 1036 | continue; |
michael@0 | 1037 | } |
michael@0 | 1038 | |
michael@0 | 1039 | /* see if the beginning of this block can be overlapped with the end of the previous block */ |
michael@0 | 1040 | /* look for maximum overlap (modulo granularity) with the previous, adjacent block */ |
michael@0 | 1041 | for(overlap=blockLength-UTRIE2_DATA_GRANULARITY; |
michael@0 | 1042 | overlap>0 && !equal_uint32(trie->data+(newStart-overlap), trie->data+start, overlap); |
michael@0 | 1043 | overlap-=UTRIE2_DATA_GRANULARITY) {} |
michael@0 | 1044 | |
michael@0 | 1045 | if(overlap>0 || newStart<start) { |
michael@0 | 1046 | /* some overlap, or just move the whole block */ |
michael@0 | 1047 | movedStart=newStart-overlap; |
michael@0 | 1048 | for(i=blockCount, mapIndex=start>>UTRIE2_SHIFT_2; i>0; --i) { |
michael@0 | 1049 | trie->map[mapIndex++]=movedStart; |
michael@0 | 1050 | movedStart+=UTRIE2_DATA_BLOCK_LENGTH; |
michael@0 | 1051 | } |
michael@0 | 1052 | |
michael@0 | 1053 | /* move the non-overlapping indexes to their new positions */ |
michael@0 | 1054 | start+=overlap; |
michael@0 | 1055 | for(i=blockLength-overlap; i>0; --i) { |
michael@0 | 1056 | trie->data[newStart++]=trie->data[start++]; |
michael@0 | 1057 | } |
michael@0 | 1058 | } else /* no overlap && newStart==start */ { |
michael@0 | 1059 | for(i=blockCount, mapIndex=start>>UTRIE2_SHIFT_2; i>0; --i) { |
michael@0 | 1060 | trie->map[mapIndex++]=start; |
michael@0 | 1061 | start+=UTRIE2_DATA_BLOCK_LENGTH; |
michael@0 | 1062 | } |
michael@0 | 1063 | newStart=start; |
michael@0 | 1064 | } |
michael@0 | 1065 | } |
michael@0 | 1066 | |
michael@0 | 1067 | /* now adjust the index-2 table */ |
michael@0 | 1068 | for(i=0; i<trie->index2Length; ++i) { |
michael@0 | 1069 | if(i==UNEWTRIE2_INDEX_GAP_OFFSET) { |
michael@0 | 1070 | /* Gap indexes are invalid (-1). Skip over the gap. */ |
michael@0 | 1071 | i+=UNEWTRIE2_INDEX_GAP_LENGTH; |
michael@0 | 1072 | } |
michael@0 | 1073 | trie->index2[i]=trie->map[trie->index2[i]>>UTRIE2_SHIFT_2]; |
michael@0 | 1074 | } |
michael@0 | 1075 | trie->dataNullOffset=trie->map[trie->dataNullOffset>>UTRIE2_SHIFT_2]; |
michael@0 | 1076 | |
michael@0 | 1077 | /* ensure dataLength alignment */ |
michael@0 | 1078 | while((newStart&(UTRIE2_DATA_GRANULARITY-1))!=0) { |
michael@0 | 1079 | trie->data[newStart++]=trie->initialValue; |
michael@0 | 1080 | } |
michael@0 | 1081 | |
michael@0 | 1082 | #ifdef UTRIE2_DEBUG |
michael@0 | 1083 | /* we saved some space */ |
michael@0 | 1084 | printf("compacting UTrie2: count of 32-bit data words %lu->%lu\n", |
michael@0 | 1085 | (long)trie->dataLength, (long)newStart); |
michael@0 | 1086 | #endif |
michael@0 | 1087 | |
michael@0 | 1088 | trie->dataLength=newStart; |
michael@0 | 1089 | } |
michael@0 | 1090 | |
michael@0 | 1091 | static void |
michael@0 | 1092 | compactIndex2(UNewTrie2 *trie) { |
michael@0 | 1093 | int32_t i, start, newStart, movedStart, overlap; |
michael@0 | 1094 | |
michael@0 | 1095 | /* do not compact linear-BMP index-2 blocks */ |
michael@0 | 1096 | newStart=UTRIE2_INDEX_2_BMP_LENGTH; |
michael@0 | 1097 | for(start=0, i=0; start<newStart; start+=UTRIE2_INDEX_2_BLOCK_LENGTH, ++i) { |
michael@0 | 1098 | trie->map[i]=start; |
michael@0 | 1099 | } |
michael@0 | 1100 | |
michael@0 | 1101 | /* Reduce the index table gap to what will be needed at runtime. */ |
michael@0 | 1102 | newStart+=UTRIE2_UTF8_2B_INDEX_2_LENGTH+((trie->highStart-0x10000)>>UTRIE2_SHIFT_1); |
michael@0 | 1103 | |
michael@0 | 1104 | for(start=UNEWTRIE2_INDEX_2_NULL_OFFSET; start<trie->index2Length;) { |
michael@0 | 1105 | /* |
michael@0 | 1106 | * start: index of first entry of current block |
michael@0 | 1107 | * newStart: index where the current block is to be moved |
michael@0 | 1108 | * (right after current end of already-compacted data) |
michael@0 | 1109 | */ |
michael@0 | 1110 | |
michael@0 | 1111 | /* search for an identical block */ |
michael@0 | 1112 | if( (movedStart=findSameIndex2Block(trie->index2, newStart, start)) |
michael@0 | 1113 | >=0 |
michael@0 | 1114 | ) { |
michael@0 | 1115 | /* found an identical block, set the other block's index value for the current block */ |
michael@0 | 1116 | trie->map[start>>UTRIE2_SHIFT_1_2]=movedStart; |
michael@0 | 1117 | |
michael@0 | 1118 | /* advance start to the next block */ |
michael@0 | 1119 | start+=UTRIE2_INDEX_2_BLOCK_LENGTH; |
michael@0 | 1120 | |
michael@0 | 1121 | /* leave newStart with the previous block! */ |
michael@0 | 1122 | continue; |
michael@0 | 1123 | } |
michael@0 | 1124 | |
michael@0 | 1125 | /* see if the beginning of this block can be overlapped with the end of the previous block */ |
michael@0 | 1126 | /* look for maximum overlap with the previous, adjacent block */ |
michael@0 | 1127 | for(overlap=UTRIE2_INDEX_2_BLOCK_LENGTH-1; |
michael@0 | 1128 | overlap>0 && !equal_int32(trie->index2+(newStart-overlap), trie->index2+start, overlap); |
michael@0 | 1129 | --overlap) {} |
michael@0 | 1130 | |
michael@0 | 1131 | if(overlap>0 || newStart<start) { |
michael@0 | 1132 | /* some overlap, or just move the whole block */ |
michael@0 | 1133 | trie->map[start>>UTRIE2_SHIFT_1_2]=newStart-overlap; |
michael@0 | 1134 | |
michael@0 | 1135 | /* move the non-overlapping indexes to their new positions */ |
michael@0 | 1136 | start+=overlap; |
michael@0 | 1137 | for(i=UTRIE2_INDEX_2_BLOCK_LENGTH-overlap; i>0; --i) { |
michael@0 | 1138 | trie->index2[newStart++]=trie->index2[start++]; |
michael@0 | 1139 | } |
michael@0 | 1140 | } else /* no overlap && newStart==start */ { |
michael@0 | 1141 | trie->map[start>>UTRIE2_SHIFT_1_2]=start; |
michael@0 | 1142 | start+=UTRIE2_INDEX_2_BLOCK_LENGTH; |
michael@0 | 1143 | newStart=start; |
michael@0 | 1144 | } |
michael@0 | 1145 | } |
michael@0 | 1146 | |
michael@0 | 1147 | /* now adjust the index-1 table */ |
michael@0 | 1148 | for(i=0; i<UNEWTRIE2_INDEX_1_LENGTH; ++i) { |
michael@0 | 1149 | trie->index1[i]=trie->map[trie->index1[i]>>UTRIE2_SHIFT_1_2]; |
michael@0 | 1150 | } |
michael@0 | 1151 | trie->index2NullOffset=trie->map[trie->index2NullOffset>>UTRIE2_SHIFT_1_2]; |
michael@0 | 1152 | |
michael@0 | 1153 | /* |
michael@0 | 1154 | * Ensure data table alignment: |
michael@0 | 1155 | * Needs to be granularity-aligned for 16-bit trie |
michael@0 | 1156 | * (so that dataMove will be down-shiftable), |
michael@0 | 1157 | * and 2-aligned for uint32_t data. |
michael@0 | 1158 | */ |
michael@0 | 1159 | while((newStart&((UTRIE2_DATA_GRANULARITY-1)|1))!=0) { |
michael@0 | 1160 | /* Arbitrary value: 0x3fffc not possible for real data. */ |
michael@0 | 1161 | trie->index2[newStart++]=(int32_t)0xffff<<UTRIE2_INDEX_SHIFT; |
michael@0 | 1162 | } |
michael@0 | 1163 | |
michael@0 | 1164 | #ifdef UTRIE2_DEBUG |
michael@0 | 1165 | /* we saved some space */ |
michael@0 | 1166 | printf("compacting UTrie2: count of 16-bit index-2 words %lu->%lu\n", |
michael@0 | 1167 | (long)trie->index2Length, (long)newStart); |
michael@0 | 1168 | #endif |
michael@0 | 1169 | |
michael@0 | 1170 | trie->index2Length=newStart; |
michael@0 | 1171 | } |
michael@0 | 1172 | |
michael@0 | 1173 | static void |
michael@0 | 1174 | compactTrie(UTrie2 *trie, UErrorCode *pErrorCode) { |
michael@0 | 1175 | UNewTrie2 *newTrie; |
michael@0 | 1176 | UChar32 highStart, suppHighStart; |
michael@0 | 1177 | uint32_t highValue; |
michael@0 | 1178 | |
michael@0 | 1179 | newTrie=trie->newTrie; |
michael@0 | 1180 | |
michael@0 | 1181 | /* find highStart and round it up */ |
michael@0 | 1182 | highValue=utrie2_get32(trie, 0x10ffff); |
michael@0 | 1183 | highStart=findHighStart(newTrie, highValue); |
michael@0 | 1184 | highStart=(highStart+(UTRIE2_CP_PER_INDEX_1_ENTRY-1))&~(UTRIE2_CP_PER_INDEX_1_ENTRY-1); |
michael@0 | 1185 | if(highStart==0x110000) { |
michael@0 | 1186 | highValue=trie->errorValue; |
michael@0 | 1187 | } |
michael@0 | 1188 | |
michael@0 | 1189 | /* |
michael@0 | 1190 | * Set trie->highStart only after utrie2_get32(trie, highStart). |
michael@0 | 1191 | * Otherwise utrie2_get32(trie, highStart) would try to read the highValue. |
michael@0 | 1192 | */ |
michael@0 | 1193 | trie->highStart=newTrie->highStart=highStart; |
michael@0 | 1194 | |
michael@0 | 1195 | #ifdef UTRIE2_DEBUG |
michael@0 | 1196 | printf("UTrie2: highStart U+%04lx highValue 0x%lx initialValue 0x%lx\n", |
michael@0 | 1197 | (long)highStart, (long)highValue, (long)trie->initialValue); |
michael@0 | 1198 | #endif |
michael@0 | 1199 | |
michael@0 | 1200 | if(highStart<0x110000) { |
michael@0 | 1201 | /* Blank out [highStart..10ffff] to release associated data blocks. */ |
michael@0 | 1202 | suppHighStart= highStart<=0x10000 ? 0x10000 : highStart; |
michael@0 | 1203 | utrie2_setRange32(trie, suppHighStart, 0x10ffff, trie->initialValue, TRUE, pErrorCode); |
michael@0 | 1204 | if(U_FAILURE(*pErrorCode)) { |
michael@0 | 1205 | return; |
michael@0 | 1206 | } |
michael@0 | 1207 | } |
michael@0 | 1208 | |
michael@0 | 1209 | compactData(newTrie); |
michael@0 | 1210 | if(highStart>0x10000) { |
michael@0 | 1211 | compactIndex2(newTrie); |
michael@0 | 1212 | #ifdef UTRIE2_DEBUG |
michael@0 | 1213 | } else { |
michael@0 | 1214 | printf("UTrie2: highStart U+%04lx count of 16-bit index-2 words %lu->%lu\n", |
michael@0 | 1215 | (long)highStart, (long)trie->newTrie->index2Length, (long)UTRIE2_INDEX_1_OFFSET); |
michael@0 | 1216 | #endif |
michael@0 | 1217 | } |
michael@0 | 1218 | |
michael@0 | 1219 | /* |
michael@0 | 1220 | * Store the highValue in the data array and round up the dataLength. |
michael@0 | 1221 | * Must be done after compactData() because that assumes that dataLength |
michael@0 | 1222 | * is a multiple of UTRIE2_DATA_BLOCK_LENGTH. |
michael@0 | 1223 | */ |
michael@0 | 1224 | newTrie->data[newTrie->dataLength++]=highValue; |
michael@0 | 1225 | while((newTrie->dataLength&(UTRIE2_DATA_GRANULARITY-1))!=0) { |
michael@0 | 1226 | newTrie->data[newTrie->dataLength++]=trie->initialValue; |
michael@0 | 1227 | } |
michael@0 | 1228 | |
michael@0 | 1229 | newTrie->isCompacted=TRUE; |
michael@0 | 1230 | } |
michael@0 | 1231 | |
michael@0 | 1232 | /* serialization ------------------------------------------------------------ */ |
michael@0 | 1233 | |
michael@0 | 1234 | /** |
michael@0 | 1235 | * Maximum length of the runtime index array. |
michael@0 | 1236 | * Limited by its own 16-bit index values, and by uint16_t UTrie2Header.indexLength. |
michael@0 | 1237 | * (The actual maximum length is lower, |
michael@0 | 1238 | * (0x110000>>UTRIE2_SHIFT_2)+UTRIE2_UTF8_2B_INDEX_2_LENGTH+UTRIE2_MAX_INDEX_1_LENGTH.) |
michael@0 | 1239 | */ |
michael@0 | 1240 | #define UTRIE2_MAX_INDEX_LENGTH 0xffff |
michael@0 | 1241 | |
michael@0 | 1242 | /** |
michael@0 | 1243 | * Maximum length of the runtime data array. |
michael@0 | 1244 | * Limited by 16-bit index values that are left-shifted by UTRIE2_INDEX_SHIFT, |
michael@0 | 1245 | * and by uint16_t UTrie2Header.shiftedDataLength. |
michael@0 | 1246 | */ |
michael@0 | 1247 | #define UTRIE2_MAX_DATA_LENGTH (0xffff<<UTRIE2_INDEX_SHIFT) |
michael@0 | 1248 | |
michael@0 | 1249 | /* Compact and internally serialize the trie. */ |
michael@0 | 1250 | U_CAPI void U_EXPORT2 |
michael@0 | 1251 | utrie2_freeze(UTrie2 *trie, UTrie2ValueBits valueBits, UErrorCode *pErrorCode) { |
michael@0 | 1252 | UNewTrie2 *newTrie; |
michael@0 | 1253 | UTrie2Header *header; |
michael@0 | 1254 | uint32_t *p; |
michael@0 | 1255 | uint16_t *dest16; |
michael@0 | 1256 | int32_t i, length; |
michael@0 | 1257 | int32_t allIndexesLength; |
michael@0 | 1258 | int32_t dataMove; /* >0 if the data is moved to the end of the index array */ |
michael@0 | 1259 | UChar32 highStart; |
michael@0 | 1260 | |
michael@0 | 1261 | /* argument check */ |
michael@0 | 1262 | if(U_FAILURE(*pErrorCode)) { |
michael@0 | 1263 | return; |
michael@0 | 1264 | } |
michael@0 | 1265 | if( trie==NULL || |
michael@0 | 1266 | valueBits<0 || UTRIE2_COUNT_VALUE_BITS<=valueBits |
michael@0 | 1267 | ) { |
michael@0 | 1268 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 1269 | return; |
michael@0 | 1270 | } |
michael@0 | 1271 | newTrie=trie->newTrie; |
michael@0 | 1272 | if(newTrie==NULL) { |
michael@0 | 1273 | /* already frozen */ |
michael@0 | 1274 | UTrie2ValueBits frozenValueBits= |
michael@0 | 1275 | trie->data16!=NULL ? UTRIE2_16_VALUE_BITS : UTRIE2_32_VALUE_BITS; |
michael@0 | 1276 | if(valueBits!=frozenValueBits) { |
michael@0 | 1277 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 1278 | } |
michael@0 | 1279 | return; |
michael@0 | 1280 | } |
michael@0 | 1281 | |
michael@0 | 1282 | /* compact if necessary */ |
michael@0 | 1283 | if(!newTrie->isCompacted) { |
michael@0 | 1284 | compactTrie(trie, pErrorCode); |
michael@0 | 1285 | if(U_FAILURE(*pErrorCode)) { |
michael@0 | 1286 | return; |
michael@0 | 1287 | } |
michael@0 | 1288 | } |
michael@0 | 1289 | highStart=trie->highStart; |
michael@0 | 1290 | |
michael@0 | 1291 | if(highStart<=0x10000) { |
michael@0 | 1292 | allIndexesLength=UTRIE2_INDEX_1_OFFSET; |
michael@0 | 1293 | } else { |
michael@0 | 1294 | allIndexesLength=newTrie->index2Length; |
michael@0 | 1295 | } |
michael@0 | 1296 | if(valueBits==UTRIE2_16_VALUE_BITS) { |
michael@0 | 1297 | dataMove=allIndexesLength; |
michael@0 | 1298 | } else { |
michael@0 | 1299 | dataMove=0; |
michael@0 | 1300 | } |
michael@0 | 1301 | |
michael@0 | 1302 | /* are indexLength and dataLength within limits? */ |
michael@0 | 1303 | if( /* for unshifted indexLength */ |
michael@0 | 1304 | allIndexesLength>UTRIE2_MAX_INDEX_LENGTH || |
michael@0 | 1305 | /* for unshifted dataNullOffset */ |
michael@0 | 1306 | (dataMove+newTrie->dataNullOffset)>0xffff || |
michael@0 | 1307 | /* for unshifted 2-byte UTF-8 index-2 values */ |
michael@0 | 1308 | (dataMove+UNEWTRIE2_DATA_0800_OFFSET)>0xffff || |
michael@0 | 1309 | /* for shiftedDataLength */ |
michael@0 | 1310 | (dataMove+newTrie->dataLength)>UTRIE2_MAX_DATA_LENGTH |
michael@0 | 1311 | ) { |
michael@0 | 1312 | *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; |
michael@0 | 1313 | return; |
michael@0 | 1314 | } |
michael@0 | 1315 | |
michael@0 | 1316 | /* calculate the total serialized length */ |
michael@0 | 1317 | length=sizeof(UTrie2Header)+allIndexesLength*2; |
michael@0 | 1318 | if(valueBits==UTRIE2_16_VALUE_BITS) { |
michael@0 | 1319 | length+=newTrie->dataLength*2; |
michael@0 | 1320 | } else { |
michael@0 | 1321 | length+=newTrie->dataLength*4; |
michael@0 | 1322 | } |
michael@0 | 1323 | |
michael@0 | 1324 | trie->memory=uprv_malloc(length); |
michael@0 | 1325 | if(trie->memory==NULL) { |
michael@0 | 1326 | *pErrorCode=U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 1327 | return; |
michael@0 | 1328 | } |
michael@0 | 1329 | trie->length=length; |
michael@0 | 1330 | trie->isMemoryOwned=TRUE; |
michael@0 | 1331 | |
michael@0 | 1332 | trie->indexLength=allIndexesLength; |
michael@0 | 1333 | trie->dataLength=newTrie->dataLength; |
michael@0 | 1334 | if(highStart<=0x10000) { |
michael@0 | 1335 | trie->index2NullOffset=0xffff; |
michael@0 | 1336 | } else { |
michael@0 | 1337 | trie->index2NullOffset=UTRIE2_INDEX_2_OFFSET+newTrie->index2NullOffset; |
michael@0 | 1338 | } |
michael@0 | 1339 | trie->dataNullOffset=(uint16_t)(dataMove+newTrie->dataNullOffset); |
michael@0 | 1340 | trie->highValueIndex=dataMove+trie->dataLength-UTRIE2_DATA_GRANULARITY; |
michael@0 | 1341 | |
michael@0 | 1342 | /* set the header fields */ |
michael@0 | 1343 | header=(UTrie2Header *)trie->memory; |
michael@0 | 1344 | |
michael@0 | 1345 | header->signature=UTRIE2_SIG; /* "Tri2" */ |
michael@0 | 1346 | header->options=(uint16_t)valueBits; |
michael@0 | 1347 | |
michael@0 | 1348 | header->indexLength=(uint16_t)trie->indexLength; |
michael@0 | 1349 | header->shiftedDataLength=(uint16_t)(trie->dataLength>>UTRIE2_INDEX_SHIFT); |
michael@0 | 1350 | header->index2NullOffset=trie->index2NullOffset; |
michael@0 | 1351 | header->dataNullOffset=trie->dataNullOffset; |
michael@0 | 1352 | header->shiftedHighStart=(uint16_t)(highStart>>UTRIE2_SHIFT_1); |
michael@0 | 1353 | |
michael@0 | 1354 | /* fill the index and data arrays */ |
michael@0 | 1355 | dest16=(uint16_t *)(header+1); |
michael@0 | 1356 | trie->index=dest16; |
michael@0 | 1357 | |
michael@0 | 1358 | /* write the index-2 array values shifted right by UTRIE2_INDEX_SHIFT, after adding dataMove */ |
michael@0 | 1359 | p=(uint32_t *)newTrie->index2; |
michael@0 | 1360 | for(i=UTRIE2_INDEX_2_BMP_LENGTH; i>0; --i) { |
michael@0 | 1361 | *dest16++=(uint16_t)((dataMove + *p++)>>UTRIE2_INDEX_SHIFT); |
michael@0 | 1362 | } |
michael@0 | 1363 | |
michael@0 | 1364 | /* write UTF-8 2-byte index-2 values, not right-shifted */ |
michael@0 | 1365 | for(i=0; i<(0xc2-0xc0); ++i) { /* C0..C1 */ |
michael@0 | 1366 | *dest16++=(uint16_t)(dataMove+UTRIE2_BAD_UTF8_DATA_OFFSET); |
michael@0 | 1367 | } |
michael@0 | 1368 | for(; i<(0xe0-0xc0); ++i) { /* C2..DF */ |
michael@0 | 1369 | *dest16++=(uint16_t)(dataMove+newTrie->index2[i<<(6-UTRIE2_SHIFT_2)]); |
michael@0 | 1370 | } |
michael@0 | 1371 | |
michael@0 | 1372 | if(highStart>0x10000) { |
michael@0 | 1373 | int32_t index1Length=(highStart-0x10000)>>UTRIE2_SHIFT_1; |
michael@0 | 1374 | int32_t index2Offset=UTRIE2_INDEX_2_BMP_LENGTH+UTRIE2_UTF8_2B_INDEX_2_LENGTH+index1Length; |
michael@0 | 1375 | |
michael@0 | 1376 | /* write 16-bit index-1 values for supplementary code points */ |
michael@0 | 1377 | p=(uint32_t *)newTrie->index1+UTRIE2_OMITTED_BMP_INDEX_1_LENGTH; |
michael@0 | 1378 | for(i=index1Length; i>0; --i) { |
michael@0 | 1379 | *dest16++=(uint16_t)(UTRIE2_INDEX_2_OFFSET + *p++); |
michael@0 | 1380 | } |
michael@0 | 1381 | |
michael@0 | 1382 | /* |
michael@0 | 1383 | * write the index-2 array values for supplementary code points, |
michael@0 | 1384 | * shifted right by UTRIE2_INDEX_SHIFT, after adding dataMove |
michael@0 | 1385 | */ |
michael@0 | 1386 | p=(uint32_t *)newTrie->index2+index2Offset; |
michael@0 | 1387 | for(i=newTrie->index2Length-index2Offset; i>0; --i) { |
michael@0 | 1388 | *dest16++=(uint16_t)((dataMove + *p++)>>UTRIE2_INDEX_SHIFT); |
michael@0 | 1389 | } |
michael@0 | 1390 | } |
michael@0 | 1391 | |
michael@0 | 1392 | /* write the 16/32-bit data array */ |
michael@0 | 1393 | switch(valueBits) { |
michael@0 | 1394 | case UTRIE2_16_VALUE_BITS: |
michael@0 | 1395 | /* write 16-bit data values */ |
michael@0 | 1396 | trie->data16=dest16; |
michael@0 | 1397 | trie->data32=NULL; |
michael@0 | 1398 | p=newTrie->data; |
michael@0 | 1399 | for(i=newTrie->dataLength; i>0; --i) { |
michael@0 | 1400 | *dest16++=(uint16_t)*p++; |
michael@0 | 1401 | } |
michael@0 | 1402 | break; |
michael@0 | 1403 | case UTRIE2_32_VALUE_BITS: |
michael@0 | 1404 | /* write 32-bit data values */ |
michael@0 | 1405 | trie->data16=NULL; |
michael@0 | 1406 | trie->data32=(uint32_t *)dest16; |
michael@0 | 1407 | uprv_memcpy(dest16, newTrie->data, newTrie->dataLength*4); |
michael@0 | 1408 | break; |
michael@0 | 1409 | default: |
michael@0 | 1410 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 1411 | return; |
michael@0 | 1412 | } |
michael@0 | 1413 | |
michael@0 | 1414 | /* Delete the UNewTrie2. */ |
michael@0 | 1415 | uprv_free(newTrie->data); |
michael@0 | 1416 | uprv_free(newTrie); |
michael@0 | 1417 | trie->newTrie=NULL; |
michael@0 | 1418 | } |
michael@0 | 1419 | |
michael@0 | 1420 | U_CAPI UBool U_EXPORT2 |
michael@0 | 1421 | utrie2_isFrozen(const UTrie2 *trie) { |
michael@0 | 1422 | return (UBool)(trie->newTrie==NULL); |
michael@0 | 1423 | } |
michael@0 | 1424 | |
michael@0 | 1425 | U_CAPI int32_t U_EXPORT2 |
michael@0 | 1426 | utrie2_serialize(UTrie2 *trie, |
michael@0 | 1427 | void *data, int32_t capacity, |
michael@0 | 1428 | UErrorCode *pErrorCode) { |
michael@0 | 1429 | /* argument check */ |
michael@0 | 1430 | if(U_FAILURE(*pErrorCode)) { |
michael@0 | 1431 | return 0; |
michael@0 | 1432 | } |
michael@0 | 1433 | |
michael@0 | 1434 | if( trie==NULL || trie->memory==NULL || trie->newTrie!=NULL || |
michael@0 | 1435 | capacity<0 || (capacity>0 && (data==NULL || (U_POINTER_MASK_LSB(data, 3)!=0))) |
michael@0 | 1436 | ) { |
michael@0 | 1437 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 1438 | return 0; |
michael@0 | 1439 | } |
michael@0 | 1440 | |
michael@0 | 1441 | if(capacity>=trie->length) { |
michael@0 | 1442 | uprv_memcpy(data, trie->memory, trie->length); |
michael@0 | 1443 | } else { |
michael@0 | 1444 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 1445 | } |
michael@0 | 1446 | return trie->length; |
michael@0 | 1447 | } |
michael@0 | 1448 | |
michael@0 | 1449 | /* |
michael@0 | 1450 | * This is here to avoid a dependency from utrie2.cpp on utrie.c. |
michael@0 | 1451 | * This file already depends on utrie.c. |
michael@0 | 1452 | * Otherwise, this should be in utrie2.cpp right after utrie2_swap(). |
michael@0 | 1453 | */ |
michael@0 | 1454 | U_CAPI int32_t U_EXPORT2 |
michael@0 | 1455 | utrie2_swapAnyVersion(const UDataSwapper *ds, |
michael@0 | 1456 | const void *inData, int32_t length, void *outData, |
michael@0 | 1457 | UErrorCode *pErrorCode) { |
michael@0 | 1458 | if(U_SUCCESS(*pErrorCode)) { |
michael@0 | 1459 | switch(utrie2_getVersion(inData, length, TRUE)) { |
michael@0 | 1460 | case 1: |
michael@0 | 1461 | return utrie_swap(ds, inData, length, outData, pErrorCode); |
michael@0 | 1462 | case 2: |
michael@0 | 1463 | return utrie2_swap(ds, inData, length, outData, pErrorCode); |
michael@0 | 1464 | default: |
michael@0 | 1465 | *pErrorCode=U_INVALID_FORMAT_ERROR; |
michael@0 | 1466 | return 0; |
michael@0 | 1467 | } |
michael@0 | 1468 | } |
michael@0 | 1469 | return 0; |
michael@0 | 1470 | } |