1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/common/utrie2_builder.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,1470 @@ 1.4 +/* 1.5 +****************************************************************************** 1.6 +* 1.7 +* Copyright (C) 2001-2011, International Business Machines 1.8 +* Corporation and others. All Rights Reserved. 1.9 +* 1.10 +****************************************************************************** 1.11 +* file name: utrie2_builder.cpp 1.12 +* encoding: US-ASCII 1.13 +* tab size: 8 (not used) 1.14 +* indentation:4 1.15 +* 1.16 +* created on: 2008sep26 (split off from utrie2.c) 1.17 +* created by: Markus W. Scherer 1.18 +* 1.19 +* This is a common implementation of a Unicode trie. 1.20 +* It is a kind of compressed, serializable table of 16- or 32-bit values associated with 1.21 +* Unicode code points (0..0x10ffff). 1.22 +* This is the second common version of a Unicode trie (hence the name UTrie2). 1.23 +* See utrie2.h for a comparison. 1.24 +* 1.25 +* This file contains only the builder code. 1.26 +* See utrie2.c for the runtime and enumeration code. 1.27 +*/ 1.28 +#ifdef UTRIE2_DEBUG 1.29 +# include <stdio.h> 1.30 +#endif 1.31 + 1.32 +#include "unicode/utypes.h" 1.33 +#include "cmemory.h" 1.34 +#include "utrie2.h" 1.35 +#include "utrie2_impl.h" 1.36 + 1.37 +#include "utrie.h" /* for utrie2_fromUTrie() and utrie_swap() */ 1.38 + 1.39 +#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 1.40 + 1.41 +/* Implementation notes ----------------------------------------------------- */ 1.42 + 1.43 +/* 1.44 + * The UTRIE2_SHIFT_1, UTRIE2_SHIFT_2, UTRIE2_INDEX_SHIFT and other values 1.45 + * have been chosen to minimize trie sizes overall. 1.46 + * Most of the code is flexible enough to work with a range of values, 1.47 + * within certain limits. 1.48 + * 1.49 + * Exception: Support for separate values for lead surrogate code _units_ 1.50 + * vs. code _points_ was added after the constants were fixed, 1.51 + * and has not been tested nor particularly designed for different constant values. 1.52 + * (Especially the utrie2_enum() code that jumps to the special LSCP index-2 1.53 + * part and back.) 1.54 + * 1.55 + * Requires UTRIE2_SHIFT_2<=6. Otherwise 0xc0 which is the top of the ASCII-linear data 1.56 + * including the bad-UTF-8-data block is not a multiple of UTRIE2_DATA_BLOCK_LENGTH 1.57 + * and map[block>>UTRIE2_SHIFT_2] (used in reference counting and compaction 1.58 + * remapping) stops working. 1.59 + * 1.60 + * Requires UTRIE2_SHIFT_1>=10 because utrie2_enumForLeadSurrogate() 1.61 + * assumes that a single index-2 block is used for 0x400 code points 1.62 + * corresponding to one lead surrogate. 1.63 + * 1.64 + * Requires UTRIE2_SHIFT_1<=16. Otherwise one single index-2 block contains 1.65 + * more than one Unicode plane, and the split of the index-2 table into a BMP 1.66 + * part and a supplementary part, with a gap in between, would not work. 1.67 + * 1.68 + * Requires UTRIE2_INDEX_SHIFT>=1 not because of the code but because 1.69 + * there is data with more than 64k distinct values, 1.70 + * for example for Unihan collation with a separate collation weight per 1.71 + * Han character. 1.72 + */ 1.73 + 1.74 +/* Building a trie ----------------------------------------------------------*/ 1.75 + 1.76 +enum { 1.77 + /** The null index-2 block, following the gap in the index-2 table. */ 1.78 + UNEWTRIE2_INDEX_2_NULL_OFFSET=UNEWTRIE2_INDEX_GAP_OFFSET+UNEWTRIE2_INDEX_GAP_LENGTH, 1.79 + 1.80 + /** The start of allocated index-2 blocks. */ 1.81 + UNEWTRIE2_INDEX_2_START_OFFSET=UNEWTRIE2_INDEX_2_NULL_OFFSET+UTRIE2_INDEX_2_BLOCK_LENGTH, 1.82 + 1.83 + /** 1.84 + * The null data block. 1.85 + * Length 64=0x40 even if UTRIE2_DATA_BLOCK_LENGTH is smaller, 1.86 + * to work with 6-bit trail bytes from 2-byte UTF-8. 1.87 + */ 1.88 + UNEWTRIE2_DATA_NULL_OFFSET=UTRIE2_DATA_START_OFFSET, 1.89 + 1.90 + /** The start of allocated data blocks. */ 1.91 + UNEWTRIE2_DATA_START_OFFSET=UNEWTRIE2_DATA_NULL_OFFSET+0x40, 1.92 + 1.93 + /** 1.94 + * The start of data blocks for U+0800 and above. 1.95 + * Below, compaction uses a block length of 64 for 2-byte UTF-8. 1.96 + * From here on, compaction uses UTRIE2_DATA_BLOCK_LENGTH. 1.97 + * Data values for 0x780 code points beyond ASCII. 1.98 + */ 1.99 + UNEWTRIE2_DATA_0800_OFFSET=UNEWTRIE2_DATA_START_OFFSET+0x780 1.100 +}; 1.101 + 1.102 +/* Start with allocation of 16k data entries. */ 1.103 +#define UNEWTRIE2_INITIAL_DATA_LENGTH ((int32_t)1<<14) 1.104 + 1.105 +/* Grow about 8x each time. */ 1.106 +#define UNEWTRIE2_MEDIUM_DATA_LENGTH ((int32_t)1<<17) 1.107 + 1.108 +static int32_t 1.109 +allocIndex2Block(UNewTrie2 *trie); 1.110 + 1.111 +U_CAPI UTrie2 * U_EXPORT2 1.112 +utrie2_open(uint32_t initialValue, uint32_t errorValue, UErrorCode *pErrorCode) { 1.113 + UTrie2 *trie; 1.114 + UNewTrie2 *newTrie; 1.115 + uint32_t *data; 1.116 + int32_t i, j; 1.117 + 1.118 + if(U_FAILURE(*pErrorCode)) { 1.119 + return NULL; 1.120 + } 1.121 + 1.122 + trie=(UTrie2 *)uprv_malloc(sizeof(UTrie2)); 1.123 + newTrie=(UNewTrie2 *)uprv_malloc(sizeof(UNewTrie2)); 1.124 + data=(uint32_t *)uprv_malloc(UNEWTRIE2_INITIAL_DATA_LENGTH*4); 1.125 + if(trie==NULL || newTrie==NULL || data==NULL) { 1.126 + uprv_free(trie); 1.127 + uprv_free(newTrie); 1.128 + uprv_free(data); 1.129 + *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 1.130 + return 0; 1.131 + } 1.132 + 1.133 + uprv_memset(trie, 0, sizeof(UTrie2)); 1.134 + trie->initialValue=initialValue; 1.135 + trie->errorValue=errorValue; 1.136 + trie->highStart=0x110000; 1.137 + trie->newTrie=newTrie; 1.138 + 1.139 + newTrie->data=data; 1.140 + newTrie->dataCapacity=UNEWTRIE2_INITIAL_DATA_LENGTH; 1.141 + newTrie->initialValue=initialValue; 1.142 + newTrie->errorValue=errorValue; 1.143 + newTrie->highStart=0x110000; 1.144 + newTrie->firstFreeBlock=0; /* no free block in the list */ 1.145 + newTrie->isCompacted=FALSE; 1.146 + 1.147 + /* 1.148 + * preallocate and reset 1.149 + * - ASCII 1.150 + * - the bad-UTF-8-data block 1.151 + * - the null data block 1.152 + */ 1.153 + for(i=0; i<0x80; ++i) { 1.154 + newTrie->data[i]=initialValue; 1.155 + } 1.156 + for(; i<0xc0; ++i) { 1.157 + newTrie->data[i]=errorValue; 1.158 + } 1.159 + for(i=UNEWTRIE2_DATA_NULL_OFFSET; i<UNEWTRIE2_DATA_START_OFFSET; ++i) { 1.160 + newTrie->data[i]=initialValue; 1.161 + } 1.162 + newTrie->dataNullOffset=UNEWTRIE2_DATA_NULL_OFFSET; 1.163 + newTrie->dataLength=UNEWTRIE2_DATA_START_OFFSET; 1.164 + 1.165 + /* set the index-2 indexes for the 2=0x80>>UTRIE2_SHIFT_2 ASCII data blocks */ 1.166 + for(i=0, j=0; j<0x80; ++i, j+=UTRIE2_DATA_BLOCK_LENGTH) { 1.167 + newTrie->index2[i]=j; 1.168 + newTrie->map[i]=1; 1.169 + } 1.170 + /* reference counts for the bad-UTF-8-data block */ 1.171 + for(; j<0xc0; ++i, j+=UTRIE2_DATA_BLOCK_LENGTH) { 1.172 + newTrie->map[i]=0; 1.173 + } 1.174 + /* 1.175 + * Reference counts for the null data block: all blocks except for the ASCII blocks. 1.176 + * Plus 1 so that we don't drop this block during compaction. 1.177 + * Plus as many as needed for lead surrogate code points. 1.178 + */ 1.179 + /* i==newTrie->dataNullOffset */ 1.180 + newTrie->map[i++]= 1.181 + (0x110000>>UTRIE2_SHIFT_2)- 1.182 + (0x80>>UTRIE2_SHIFT_2)+ 1.183 + 1+ 1.184 + UTRIE2_LSCP_INDEX_2_LENGTH; 1.185 + j+=UTRIE2_DATA_BLOCK_LENGTH; 1.186 + for(; j<UNEWTRIE2_DATA_START_OFFSET; ++i, j+=UTRIE2_DATA_BLOCK_LENGTH) { 1.187 + newTrie->map[i]=0; 1.188 + } 1.189 + 1.190 + /* 1.191 + * set the remaining indexes in the BMP index-2 block 1.192 + * to the null data block 1.193 + */ 1.194 + for(i=0x80>>UTRIE2_SHIFT_2; i<UTRIE2_INDEX_2_BMP_LENGTH; ++i) { 1.195 + newTrie->index2[i]=UNEWTRIE2_DATA_NULL_OFFSET; 1.196 + } 1.197 + 1.198 + /* 1.199 + * Fill the index gap with impossible values so that compaction 1.200 + * does not overlap other index-2 blocks with the gap. 1.201 + */ 1.202 + for(i=0; i<UNEWTRIE2_INDEX_GAP_LENGTH; ++i) { 1.203 + newTrie->index2[UNEWTRIE2_INDEX_GAP_OFFSET+i]=-1; 1.204 + } 1.205 + 1.206 + /* set the indexes in the null index-2 block */ 1.207 + for(i=0; i<UTRIE2_INDEX_2_BLOCK_LENGTH; ++i) { 1.208 + newTrie->index2[UNEWTRIE2_INDEX_2_NULL_OFFSET+i]=UNEWTRIE2_DATA_NULL_OFFSET; 1.209 + } 1.210 + newTrie->index2NullOffset=UNEWTRIE2_INDEX_2_NULL_OFFSET; 1.211 + newTrie->index2Length=UNEWTRIE2_INDEX_2_START_OFFSET; 1.212 + 1.213 + /* set the index-1 indexes for the linear index-2 block */ 1.214 + for(i=0, j=0; 1.215 + i<UTRIE2_OMITTED_BMP_INDEX_1_LENGTH; 1.216 + ++i, j+=UTRIE2_INDEX_2_BLOCK_LENGTH 1.217 + ) { 1.218 + newTrie->index1[i]=j; 1.219 + } 1.220 + 1.221 + /* set the remaining index-1 indexes to the null index-2 block */ 1.222 + for(; i<UNEWTRIE2_INDEX_1_LENGTH; ++i) { 1.223 + newTrie->index1[i]=UNEWTRIE2_INDEX_2_NULL_OFFSET; 1.224 + } 1.225 + 1.226 + /* 1.227 + * Preallocate and reset data for U+0080..U+07ff, 1.228 + * for 2-byte UTF-8 which will be compacted in 64-blocks 1.229 + * even if UTRIE2_DATA_BLOCK_LENGTH is smaller. 1.230 + */ 1.231 + for(i=0x80; i<0x800; i+=UTRIE2_DATA_BLOCK_LENGTH) { 1.232 + utrie2_set32(trie, i, initialValue, pErrorCode); 1.233 + } 1.234 + 1.235 + return trie; 1.236 +} 1.237 + 1.238 +static UNewTrie2 * 1.239 +cloneBuilder(const UNewTrie2 *other) { 1.240 + UNewTrie2 *trie; 1.241 + 1.242 + trie=(UNewTrie2 *)uprv_malloc(sizeof(UNewTrie2)); 1.243 + if(trie==NULL) { 1.244 + return NULL; 1.245 + } 1.246 + 1.247 + trie->data=(uint32_t *)uprv_malloc(other->dataCapacity*4); 1.248 + if(trie->data==NULL) { 1.249 + uprv_free(trie); 1.250 + return NULL; 1.251 + } 1.252 + trie->dataCapacity=other->dataCapacity; 1.253 + 1.254 + /* clone data */ 1.255 + uprv_memcpy(trie->index1, other->index1, sizeof(trie->index1)); 1.256 + uprv_memcpy(trie->index2, other->index2, other->index2Length*4); 1.257 + trie->index2NullOffset=other->index2NullOffset; 1.258 + trie->index2Length=other->index2Length; 1.259 + 1.260 + uprv_memcpy(trie->data, other->data, other->dataLength*4); 1.261 + trie->dataNullOffset=other->dataNullOffset; 1.262 + trie->dataLength=other->dataLength; 1.263 + 1.264 + /* reference counters */ 1.265 + if(other->isCompacted) { 1.266 + trie->firstFreeBlock=0; 1.267 + } else { 1.268 + uprv_memcpy(trie->map, other->map, (other->dataLength>>UTRIE2_SHIFT_2)*4); 1.269 + trie->firstFreeBlock=other->firstFreeBlock; 1.270 + } 1.271 + 1.272 + trie->initialValue=other->initialValue; 1.273 + trie->errorValue=other->errorValue; 1.274 + trie->highStart=other->highStart; 1.275 + trie->isCompacted=other->isCompacted; 1.276 + 1.277 + return trie; 1.278 +} 1.279 + 1.280 +U_CAPI UTrie2 * U_EXPORT2 1.281 +utrie2_clone(const UTrie2 *other, UErrorCode *pErrorCode) { 1.282 + UTrie2 *trie; 1.283 + 1.284 + if(U_FAILURE(*pErrorCode)) { 1.285 + return NULL; 1.286 + } 1.287 + if(other==NULL || (other->memory==NULL && other->newTrie==NULL)) { 1.288 + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 1.289 + return NULL; 1.290 + } 1.291 + 1.292 + trie=(UTrie2 *)uprv_malloc(sizeof(UTrie2)); 1.293 + if(trie==NULL) { 1.294 + return NULL; 1.295 + } 1.296 + uprv_memcpy(trie, other, sizeof(UTrie2)); 1.297 + 1.298 + if(other->memory!=NULL) { 1.299 + trie->memory=uprv_malloc(other->length); 1.300 + if(trie->memory!=NULL) { 1.301 + trie->isMemoryOwned=TRUE; 1.302 + uprv_memcpy(trie->memory, other->memory, other->length); 1.303 + 1.304 + /* make the clone's pointers point to its own memory */ 1.305 + trie->index=(uint16_t *)trie->memory+(other->index-(uint16_t *)other->memory); 1.306 + if(other->data16!=NULL) { 1.307 + trie->data16=(uint16_t *)trie->memory+(other->data16-(uint16_t *)other->memory); 1.308 + } 1.309 + if(other->data32!=NULL) { 1.310 + trie->data32=(uint32_t *)trie->memory+(other->data32-(uint32_t *)other->memory); 1.311 + } 1.312 + } 1.313 + } else /* other->newTrie!=NULL */ { 1.314 + trie->newTrie=cloneBuilder(other->newTrie); 1.315 + } 1.316 + 1.317 + if(trie->memory==NULL && trie->newTrie==NULL) { 1.318 + uprv_free(trie); 1.319 + trie=NULL; 1.320 + } 1.321 + return trie; 1.322 +} 1.323 + 1.324 +typedef struct NewTrieAndStatus { 1.325 + UTrie2 *trie; 1.326 + UErrorCode errorCode; 1.327 + UBool exclusiveLimit; /* rather than inclusive range end */ 1.328 +} NewTrieAndStatus; 1.329 + 1.330 +static UBool U_CALLCONV 1.331 +copyEnumRange(const void *context, UChar32 start, UChar32 end, uint32_t value) { 1.332 + NewTrieAndStatus *nt=(NewTrieAndStatus *)context; 1.333 + if(value!=nt->trie->initialValue) { 1.334 + if(nt->exclusiveLimit) { 1.335 + --end; 1.336 + } 1.337 + if(start==end) { 1.338 + utrie2_set32(nt->trie, start, value, &nt->errorCode); 1.339 + } else { 1.340 + utrie2_setRange32(nt->trie, start, end, value, TRUE, &nt->errorCode); 1.341 + } 1.342 + return U_SUCCESS(nt->errorCode); 1.343 + } else { 1.344 + return TRUE; 1.345 + } 1.346 +} 1.347 + 1.348 +#ifdef UTRIE2_DEBUG 1.349 +static void 1.350 +utrie_printLengths(const UTrie *trie) { 1.351 + long indexLength=trie->indexLength; 1.352 + long dataLength=(long)trie->dataLength; 1.353 + long totalLength=(long)sizeof(UTrieHeader)+indexLength*2+dataLength*(trie->data32!=NULL ? 4 : 2); 1.354 + printf("**UTrieLengths** index:%6ld data:%6ld serialized:%6ld\n", 1.355 + indexLength, dataLength, totalLength); 1.356 +} 1.357 + 1.358 +static void 1.359 +utrie2_printLengths(const UTrie2 *trie, const char *which) { 1.360 + long indexLength=trie->indexLength; 1.361 + long dataLength=(long)trie->dataLength; 1.362 + long totalLength=(long)sizeof(UTrie2Header)+indexLength*2+dataLength*(trie->data32!=NULL ? 4 : 2); 1.363 + printf("**UTrie2Lengths(%s)** index:%6ld data:%6ld serialized:%6ld\n", 1.364 + which, indexLength, dataLength, totalLength); 1.365 +} 1.366 +#endif 1.367 + 1.368 +U_CAPI UTrie2 * U_EXPORT2 1.369 +utrie2_cloneAsThawed(const UTrie2 *other, UErrorCode *pErrorCode) { 1.370 + NewTrieAndStatus context; 1.371 + UChar lead; 1.372 + 1.373 + if(U_FAILURE(*pErrorCode)) { 1.374 + return NULL; 1.375 + } 1.376 + if(other==NULL || (other->memory==NULL && other->newTrie==NULL)) { 1.377 + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 1.378 + return NULL; 1.379 + } 1.380 + if(other->newTrie!=NULL && !other->newTrie->isCompacted) { 1.381 + return utrie2_clone(other, pErrorCode); /* clone an unfrozen trie */ 1.382 + } 1.383 + 1.384 + /* Clone the frozen trie by enumerating it and building a new one. */ 1.385 + context.trie=utrie2_open(other->initialValue, other->errorValue, pErrorCode); 1.386 + if(U_FAILURE(*pErrorCode)) { 1.387 + return NULL; 1.388 + } 1.389 + context.exclusiveLimit=FALSE; 1.390 + context.errorCode=*pErrorCode; 1.391 + utrie2_enum(other, NULL, copyEnumRange, &context); 1.392 + *pErrorCode=context.errorCode; 1.393 + for(lead=0xd800; lead<0xdc00; ++lead) { 1.394 + uint32_t value; 1.395 + if(other->data32==NULL) { 1.396 + value=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(other, lead); 1.397 + } else { 1.398 + value=UTRIE2_GET32_FROM_U16_SINGLE_LEAD(other, lead); 1.399 + } 1.400 + if(value!=other->initialValue) { 1.401 + utrie2_set32ForLeadSurrogateCodeUnit(context.trie, lead, value, pErrorCode); 1.402 + } 1.403 + } 1.404 + if(U_FAILURE(*pErrorCode)) { 1.405 + utrie2_close(context.trie); 1.406 + context.trie=NULL; 1.407 + } 1.408 + return context.trie; 1.409 +} 1.410 + 1.411 +/* Almost the same as utrie2_cloneAsThawed() but copies a UTrie and freezes the clone. */ 1.412 +U_CAPI UTrie2 * U_EXPORT2 1.413 +utrie2_fromUTrie(const UTrie *trie1, uint32_t errorValue, UErrorCode *pErrorCode) { 1.414 + NewTrieAndStatus context; 1.415 + UChar lead; 1.416 + 1.417 + if(U_FAILURE(*pErrorCode)) { 1.418 + return NULL; 1.419 + } 1.420 + if(trie1==NULL) { 1.421 + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 1.422 + return NULL; 1.423 + } 1.424 + context.trie=utrie2_open(trie1->initialValue, errorValue, pErrorCode); 1.425 + if(U_FAILURE(*pErrorCode)) { 1.426 + return NULL; 1.427 + } 1.428 + context.exclusiveLimit=TRUE; 1.429 + context.errorCode=*pErrorCode; 1.430 + utrie_enum(trie1, NULL, copyEnumRange, &context); 1.431 + *pErrorCode=context.errorCode; 1.432 + for(lead=0xd800; lead<0xdc00; ++lead) { 1.433 + uint32_t value; 1.434 + if(trie1->data32==NULL) { 1.435 + value=UTRIE_GET16_FROM_LEAD(trie1, lead); 1.436 + } else { 1.437 + value=UTRIE_GET32_FROM_LEAD(trie1, lead); 1.438 + } 1.439 + if(value!=trie1->initialValue) { 1.440 + utrie2_set32ForLeadSurrogateCodeUnit(context.trie, lead, value, pErrorCode); 1.441 + } 1.442 + } 1.443 + if(U_SUCCESS(*pErrorCode)) { 1.444 + utrie2_freeze(context.trie, 1.445 + trie1->data32!=NULL ? UTRIE2_32_VALUE_BITS : UTRIE2_16_VALUE_BITS, 1.446 + pErrorCode); 1.447 + } 1.448 +#ifdef UTRIE2_DEBUG 1.449 + if(U_SUCCESS(*pErrorCode)) { 1.450 + utrie_printLengths(trie1); 1.451 + utrie2_printLengths(context.trie, "fromUTrie"); 1.452 + } 1.453 +#endif 1.454 + if(U_FAILURE(*pErrorCode)) { 1.455 + utrie2_close(context.trie); 1.456 + context.trie=NULL; 1.457 + } 1.458 + return context.trie; 1.459 +} 1.460 + 1.461 +static inline UBool 1.462 +isInNullBlock(UNewTrie2 *trie, UChar32 c, UBool forLSCP) { 1.463 + int32_t i2, block; 1.464 + 1.465 + if(U_IS_LEAD(c) && forLSCP) { 1.466 + i2=(UTRIE2_LSCP_INDEX_2_OFFSET-(0xd800>>UTRIE2_SHIFT_2))+ 1.467 + (c>>UTRIE2_SHIFT_2); 1.468 + } else { 1.469 + i2=trie->index1[c>>UTRIE2_SHIFT_1]+ 1.470 + ((c>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK); 1.471 + } 1.472 + block=trie->index2[i2]; 1.473 + return (UBool)(block==trie->dataNullOffset); 1.474 +} 1.475 + 1.476 +static int32_t 1.477 +allocIndex2Block(UNewTrie2 *trie) { 1.478 + int32_t newBlock, newTop; 1.479 + 1.480 + newBlock=trie->index2Length; 1.481 + newTop=newBlock+UTRIE2_INDEX_2_BLOCK_LENGTH; 1.482 + if(newTop>LENGTHOF(trie->index2)) { 1.483 + /* 1.484 + * Should never occur. 1.485 + * Either UTRIE2_MAX_BUILD_TIME_INDEX_LENGTH is incorrect, 1.486 + * or the code writes more values than should be possible. 1.487 + */ 1.488 + return -1; 1.489 + } 1.490 + trie->index2Length=newTop; 1.491 + uprv_memcpy(trie->index2+newBlock, trie->index2+trie->index2NullOffset, UTRIE2_INDEX_2_BLOCK_LENGTH*4); 1.492 + return newBlock; 1.493 +} 1.494 + 1.495 +static int32_t 1.496 +getIndex2Block(UNewTrie2 *trie, UChar32 c, UBool forLSCP) { 1.497 + int32_t i1, i2; 1.498 + 1.499 + if(U_IS_LEAD(c) && forLSCP) { 1.500 + return UTRIE2_LSCP_INDEX_2_OFFSET; 1.501 + } 1.502 + 1.503 + i1=c>>UTRIE2_SHIFT_1; 1.504 + i2=trie->index1[i1]; 1.505 + if(i2==trie->index2NullOffset) { 1.506 + i2=allocIndex2Block(trie); 1.507 + if(i2<0) { 1.508 + return -1; /* program error */ 1.509 + } 1.510 + trie->index1[i1]=i2; 1.511 + } 1.512 + return i2; 1.513 +} 1.514 + 1.515 +static int32_t 1.516 +allocDataBlock(UNewTrie2 *trie, int32_t copyBlock) { 1.517 + int32_t newBlock, newTop; 1.518 + 1.519 + if(trie->firstFreeBlock!=0) { 1.520 + /* get the first free block */ 1.521 + newBlock=trie->firstFreeBlock; 1.522 + trie->firstFreeBlock=-trie->map[newBlock>>UTRIE2_SHIFT_2]; 1.523 + } else { 1.524 + /* get a new block from the high end */ 1.525 + newBlock=trie->dataLength; 1.526 + newTop=newBlock+UTRIE2_DATA_BLOCK_LENGTH; 1.527 + if(newTop>trie->dataCapacity) { 1.528 + /* out of memory in the data array */ 1.529 + int32_t capacity; 1.530 + uint32_t *data; 1.531 + 1.532 + if(trie->dataCapacity<UNEWTRIE2_MEDIUM_DATA_LENGTH) { 1.533 + capacity=UNEWTRIE2_MEDIUM_DATA_LENGTH; 1.534 + } else if(trie->dataCapacity<UNEWTRIE2_MAX_DATA_LENGTH) { 1.535 + capacity=UNEWTRIE2_MAX_DATA_LENGTH; 1.536 + } else { 1.537 + /* 1.538 + * Should never occur. 1.539 + * Either UNEWTRIE2_MAX_DATA_LENGTH is incorrect, 1.540 + * or the code writes more values than should be possible. 1.541 + */ 1.542 + return -1; 1.543 + } 1.544 + data=(uint32_t *)uprv_malloc(capacity*4); 1.545 + if(data==NULL) { 1.546 + return -1; 1.547 + } 1.548 + uprv_memcpy(data, trie->data, trie->dataLength*4); 1.549 + uprv_free(trie->data); 1.550 + trie->data=data; 1.551 + trie->dataCapacity=capacity; 1.552 + } 1.553 + trie->dataLength=newTop; 1.554 + } 1.555 + uprv_memcpy(trie->data+newBlock, trie->data+copyBlock, UTRIE2_DATA_BLOCK_LENGTH*4); 1.556 + trie->map[newBlock>>UTRIE2_SHIFT_2]=0; 1.557 + return newBlock; 1.558 +} 1.559 + 1.560 +/* call when the block's reference counter reaches 0 */ 1.561 +static void 1.562 +releaseDataBlock(UNewTrie2 *trie, int32_t block) { 1.563 + /* put this block at the front of the free-block chain */ 1.564 + trie->map[block>>UTRIE2_SHIFT_2]=-trie->firstFreeBlock; 1.565 + trie->firstFreeBlock=block; 1.566 +} 1.567 + 1.568 +static inline UBool 1.569 +isWritableBlock(UNewTrie2 *trie, int32_t block) { 1.570 + return (UBool)(block!=trie->dataNullOffset && 1==trie->map[block>>UTRIE2_SHIFT_2]); 1.571 +} 1.572 + 1.573 +static inline void 1.574 +setIndex2Entry(UNewTrie2 *trie, int32_t i2, int32_t block) { 1.575 + int32_t oldBlock; 1.576 + ++trie->map[block>>UTRIE2_SHIFT_2]; /* increment first, in case block==oldBlock! */ 1.577 + oldBlock=trie->index2[i2]; 1.578 + if(0 == --trie->map[oldBlock>>UTRIE2_SHIFT_2]) { 1.579 + releaseDataBlock(trie, oldBlock); 1.580 + } 1.581 + trie->index2[i2]=block; 1.582 +} 1.583 + 1.584 +/** 1.585 + * No error checking for illegal arguments. 1.586 + * 1.587 + * @return -1 if no new data block available (out of memory in data array) 1.588 + * @internal 1.589 + */ 1.590 +static int32_t 1.591 +getDataBlock(UNewTrie2 *trie, UChar32 c, UBool forLSCP) { 1.592 + int32_t i2, oldBlock, newBlock; 1.593 + 1.594 + i2=getIndex2Block(trie, c, forLSCP); 1.595 + if(i2<0) { 1.596 + return -1; /* program error */ 1.597 + } 1.598 + 1.599 + i2+=(c>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK; 1.600 + oldBlock=trie->index2[i2]; 1.601 + if(isWritableBlock(trie, oldBlock)) { 1.602 + return oldBlock; 1.603 + } 1.604 + 1.605 + /* allocate a new data block */ 1.606 + newBlock=allocDataBlock(trie, oldBlock); 1.607 + if(newBlock<0) { 1.608 + /* out of memory in the data array */ 1.609 + return -1; 1.610 + } 1.611 + setIndex2Entry(trie, i2, newBlock); 1.612 + return newBlock; 1.613 +} 1.614 + 1.615 +/** 1.616 + * @return TRUE if the value was successfully set 1.617 + */ 1.618 +static void 1.619 +set32(UNewTrie2 *trie, 1.620 + UChar32 c, UBool forLSCP, uint32_t value, 1.621 + UErrorCode *pErrorCode) { 1.622 + int32_t block; 1.623 + 1.624 + if(trie==NULL || trie->isCompacted) { 1.625 + *pErrorCode=U_NO_WRITE_PERMISSION; 1.626 + return; 1.627 + } 1.628 + 1.629 + block=getDataBlock(trie, c, forLSCP); 1.630 + if(block<0) { 1.631 + *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 1.632 + return; 1.633 + } 1.634 + 1.635 + trie->data[block+(c&UTRIE2_DATA_MASK)]=value; 1.636 +} 1.637 + 1.638 +U_CAPI void U_EXPORT2 1.639 +utrie2_set32(UTrie2 *trie, UChar32 c, uint32_t value, UErrorCode *pErrorCode) { 1.640 + if(U_FAILURE(*pErrorCode)) { 1.641 + return; 1.642 + } 1.643 + if((uint32_t)c>0x10ffff) { 1.644 + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 1.645 + return; 1.646 + } 1.647 + set32(trie->newTrie, c, TRUE, value, pErrorCode); 1.648 +} 1.649 + 1.650 +U_CAPI void U_EXPORT2 1.651 +utrie2_set32ForLeadSurrogateCodeUnit(UTrie2 *trie, 1.652 + UChar32 c, uint32_t value, 1.653 + UErrorCode *pErrorCode) { 1.654 + if(U_FAILURE(*pErrorCode)) { 1.655 + return; 1.656 + } 1.657 + if(!U_IS_LEAD(c)) { 1.658 + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 1.659 + return; 1.660 + } 1.661 + set32(trie->newTrie, c, FALSE, value, pErrorCode); 1.662 +} 1.663 + 1.664 +static void 1.665 +writeBlock(uint32_t *block, uint32_t value) { 1.666 + uint32_t *limit=block+UTRIE2_DATA_BLOCK_LENGTH; 1.667 + while(block<limit) { 1.668 + *block++=value; 1.669 + } 1.670 +} 1.671 + 1.672 +/** 1.673 + * initialValue is ignored if overwrite=TRUE 1.674 + * @internal 1.675 + */ 1.676 +static void 1.677 +fillBlock(uint32_t *block, UChar32 start, UChar32 limit, 1.678 + uint32_t value, uint32_t initialValue, UBool overwrite) { 1.679 + uint32_t *pLimit; 1.680 + 1.681 + pLimit=block+limit; 1.682 + block+=start; 1.683 + if(overwrite) { 1.684 + while(block<pLimit) { 1.685 + *block++=value; 1.686 + } 1.687 + } else { 1.688 + while(block<pLimit) { 1.689 + if(*block==initialValue) { 1.690 + *block=value; 1.691 + } 1.692 + ++block; 1.693 + } 1.694 + } 1.695 +} 1.696 + 1.697 +U_CAPI void U_EXPORT2 1.698 +utrie2_setRange32(UTrie2 *trie, 1.699 + UChar32 start, UChar32 end, 1.700 + uint32_t value, UBool overwrite, 1.701 + UErrorCode *pErrorCode) { 1.702 + /* 1.703 + * repeat value in [start..end] 1.704 + * mark index values for repeat-data blocks by setting bit 31 of the index values 1.705 + * fill around existing values if any, if(overwrite) 1.706 + */ 1.707 + UNewTrie2 *newTrie; 1.708 + int32_t block, rest, repeatBlock; 1.709 + UChar32 limit; 1.710 + 1.711 + if(U_FAILURE(*pErrorCode)) { 1.712 + return; 1.713 + } 1.714 + if((uint32_t)start>0x10ffff || (uint32_t)end>0x10ffff || start>end) { 1.715 + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 1.716 + return; 1.717 + } 1.718 + newTrie=trie->newTrie; 1.719 + if(newTrie==NULL || newTrie->isCompacted) { 1.720 + *pErrorCode=U_NO_WRITE_PERMISSION; 1.721 + return; 1.722 + } 1.723 + if(!overwrite && value==newTrie->initialValue) { 1.724 + return; /* nothing to do */ 1.725 + } 1.726 + 1.727 + limit=end+1; 1.728 + if(start&UTRIE2_DATA_MASK) { 1.729 + UChar32 nextStart; 1.730 + 1.731 + /* set partial block at [start..following block boundary[ */ 1.732 + block=getDataBlock(newTrie, start, TRUE); 1.733 + if(block<0) { 1.734 + *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 1.735 + return; 1.736 + } 1.737 + 1.738 + nextStart=(start+UTRIE2_DATA_BLOCK_LENGTH)&~UTRIE2_DATA_MASK; 1.739 + if(nextStart<=limit) { 1.740 + fillBlock(newTrie->data+block, start&UTRIE2_DATA_MASK, UTRIE2_DATA_BLOCK_LENGTH, 1.741 + value, newTrie->initialValue, overwrite); 1.742 + start=nextStart; 1.743 + } else { 1.744 + fillBlock(newTrie->data+block, start&UTRIE2_DATA_MASK, limit&UTRIE2_DATA_MASK, 1.745 + value, newTrie->initialValue, overwrite); 1.746 + return; 1.747 + } 1.748 + } 1.749 + 1.750 + /* number of positions in the last, partial block */ 1.751 + rest=limit&UTRIE2_DATA_MASK; 1.752 + 1.753 + /* round down limit to a block boundary */ 1.754 + limit&=~UTRIE2_DATA_MASK; 1.755 + 1.756 + /* iterate over all-value blocks */ 1.757 + if(value==newTrie->initialValue) { 1.758 + repeatBlock=newTrie->dataNullOffset; 1.759 + } else { 1.760 + repeatBlock=-1; 1.761 + } 1.762 + 1.763 + while(start<limit) { 1.764 + int32_t i2; 1.765 + UBool setRepeatBlock=FALSE; 1.766 + 1.767 + if(value==newTrie->initialValue && isInNullBlock(newTrie, start, TRUE)) { 1.768 + start+=UTRIE2_DATA_BLOCK_LENGTH; /* nothing to do */ 1.769 + continue; 1.770 + } 1.771 + 1.772 + /* get index value */ 1.773 + i2=getIndex2Block(newTrie, start, TRUE); 1.774 + if(i2<0) { 1.775 + *pErrorCode=U_INTERNAL_PROGRAM_ERROR; 1.776 + return; 1.777 + } 1.778 + i2+=(start>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK; 1.779 + block=newTrie->index2[i2]; 1.780 + if(isWritableBlock(newTrie, block)) { 1.781 + /* already allocated */ 1.782 + if(overwrite && block>=UNEWTRIE2_DATA_0800_OFFSET) { 1.783 + /* 1.784 + * We overwrite all values, and it's not a 1.785 + * protected (ASCII-linear or 2-byte UTF-8) block: 1.786 + * replace with the repeatBlock. 1.787 + */ 1.788 + setRepeatBlock=TRUE; 1.789 + } else { 1.790 + /* !overwrite, or protected block: just write the values into this block */ 1.791 + fillBlock(newTrie->data+block, 1.792 + 0, UTRIE2_DATA_BLOCK_LENGTH, 1.793 + value, newTrie->initialValue, overwrite); 1.794 + } 1.795 + } else if(newTrie->data[block]!=value && (overwrite || block==newTrie->dataNullOffset)) { 1.796 + /* 1.797 + * Set the repeatBlock instead of the null block or previous repeat block: 1.798 + * 1.799 + * If !isWritableBlock() then all entries in the block have the same value 1.800 + * because it's the null block or a range block (the repeatBlock from a previous 1.801 + * call to utrie2_setRange32()). 1.802 + * No other blocks are used multiple times before compacting. 1.803 + * 1.804 + * The null block is the only non-writable block with the initialValue because 1.805 + * of the repeatBlock initialization above. (If value==initialValue, then 1.806 + * the repeatBlock will be the null data block.) 1.807 + * 1.808 + * We set our repeatBlock if the desired value differs from the block's value, 1.809 + * and if we overwrite any data or if the data is all initial values 1.810 + * (which is the same as the block being the null block, see above). 1.811 + */ 1.812 + setRepeatBlock=TRUE; 1.813 + } 1.814 + if(setRepeatBlock) { 1.815 + if(repeatBlock>=0) { 1.816 + setIndex2Entry(newTrie, i2, repeatBlock); 1.817 + } else { 1.818 + /* create and set and fill the repeatBlock */ 1.819 + repeatBlock=getDataBlock(newTrie, start, TRUE); 1.820 + if(repeatBlock<0) { 1.821 + *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 1.822 + return; 1.823 + } 1.824 + writeBlock(newTrie->data+repeatBlock, value); 1.825 + } 1.826 + } 1.827 + 1.828 + start+=UTRIE2_DATA_BLOCK_LENGTH; 1.829 + } 1.830 + 1.831 + if(rest>0) { 1.832 + /* set partial block at [last block boundary..limit[ */ 1.833 + block=getDataBlock(newTrie, start, TRUE); 1.834 + if(block<0) { 1.835 + *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 1.836 + return; 1.837 + } 1.838 + 1.839 + fillBlock(newTrie->data+block, 0, rest, value, newTrie->initialValue, overwrite); 1.840 + } 1.841 + 1.842 + return; 1.843 +} 1.844 + 1.845 +/* compaction --------------------------------------------------------------- */ 1.846 + 1.847 +static inline UBool 1.848 +equal_int32(const int32_t *s, const int32_t *t, int32_t length) { 1.849 + while(length>0 && *s==*t) { 1.850 + ++s; 1.851 + ++t; 1.852 + --length; 1.853 + } 1.854 + return (UBool)(length==0); 1.855 +} 1.856 + 1.857 +static inline UBool 1.858 +equal_uint32(const uint32_t *s, const uint32_t *t, int32_t length) { 1.859 + while(length>0 && *s==*t) { 1.860 + ++s; 1.861 + ++t; 1.862 + --length; 1.863 + } 1.864 + return (UBool)(length==0); 1.865 +} 1.866 + 1.867 +static int32_t 1.868 +findSameIndex2Block(const int32_t *idx, int32_t index2Length, int32_t otherBlock) { 1.869 + int32_t block; 1.870 + 1.871 + /* ensure that we do not even partially get past index2Length */ 1.872 + index2Length-=UTRIE2_INDEX_2_BLOCK_LENGTH; 1.873 + 1.874 + for(block=0; block<=index2Length; ++block) { 1.875 + if(equal_int32(idx+block, idx+otherBlock, UTRIE2_INDEX_2_BLOCK_LENGTH)) { 1.876 + return block; 1.877 + } 1.878 + } 1.879 + return -1; 1.880 +} 1.881 + 1.882 +static int32_t 1.883 +findSameDataBlock(const uint32_t *data, int32_t dataLength, int32_t otherBlock, int32_t blockLength) { 1.884 + int32_t block; 1.885 + 1.886 + /* ensure that we do not even partially get past dataLength */ 1.887 + dataLength-=blockLength; 1.888 + 1.889 + for(block=0; block<=dataLength; block+=UTRIE2_DATA_GRANULARITY) { 1.890 + if(equal_uint32(data+block, data+otherBlock, blockLength)) { 1.891 + return block; 1.892 + } 1.893 + } 1.894 + return -1; 1.895 +} 1.896 + 1.897 +/* 1.898 + * Find the start of the last range in the trie by enumerating backward. 1.899 + * Indexes for supplementary code points higher than this will be omitted. 1.900 + */ 1.901 +static UChar32 1.902 +findHighStart(UNewTrie2 *trie, uint32_t highValue) { 1.903 + const uint32_t *data32; 1.904 + 1.905 + uint32_t value, initialValue; 1.906 + UChar32 c, prev; 1.907 + int32_t i1, i2, j, i2Block, prevI2Block, index2NullOffset, block, prevBlock, nullBlock; 1.908 + 1.909 + data32=trie->data; 1.910 + initialValue=trie->initialValue; 1.911 + 1.912 + index2NullOffset=trie->index2NullOffset; 1.913 + nullBlock=trie->dataNullOffset; 1.914 + 1.915 + /* set variables for previous range */ 1.916 + if(highValue==initialValue) { 1.917 + prevI2Block=index2NullOffset; 1.918 + prevBlock=nullBlock; 1.919 + } else { 1.920 + prevI2Block=-1; 1.921 + prevBlock=-1; 1.922 + } 1.923 + prev=0x110000; 1.924 + 1.925 + /* enumerate index-2 blocks */ 1.926 + i1=UNEWTRIE2_INDEX_1_LENGTH; 1.927 + c=prev; 1.928 + while(c>0) { 1.929 + i2Block=trie->index1[--i1]; 1.930 + if(i2Block==prevI2Block) { 1.931 + /* the index-2 block is the same as the previous one, and filled with highValue */ 1.932 + c-=UTRIE2_CP_PER_INDEX_1_ENTRY; 1.933 + continue; 1.934 + } 1.935 + prevI2Block=i2Block; 1.936 + if(i2Block==index2NullOffset) { 1.937 + /* this is the null index-2 block */ 1.938 + if(highValue!=initialValue) { 1.939 + return c; 1.940 + } 1.941 + c-=UTRIE2_CP_PER_INDEX_1_ENTRY; 1.942 + } else { 1.943 + /* enumerate data blocks for one index-2 block */ 1.944 + for(i2=UTRIE2_INDEX_2_BLOCK_LENGTH; i2>0;) { 1.945 + block=trie->index2[i2Block+ --i2]; 1.946 + if(block==prevBlock) { 1.947 + /* the block is the same as the previous one, and filled with highValue */ 1.948 + c-=UTRIE2_DATA_BLOCK_LENGTH; 1.949 + continue; 1.950 + } 1.951 + prevBlock=block; 1.952 + if(block==nullBlock) { 1.953 + /* this is the null data block */ 1.954 + if(highValue!=initialValue) { 1.955 + return c; 1.956 + } 1.957 + c-=UTRIE2_DATA_BLOCK_LENGTH; 1.958 + } else { 1.959 + for(j=UTRIE2_DATA_BLOCK_LENGTH; j>0;) { 1.960 + value=data32[block+ --j]; 1.961 + if(value!=highValue) { 1.962 + return c; 1.963 + } 1.964 + --c; 1.965 + } 1.966 + } 1.967 + } 1.968 + } 1.969 + } 1.970 + 1.971 + /* deliver last range */ 1.972 + return 0; 1.973 +} 1.974 + 1.975 +/* 1.976 + * Compact a build-time trie. 1.977 + * 1.978 + * The compaction 1.979 + * - removes blocks that are identical with earlier ones 1.980 + * - overlaps adjacent blocks as much as possible (if overlap==TRUE) 1.981 + * - moves blocks in steps of the data granularity 1.982 + * - moves and overlaps blocks that overlap with multiple values in the overlap region 1.983 + * 1.984 + * It does not 1.985 + * - try to move and overlap blocks that are not already adjacent 1.986 + */ 1.987 +static void 1.988 +compactData(UNewTrie2 *trie) { 1.989 + int32_t start, newStart, movedStart; 1.990 + int32_t blockLength, overlap; 1.991 + int32_t i, mapIndex, blockCount; 1.992 + 1.993 + /* do not compact linear-ASCII data */ 1.994 + newStart=UTRIE2_DATA_START_OFFSET; 1.995 + for(start=0, i=0; start<newStart; start+=UTRIE2_DATA_BLOCK_LENGTH, ++i) { 1.996 + trie->map[i]=start; 1.997 + } 1.998 + 1.999 + /* 1.1000 + * Start with a block length of 64 for 2-byte UTF-8, 1.1001 + * then switch to UTRIE2_DATA_BLOCK_LENGTH. 1.1002 + */ 1.1003 + blockLength=64; 1.1004 + blockCount=blockLength>>UTRIE2_SHIFT_2; 1.1005 + for(start=newStart; start<trie->dataLength;) { 1.1006 + /* 1.1007 + * start: index of first entry of current block 1.1008 + * newStart: index where the current block is to be moved 1.1009 + * (right after current end of already-compacted data) 1.1010 + */ 1.1011 + if(start==UNEWTRIE2_DATA_0800_OFFSET) { 1.1012 + blockLength=UTRIE2_DATA_BLOCK_LENGTH; 1.1013 + blockCount=1; 1.1014 + } 1.1015 + 1.1016 + /* skip blocks that are not used */ 1.1017 + if(trie->map[start>>UTRIE2_SHIFT_2]<=0) { 1.1018 + /* advance start to the next block */ 1.1019 + start+=blockLength; 1.1020 + 1.1021 + /* leave newStart with the previous block! */ 1.1022 + continue; 1.1023 + } 1.1024 + 1.1025 + /* search for an identical block */ 1.1026 + if( (movedStart=findSameDataBlock(trie->data, newStart, start, blockLength)) 1.1027 + >=0 1.1028 + ) { 1.1029 + /* found an identical block, set the other block's index value for the current block */ 1.1030 + for(i=blockCount, mapIndex=start>>UTRIE2_SHIFT_2; i>0; --i) { 1.1031 + trie->map[mapIndex++]=movedStart; 1.1032 + movedStart+=UTRIE2_DATA_BLOCK_LENGTH; 1.1033 + } 1.1034 + 1.1035 + /* advance start to the next block */ 1.1036 + start+=blockLength; 1.1037 + 1.1038 + /* leave newStart with the previous block! */ 1.1039 + continue; 1.1040 + } 1.1041 + 1.1042 + /* see if the beginning of this block can be overlapped with the end of the previous block */ 1.1043 + /* look for maximum overlap (modulo granularity) with the previous, adjacent block */ 1.1044 + for(overlap=blockLength-UTRIE2_DATA_GRANULARITY; 1.1045 + overlap>0 && !equal_uint32(trie->data+(newStart-overlap), trie->data+start, overlap); 1.1046 + overlap-=UTRIE2_DATA_GRANULARITY) {} 1.1047 + 1.1048 + if(overlap>0 || newStart<start) { 1.1049 + /* some overlap, or just move the whole block */ 1.1050 + movedStart=newStart-overlap; 1.1051 + for(i=blockCount, mapIndex=start>>UTRIE2_SHIFT_2; i>0; --i) { 1.1052 + trie->map[mapIndex++]=movedStart; 1.1053 + movedStart+=UTRIE2_DATA_BLOCK_LENGTH; 1.1054 + } 1.1055 + 1.1056 + /* move the non-overlapping indexes to their new positions */ 1.1057 + start+=overlap; 1.1058 + for(i=blockLength-overlap; i>0; --i) { 1.1059 + trie->data[newStart++]=trie->data[start++]; 1.1060 + } 1.1061 + } else /* no overlap && newStart==start */ { 1.1062 + for(i=blockCount, mapIndex=start>>UTRIE2_SHIFT_2; i>0; --i) { 1.1063 + trie->map[mapIndex++]=start; 1.1064 + start+=UTRIE2_DATA_BLOCK_LENGTH; 1.1065 + } 1.1066 + newStart=start; 1.1067 + } 1.1068 + } 1.1069 + 1.1070 + /* now adjust the index-2 table */ 1.1071 + for(i=0; i<trie->index2Length; ++i) { 1.1072 + if(i==UNEWTRIE2_INDEX_GAP_OFFSET) { 1.1073 + /* Gap indexes are invalid (-1). Skip over the gap. */ 1.1074 + i+=UNEWTRIE2_INDEX_GAP_LENGTH; 1.1075 + } 1.1076 + trie->index2[i]=trie->map[trie->index2[i]>>UTRIE2_SHIFT_2]; 1.1077 + } 1.1078 + trie->dataNullOffset=trie->map[trie->dataNullOffset>>UTRIE2_SHIFT_2]; 1.1079 + 1.1080 + /* ensure dataLength alignment */ 1.1081 + while((newStart&(UTRIE2_DATA_GRANULARITY-1))!=0) { 1.1082 + trie->data[newStart++]=trie->initialValue; 1.1083 + } 1.1084 + 1.1085 +#ifdef UTRIE2_DEBUG 1.1086 + /* we saved some space */ 1.1087 + printf("compacting UTrie2: count of 32-bit data words %lu->%lu\n", 1.1088 + (long)trie->dataLength, (long)newStart); 1.1089 +#endif 1.1090 + 1.1091 + trie->dataLength=newStart; 1.1092 +} 1.1093 + 1.1094 +static void 1.1095 +compactIndex2(UNewTrie2 *trie) { 1.1096 + int32_t i, start, newStart, movedStart, overlap; 1.1097 + 1.1098 + /* do not compact linear-BMP index-2 blocks */ 1.1099 + newStart=UTRIE2_INDEX_2_BMP_LENGTH; 1.1100 + for(start=0, i=0; start<newStart; start+=UTRIE2_INDEX_2_BLOCK_LENGTH, ++i) { 1.1101 + trie->map[i]=start; 1.1102 + } 1.1103 + 1.1104 + /* Reduce the index table gap to what will be needed at runtime. */ 1.1105 + newStart+=UTRIE2_UTF8_2B_INDEX_2_LENGTH+((trie->highStart-0x10000)>>UTRIE2_SHIFT_1); 1.1106 + 1.1107 + for(start=UNEWTRIE2_INDEX_2_NULL_OFFSET; start<trie->index2Length;) { 1.1108 + /* 1.1109 + * start: index of first entry of current block 1.1110 + * newStart: index where the current block is to be moved 1.1111 + * (right after current end of already-compacted data) 1.1112 + */ 1.1113 + 1.1114 + /* search for an identical block */ 1.1115 + if( (movedStart=findSameIndex2Block(trie->index2, newStart, start)) 1.1116 + >=0 1.1117 + ) { 1.1118 + /* found an identical block, set the other block's index value for the current block */ 1.1119 + trie->map[start>>UTRIE2_SHIFT_1_2]=movedStart; 1.1120 + 1.1121 + /* advance start to the next block */ 1.1122 + start+=UTRIE2_INDEX_2_BLOCK_LENGTH; 1.1123 + 1.1124 + /* leave newStart with the previous block! */ 1.1125 + continue; 1.1126 + } 1.1127 + 1.1128 + /* see if the beginning of this block can be overlapped with the end of the previous block */ 1.1129 + /* look for maximum overlap with the previous, adjacent block */ 1.1130 + for(overlap=UTRIE2_INDEX_2_BLOCK_LENGTH-1; 1.1131 + overlap>0 && !equal_int32(trie->index2+(newStart-overlap), trie->index2+start, overlap); 1.1132 + --overlap) {} 1.1133 + 1.1134 + if(overlap>0 || newStart<start) { 1.1135 + /* some overlap, or just move the whole block */ 1.1136 + trie->map[start>>UTRIE2_SHIFT_1_2]=newStart-overlap; 1.1137 + 1.1138 + /* move the non-overlapping indexes to their new positions */ 1.1139 + start+=overlap; 1.1140 + for(i=UTRIE2_INDEX_2_BLOCK_LENGTH-overlap; i>0; --i) { 1.1141 + trie->index2[newStart++]=trie->index2[start++]; 1.1142 + } 1.1143 + } else /* no overlap && newStart==start */ { 1.1144 + trie->map[start>>UTRIE2_SHIFT_1_2]=start; 1.1145 + start+=UTRIE2_INDEX_2_BLOCK_LENGTH; 1.1146 + newStart=start; 1.1147 + } 1.1148 + } 1.1149 + 1.1150 + /* now adjust the index-1 table */ 1.1151 + for(i=0; i<UNEWTRIE2_INDEX_1_LENGTH; ++i) { 1.1152 + trie->index1[i]=trie->map[trie->index1[i]>>UTRIE2_SHIFT_1_2]; 1.1153 + } 1.1154 + trie->index2NullOffset=trie->map[trie->index2NullOffset>>UTRIE2_SHIFT_1_2]; 1.1155 + 1.1156 + /* 1.1157 + * Ensure data table alignment: 1.1158 + * Needs to be granularity-aligned for 16-bit trie 1.1159 + * (so that dataMove will be down-shiftable), 1.1160 + * and 2-aligned for uint32_t data. 1.1161 + */ 1.1162 + while((newStart&((UTRIE2_DATA_GRANULARITY-1)|1))!=0) { 1.1163 + /* Arbitrary value: 0x3fffc not possible for real data. */ 1.1164 + trie->index2[newStart++]=(int32_t)0xffff<<UTRIE2_INDEX_SHIFT; 1.1165 + } 1.1166 + 1.1167 +#ifdef UTRIE2_DEBUG 1.1168 + /* we saved some space */ 1.1169 + printf("compacting UTrie2: count of 16-bit index-2 words %lu->%lu\n", 1.1170 + (long)trie->index2Length, (long)newStart); 1.1171 +#endif 1.1172 + 1.1173 + trie->index2Length=newStart; 1.1174 +} 1.1175 + 1.1176 +static void 1.1177 +compactTrie(UTrie2 *trie, UErrorCode *pErrorCode) { 1.1178 + UNewTrie2 *newTrie; 1.1179 + UChar32 highStart, suppHighStart; 1.1180 + uint32_t highValue; 1.1181 + 1.1182 + newTrie=trie->newTrie; 1.1183 + 1.1184 + /* find highStart and round it up */ 1.1185 + highValue=utrie2_get32(trie, 0x10ffff); 1.1186 + highStart=findHighStart(newTrie, highValue); 1.1187 + highStart=(highStart+(UTRIE2_CP_PER_INDEX_1_ENTRY-1))&~(UTRIE2_CP_PER_INDEX_1_ENTRY-1); 1.1188 + if(highStart==0x110000) { 1.1189 + highValue=trie->errorValue; 1.1190 + } 1.1191 + 1.1192 + /* 1.1193 + * Set trie->highStart only after utrie2_get32(trie, highStart). 1.1194 + * Otherwise utrie2_get32(trie, highStart) would try to read the highValue. 1.1195 + */ 1.1196 + trie->highStart=newTrie->highStart=highStart; 1.1197 + 1.1198 +#ifdef UTRIE2_DEBUG 1.1199 + printf("UTrie2: highStart U+%04lx highValue 0x%lx initialValue 0x%lx\n", 1.1200 + (long)highStart, (long)highValue, (long)trie->initialValue); 1.1201 +#endif 1.1202 + 1.1203 + if(highStart<0x110000) { 1.1204 + /* Blank out [highStart..10ffff] to release associated data blocks. */ 1.1205 + suppHighStart= highStart<=0x10000 ? 0x10000 : highStart; 1.1206 + utrie2_setRange32(trie, suppHighStart, 0x10ffff, trie->initialValue, TRUE, pErrorCode); 1.1207 + if(U_FAILURE(*pErrorCode)) { 1.1208 + return; 1.1209 + } 1.1210 + } 1.1211 + 1.1212 + compactData(newTrie); 1.1213 + if(highStart>0x10000) { 1.1214 + compactIndex2(newTrie); 1.1215 +#ifdef UTRIE2_DEBUG 1.1216 + } else { 1.1217 + printf("UTrie2: highStart U+%04lx count of 16-bit index-2 words %lu->%lu\n", 1.1218 + (long)highStart, (long)trie->newTrie->index2Length, (long)UTRIE2_INDEX_1_OFFSET); 1.1219 +#endif 1.1220 + } 1.1221 + 1.1222 + /* 1.1223 + * Store the highValue in the data array and round up the dataLength. 1.1224 + * Must be done after compactData() because that assumes that dataLength 1.1225 + * is a multiple of UTRIE2_DATA_BLOCK_LENGTH. 1.1226 + */ 1.1227 + newTrie->data[newTrie->dataLength++]=highValue; 1.1228 + while((newTrie->dataLength&(UTRIE2_DATA_GRANULARITY-1))!=0) { 1.1229 + newTrie->data[newTrie->dataLength++]=trie->initialValue; 1.1230 + } 1.1231 + 1.1232 + newTrie->isCompacted=TRUE; 1.1233 +} 1.1234 + 1.1235 +/* serialization ------------------------------------------------------------ */ 1.1236 + 1.1237 +/** 1.1238 + * Maximum length of the runtime index array. 1.1239 + * Limited by its own 16-bit index values, and by uint16_t UTrie2Header.indexLength. 1.1240 + * (The actual maximum length is lower, 1.1241 + * (0x110000>>UTRIE2_SHIFT_2)+UTRIE2_UTF8_2B_INDEX_2_LENGTH+UTRIE2_MAX_INDEX_1_LENGTH.) 1.1242 + */ 1.1243 +#define UTRIE2_MAX_INDEX_LENGTH 0xffff 1.1244 + 1.1245 +/** 1.1246 + * Maximum length of the runtime data array. 1.1247 + * Limited by 16-bit index values that are left-shifted by UTRIE2_INDEX_SHIFT, 1.1248 + * and by uint16_t UTrie2Header.shiftedDataLength. 1.1249 + */ 1.1250 +#define UTRIE2_MAX_DATA_LENGTH (0xffff<<UTRIE2_INDEX_SHIFT) 1.1251 + 1.1252 +/* Compact and internally serialize the trie. */ 1.1253 +U_CAPI void U_EXPORT2 1.1254 +utrie2_freeze(UTrie2 *trie, UTrie2ValueBits valueBits, UErrorCode *pErrorCode) { 1.1255 + UNewTrie2 *newTrie; 1.1256 + UTrie2Header *header; 1.1257 + uint32_t *p; 1.1258 + uint16_t *dest16; 1.1259 + int32_t i, length; 1.1260 + int32_t allIndexesLength; 1.1261 + int32_t dataMove; /* >0 if the data is moved to the end of the index array */ 1.1262 + UChar32 highStart; 1.1263 + 1.1264 + /* argument check */ 1.1265 + if(U_FAILURE(*pErrorCode)) { 1.1266 + return; 1.1267 + } 1.1268 + if( trie==NULL || 1.1269 + valueBits<0 || UTRIE2_COUNT_VALUE_BITS<=valueBits 1.1270 + ) { 1.1271 + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 1.1272 + return; 1.1273 + } 1.1274 + newTrie=trie->newTrie; 1.1275 + if(newTrie==NULL) { 1.1276 + /* already frozen */ 1.1277 + UTrie2ValueBits frozenValueBits= 1.1278 + trie->data16!=NULL ? UTRIE2_16_VALUE_BITS : UTRIE2_32_VALUE_BITS; 1.1279 + if(valueBits!=frozenValueBits) { 1.1280 + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 1.1281 + } 1.1282 + return; 1.1283 + } 1.1284 + 1.1285 + /* compact if necessary */ 1.1286 + if(!newTrie->isCompacted) { 1.1287 + compactTrie(trie, pErrorCode); 1.1288 + if(U_FAILURE(*pErrorCode)) { 1.1289 + return; 1.1290 + } 1.1291 + } 1.1292 + highStart=trie->highStart; 1.1293 + 1.1294 + if(highStart<=0x10000) { 1.1295 + allIndexesLength=UTRIE2_INDEX_1_OFFSET; 1.1296 + } else { 1.1297 + allIndexesLength=newTrie->index2Length; 1.1298 + } 1.1299 + if(valueBits==UTRIE2_16_VALUE_BITS) { 1.1300 + dataMove=allIndexesLength; 1.1301 + } else { 1.1302 + dataMove=0; 1.1303 + } 1.1304 + 1.1305 + /* are indexLength and dataLength within limits? */ 1.1306 + if( /* for unshifted indexLength */ 1.1307 + allIndexesLength>UTRIE2_MAX_INDEX_LENGTH || 1.1308 + /* for unshifted dataNullOffset */ 1.1309 + (dataMove+newTrie->dataNullOffset)>0xffff || 1.1310 + /* for unshifted 2-byte UTF-8 index-2 values */ 1.1311 + (dataMove+UNEWTRIE2_DATA_0800_OFFSET)>0xffff || 1.1312 + /* for shiftedDataLength */ 1.1313 + (dataMove+newTrie->dataLength)>UTRIE2_MAX_DATA_LENGTH 1.1314 + ) { 1.1315 + *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 1.1316 + return; 1.1317 + } 1.1318 + 1.1319 + /* calculate the total serialized length */ 1.1320 + length=sizeof(UTrie2Header)+allIndexesLength*2; 1.1321 + if(valueBits==UTRIE2_16_VALUE_BITS) { 1.1322 + length+=newTrie->dataLength*2; 1.1323 + } else { 1.1324 + length+=newTrie->dataLength*4; 1.1325 + } 1.1326 + 1.1327 + trie->memory=uprv_malloc(length); 1.1328 + if(trie->memory==NULL) { 1.1329 + *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 1.1330 + return; 1.1331 + } 1.1332 + trie->length=length; 1.1333 + trie->isMemoryOwned=TRUE; 1.1334 + 1.1335 + trie->indexLength=allIndexesLength; 1.1336 + trie->dataLength=newTrie->dataLength; 1.1337 + if(highStart<=0x10000) { 1.1338 + trie->index2NullOffset=0xffff; 1.1339 + } else { 1.1340 + trie->index2NullOffset=UTRIE2_INDEX_2_OFFSET+newTrie->index2NullOffset; 1.1341 + } 1.1342 + trie->dataNullOffset=(uint16_t)(dataMove+newTrie->dataNullOffset); 1.1343 + trie->highValueIndex=dataMove+trie->dataLength-UTRIE2_DATA_GRANULARITY; 1.1344 + 1.1345 + /* set the header fields */ 1.1346 + header=(UTrie2Header *)trie->memory; 1.1347 + 1.1348 + header->signature=UTRIE2_SIG; /* "Tri2" */ 1.1349 + header->options=(uint16_t)valueBits; 1.1350 + 1.1351 + header->indexLength=(uint16_t)trie->indexLength; 1.1352 + header->shiftedDataLength=(uint16_t)(trie->dataLength>>UTRIE2_INDEX_SHIFT); 1.1353 + header->index2NullOffset=trie->index2NullOffset; 1.1354 + header->dataNullOffset=trie->dataNullOffset; 1.1355 + header->shiftedHighStart=(uint16_t)(highStart>>UTRIE2_SHIFT_1); 1.1356 + 1.1357 + /* fill the index and data arrays */ 1.1358 + dest16=(uint16_t *)(header+1); 1.1359 + trie->index=dest16; 1.1360 + 1.1361 + /* write the index-2 array values shifted right by UTRIE2_INDEX_SHIFT, after adding dataMove */ 1.1362 + p=(uint32_t *)newTrie->index2; 1.1363 + for(i=UTRIE2_INDEX_2_BMP_LENGTH; i>0; --i) { 1.1364 + *dest16++=(uint16_t)((dataMove + *p++)>>UTRIE2_INDEX_SHIFT); 1.1365 + } 1.1366 + 1.1367 + /* write UTF-8 2-byte index-2 values, not right-shifted */ 1.1368 + for(i=0; i<(0xc2-0xc0); ++i) { /* C0..C1 */ 1.1369 + *dest16++=(uint16_t)(dataMove+UTRIE2_BAD_UTF8_DATA_OFFSET); 1.1370 + } 1.1371 + for(; i<(0xe0-0xc0); ++i) { /* C2..DF */ 1.1372 + *dest16++=(uint16_t)(dataMove+newTrie->index2[i<<(6-UTRIE2_SHIFT_2)]); 1.1373 + } 1.1374 + 1.1375 + if(highStart>0x10000) { 1.1376 + int32_t index1Length=(highStart-0x10000)>>UTRIE2_SHIFT_1; 1.1377 + int32_t index2Offset=UTRIE2_INDEX_2_BMP_LENGTH+UTRIE2_UTF8_2B_INDEX_2_LENGTH+index1Length; 1.1378 + 1.1379 + /* write 16-bit index-1 values for supplementary code points */ 1.1380 + p=(uint32_t *)newTrie->index1+UTRIE2_OMITTED_BMP_INDEX_1_LENGTH; 1.1381 + for(i=index1Length; i>0; --i) { 1.1382 + *dest16++=(uint16_t)(UTRIE2_INDEX_2_OFFSET + *p++); 1.1383 + } 1.1384 + 1.1385 + /* 1.1386 + * write the index-2 array values for supplementary code points, 1.1387 + * shifted right by UTRIE2_INDEX_SHIFT, after adding dataMove 1.1388 + */ 1.1389 + p=(uint32_t *)newTrie->index2+index2Offset; 1.1390 + for(i=newTrie->index2Length-index2Offset; i>0; --i) { 1.1391 + *dest16++=(uint16_t)((dataMove + *p++)>>UTRIE2_INDEX_SHIFT); 1.1392 + } 1.1393 + } 1.1394 + 1.1395 + /* write the 16/32-bit data array */ 1.1396 + switch(valueBits) { 1.1397 + case UTRIE2_16_VALUE_BITS: 1.1398 + /* write 16-bit data values */ 1.1399 + trie->data16=dest16; 1.1400 + trie->data32=NULL; 1.1401 + p=newTrie->data; 1.1402 + for(i=newTrie->dataLength; i>0; --i) { 1.1403 + *dest16++=(uint16_t)*p++; 1.1404 + } 1.1405 + break; 1.1406 + case UTRIE2_32_VALUE_BITS: 1.1407 + /* write 32-bit data values */ 1.1408 + trie->data16=NULL; 1.1409 + trie->data32=(uint32_t *)dest16; 1.1410 + uprv_memcpy(dest16, newTrie->data, newTrie->dataLength*4); 1.1411 + break; 1.1412 + default: 1.1413 + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 1.1414 + return; 1.1415 + } 1.1416 + 1.1417 + /* Delete the UNewTrie2. */ 1.1418 + uprv_free(newTrie->data); 1.1419 + uprv_free(newTrie); 1.1420 + trie->newTrie=NULL; 1.1421 +} 1.1422 + 1.1423 +U_CAPI UBool U_EXPORT2 1.1424 +utrie2_isFrozen(const UTrie2 *trie) { 1.1425 + return (UBool)(trie->newTrie==NULL); 1.1426 +} 1.1427 + 1.1428 +U_CAPI int32_t U_EXPORT2 1.1429 +utrie2_serialize(UTrie2 *trie, 1.1430 + void *data, int32_t capacity, 1.1431 + UErrorCode *pErrorCode) { 1.1432 + /* argument check */ 1.1433 + if(U_FAILURE(*pErrorCode)) { 1.1434 + return 0; 1.1435 + } 1.1436 + 1.1437 + if( trie==NULL || trie->memory==NULL || trie->newTrie!=NULL || 1.1438 + capacity<0 || (capacity>0 && (data==NULL || (U_POINTER_MASK_LSB(data, 3)!=0))) 1.1439 + ) { 1.1440 + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 1.1441 + return 0; 1.1442 + } 1.1443 + 1.1444 + if(capacity>=trie->length) { 1.1445 + uprv_memcpy(data, trie->memory, trie->length); 1.1446 + } else { 1.1447 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.1448 + } 1.1449 + return trie->length; 1.1450 +} 1.1451 + 1.1452 +/* 1.1453 + * This is here to avoid a dependency from utrie2.cpp on utrie.c. 1.1454 + * This file already depends on utrie.c. 1.1455 + * Otherwise, this should be in utrie2.cpp right after utrie2_swap(). 1.1456 + */ 1.1457 +U_CAPI int32_t U_EXPORT2 1.1458 +utrie2_swapAnyVersion(const UDataSwapper *ds, 1.1459 + const void *inData, int32_t length, void *outData, 1.1460 + UErrorCode *pErrorCode) { 1.1461 + if(U_SUCCESS(*pErrorCode)) { 1.1462 + switch(utrie2_getVersion(inData, length, TRUE)) { 1.1463 + case 1: 1.1464 + return utrie_swap(ds, inData, length, outData, pErrorCode); 1.1465 + case 2: 1.1466 + return utrie2_swap(ds, inData, length, outData, pErrorCode); 1.1467 + default: 1.1468 + *pErrorCode=U_INVALID_FORMAT_ERROR; 1.1469 + return 0; 1.1470 + } 1.1471 + } 1.1472 + return 0; 1.1473 +}