michael@0: /* michael@0: ****************************************************************************** michael@0: * michael@0: * Copyright (C) 2001-2011, International Business Machines michael@0: * Corporation and others. All Rights Reserved. michael@0: * michael@0: ****************************************************************************** michael@0: * file name: utrie2_builder.cpp michael@0: * encoding: US-ASCII michael@0: * tab size: 8 (not used) michael@0: * indentation:4 michael@0: * michael@0: * created on: 2008sep26 (split off from utrie2.c) michael@0: * created by: Markus W. Scherer michael@0: * michael@0: * This is a common implementation of a Unicode trie. michael@0: * It is a kind of compressed, serializable table of 16- or 32-bit values associated with michael@0: * Unicode code points (0..0x10ffff). michael@0: * This is the second common version of a Unicode trie (hence the name UTrie2). michael@0: * See utrie2.h for a comparison. michael@0: * michael@0: * This file contains only the builder code. michael@0: * See utrie2.c for the runtime and enumeration code. michael@0: */ michael@0: #ifdef UTRIE2_DEBUG michael@0: # include michael@0: #endif michael@0: michael@0: #include "unicode/utypes.h" michael@0: #include "cmemory.h" michael@0: #include "utrie2.h" michael@0: #include "utrie2_impl.h" michael@0: michael@0: #include "utrie.h" /* for utrie2_fromUTrie() and utrie_swap() */ michael@0: michael@0: #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) michael@0: michael@0: /* Implementation notes ----------------------------------------------------- */ michael@0: michael@0: /* michael@0: * The UTRIE2_SHIFT_1, UTRIE2_SHIFT_2, UTRIE2_INDEX_SHIFT and other values michael@0: * have been chosen to minimize trie sizes overall. michael@0: * Most of the code is flexible enough to work with a range of values, michael@0: * within certain limits. michael@0: * michael@0: * Exception: Support for separate values for lead surrogate code _units_ michael@0: * vs. code _points_ was added after the constants were fixed, michael@0: * and has not been tested nor particularly designed for different constant values. michael@0: * (Especially the utrie2_enum() code that jumps to the special LSCP index-2 michael@0: * part and back.) michael@0: * michael@0: * Requires UTRIE2_SHIFT_2<=6. Otherwise 0xc0 which is the top of the ASCII-linear data michael@0: * including the bad-UTF-8-data block is not a multiple of UTRIE2_DATA_BLOCK_LENGTH michael@0: * and map[block>>UTRIE2_SHIFT_2] (used in reference counting and compaction michael@0: * remapping) stops working. michael@0: * michael@0: * Requires UTRIE2_SHIFT_1>=10 because utrie2_enumForLeadSurrogate() michael@0: * assumes that a single index-2 block is used for 0x400 code points michael@0: * corresponding to one lead surrogate. michael@0: * michael@0: * Requires UTRIE2_SHIFT_1<=16. Otherwise one single index-2 block contains michael@0: * more than one Unicode plane, and the split of the index-2 table into a BMP michael@0: * part and a supplementary part, with a gap in between, would not work. michael@0: * michael@0: * Requires UTRIE2_INDEX_SHIFT>=1 not because of the code but because michael@0: * there is data with more than 64k distinct values, michael@0: * for example for Unihan collation with a separate collation weight per michael@0: * Han character. michael@0: */ michael@0: michael@0: /* Building a trie ----------------------------------------------------------*/ michael@0: michael@0: enum { michael@0: /** The null index-2 block, following the gap in the index-2 table. */ michael@0: UNEWTRIE2_INDEX_2_NULL_OFFSET=UNEWTRIE2_INDEX_GAP_OFFSET+UNEWTRIE2_INDEX_GAP_LENGTH, michael@0: michael@0: /** The start of allocated index-2 blocks. */ michael@0: UNEWTRIE2_INDEX_2_START_OFFSET=UNEWTRIE2_INDEX_2_NULL_OFFSET+UTRIE2_INDEX_2_BLOCK_LENGTH, michael@0: michael@0: /** michael@0: * The null data block. michael@0: * Length 64=0x40 even if UTRIE2_DATA_BLOCK_LENGTH is smaller, michael@0: * to work with 6-bit trail bytes from 2-byte UTF-8. michael@0: */ michael@0: UNEWTRIE2_DATA_NULL_OFFSET=UTRIE2_DATA_START_OFFSET, michael@0: michael@0: /** The start of allocated data blocks. */ michael@0: UNEWTRIE2_DATA_START_OFFSET=UNEWTRIE2_DATA_NULL_OFFSET+0x40, michael@0: michael@0: /** michael@0: * The start of data blocks for U+0800 and above. michael@0: * Below, compaction uses a block length of 64 for 2-byte UTF-8. michael@0: * From here on, compaction uses UTRIE2_DATA_BLOCK_LENGTH. michael@0: * Data values for 0x780 code points beyond ASCII. michael@0: */ michael@0: UNEWTRIE2_DATA_0800_OFFSET=UNEWTRIE2_DATA_START_OFFSET+0x780 michael@0: }; michael@0: michael@0: /* Start with allocation of 16k data entries. */ michael@0: #define UNEWTRIE2_INITIAL_DATA_LENGTH ((int32_t)1<<14) michael@0: michael@0: /* Grow about 8x each time. */ michael@0: #define UNEWTRIE2_MEDIUM_DATA_LENGTH ((int32_t)1<<17) michael@0: michael@0: static int32_t michael@0: allocIndex2Block(UNewTrie2 *trie); michael@0: michael@0: U_CAPI UTrie2 * U_EXPORT2 michael@0: utrie2_open(uint32_t initialValue, uint32_t errorValue, UErrorCode *pErrorCode) { michael@0: UTrie2 *trie; michael@0: UNewTrie2 *newTrie; michael@0: uint32_t *data; michael@0: int32_t i, j; michael@0: michael@0: if(U_FAILURE(*pErrorCode)) { michael@0: return NULL; michael@0: } michael@0: michael@0: trie=(UTrie2 *)uprv_malloc(sizeof(UTrie2)); michael@0: newTrie=(UNewTrie2 *)uprv_malloc(sizeof(UNewTrie2)); michael@0: data=(uint32_t *)uprv_malloc(UNEWTRIE2_INITIAL_DATA_LENGTH*4); michael@0: if(trie==NULL || newTrie==NULL || data==NULL) { michael@0: uprv_free(trie); michael@0: uprv_free(newTrie); michael@0: uprv_free(data); michael@0: *pErrorCode=U_MEMORY_ALLOCATION_ERROR; michael@0: return 0; michael@0: } michael@0: michael@0: uprv_memset(trie, 0, sizeof(UTrie2)); michael@0: trie->initialValue=initialValue; michael@0: trie->errorValue=errorValue; michael@0: trie->highStart=0x110000; michael@0: trie->newTrie=newTrie; michael@0: michael@0: newTrie->data=data; michael@0: newTrie->dataCapacity=UNEWTRIE2_INITIAL_DATA_LENGTH; michael@0: newTrie->initialValue=initialValue; michael@0: newTrie->errorValue=errorValue; michael@0: newTrie->highStart=0x110000; michael@0: newTrie->firstFreeBlock=0; /* no free block in the list */ michael@0: newTrie->isCompacted=FALSE; michael@0: michael@0: /* michael@0: * preallocate and reset michael@0: * - ASCII michael@0: * - the bad-UTF-8-data block michael@0: * - the null data block michael@0: */ michael@0: for(i=0; i<0x80; ++i) { michael@0: newTrie->data[i]=initialValue; michael@0: } michael@0: for(; i<0xc0; ++i) { michael@0: newTrie->data[i]=errorValue; michael@0: } michael@0: for(i=UNEWTRIE2_DATA_NULL_OFFSET; idata[i]=initialValue; michael@0: } michael@0: newTrie->dataNullOffset=UNEWTRIE2_DATA_NULL_OFFSET; michael@0: newTrie->dataLength=UNEWTRIE2_DATA_START_OFFSET; michael@0: michael@0: /* set the index-2 indexes for the 2=0x80>>UTRIE2_SHIFT_2 ASCII data blocks */ michael@0: for(i=0, j=0; j<0x80; ++i, j+=UTRIE2_DATA_BLOCK_LENGTH) { michael@0: newTrie->index2[i]=j; michael@0: newTrie->map[i]=1; michael@0: } michael@0: /* reference counts for the bad-UTF-8-data block */ michael@0: for(; j<0xc0; ++i, j+=UTRIE2_DATA_BLOCK_LENGTH) { michael@0: newTrie->map[i]=0; michael@0: } michael@0: /* michael@0: * Reference counts for the null data block: all blocks except for the ASCII blocks. michael@0: * Plus 1 so that we don't drop this block during compaction. michael@0: * Plus as many as needed for lead surrogate code points. michael@0: */ michael@0: /* i==newTrie->dataNullOffset */ michael@0: newTrie->map[i++]= michael@0: (0x110000>>UTRIE2_SHIFT_2)- michael@0: (0x80>>UTRIE2_SHIFT_2)+ michael@0: 1+ michael@0: UTRIE2_LSCP_INDEX_2_LENGTH; michael@0: j+=UTRIE2_DATA_BLOCK_LENGTH; michael@0: for(; jmap[i]=0; michael@0: } michael@0: michael@0: /* michael@0: * set the remaining indexes in the BMP index-2 block michael@0: * to the null data block michael@0: */ michael@0: for(i=0x80>>UTRIE2_SHIFT_2; iindex2[i]=UNEWTRIE2_DATA_NULL_OFFSET; michael@0: } michael@0: michael@0: /* michael@0: * Fill the index gap with impossible values so that compaction michael@0: * does not overlap other index-2 blocks with the gap. michael@0: */ michael@0: for(i=0; iindex2[UNEWTRIE2_INDEX_GAP_OFFSET+i]=-1; michael@0: } michael@0: michael@0: /* set the indexes in the null index-2 block */ michael@0: for(i=0; iindex2[UNEWTRIE2_INDEX_2_NULL_OFFSET+i]=UNEWTRIE2_DATA_NULL_OFFSET; michael@0: } michael@0: newTrie->index2NullOffset=UNEWTRIE2_INDEX_2_NULL_OFFSET; michael@0: newTrie->index2Length=UNEWTRIE2_INDEX_2_START_OFFSET; michael@0: michael@0: /* set the index-1 indexes for the linear index-2 block */ michael@0: for(i=0, j=0; michael@0: iindex1[i]=j; michael@0: } michael@0: michael@0: /* set the remaining index-1 indexes to the null index-2 block */ michael@0: for(; iindex1[i]=UNEWTRIE2_INDEX_2_NULL_OFFSET; michael@0: } michael@0: michael@0: /* michael@0: * Preallocate and reset data for U+0080..U+07ff, michael@0: * for 2-byte UTF-8 which will be compacted in 64-blocks michael@0: * even if UTRIE2_DATA_BLOCK_LENGTH is smaller. michael@0: */ michael@0: for(i=0x80; i<0x800; i+=UTRIE2_DATA_BLOCK_LENGTH) { michael@0: utrie2_set32(trie, i, initialValue, pErrorCode); michael@0: } michael@0: michael@0: return trie; michael@0: } michael@0: michael@0: static UNewTrie2 * michael@0: cloneBuilder(const UNewTrie2 *other) { michael@0: UNewTrie2 *trie; michael@0: michael@0: trie=(UNewTrie2 *)uprv_malloc(sizeof(UNewTrie2)); michael@0: if(trie==NULL) { michael@0: return NULL; michael@0: } michael@0: michael@0: trie->data=(uint32_t *)uprv_malloc(other->dataCapacity*4); michael@0: if(trie->data==NULL) { michael@0: uprv_free(trie); michael@0: return NULL; michael@0: } michael@0: trie->dataCapacity=other->dataCapacity; michael@0: michael@0: /* clone data */ michael@0: uprv_memcpy(trie->index1, other->index1, sizeof(trie->index1)); michael@0: uprv_memcpy(trie->index2, other->index2, other->index2Length*4); michael@0: trie->index2NullOffset=other->index2NullOffset; michael@0: trie->index2Length=other->index2Length; michael@0: michael@0: uprv_memcpy(trie->data, other->data, other->dataLength*4); michael@0: trie->dataNullOffset=other->dataNullOffset; michael@0: trie->dataLength=other->dataLength; michael@0: michael@0: /* reference counters */ michael@0: if(other->isCompacted) { michael@0: trie->firstFreeBlock=0; michael@0: } else { michael@0: uprv_memcpy(trie->map, other->map, (other->dataLength>>UTRIE2_SHIFT_2)*4); michael@0: trie->firstFreeBlock=other->firstFreeBlock; michael@0: } michael@0: michael@0: trie->initialValue=other->initialValue; michael@0: trie->errorValue=other->errorValue; michael@0: trie->highStart=other->highStart; michael@0: trie->isCompacted=other->isCompacted; michael@0: michael@0: return trie; michael@0: } michael@0: michael@0: U_CAPI UTrie2 * U_EXPORT2 michael@0: utrie2_clone(const UTrie2 *other, UErrorCode *pErrorCode) { michael@0: UTrie2 *trie; michael@0: michael@0: if(U_FAILURE(*pErrorCode)) { michael@0: return NULL; michael@0: } michael@0: if(other==NULL || (other->memory==NULL && other->newTrie==NULL)) { michael@0: *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; michael@0: return NULL; michael@0: } michael@0: michael@0: trie=(UTrie2 *)uprv_malloc(sizeof(UTrie2)); michael@0: if(trie==NULL) { michael@0: return NULL; michael@0: } michael@0: uprv_memcpy(trie, other, sizeof(UTrie2)); michael@0: michael@0: if(other->memory!=NULL) { michael@0: trie->memory=uprv_malloc(other->length); michael@0: if(trie->memory!=NULL) { michael@0: trie->isMemoryOwned=TRUE; michael@0: uprv_memcpy(trie->memory, other->memory, other->length); michael@0: michael@0: /* make the clone's pointers point to its own memory */ michael@0: trie->index=(uint16_t *)trie->memory+(other->index-(uint16_t *)other->memory); michael@0: if(other->data16!=NULL) { michael@0: trie->data16=(uint16_t *)trie->memory+(other->data16-(uint16_t *)other->memory); michael@0: } michael@0: if(other->data32!=NULL) { michael@0: trie->data32=(uint32_t *)trie->memory+(other->data32-(uint32_t *)other->memory); michael@0: } michael@0: } michael@0: } else /* other->newTrie!=NULL */ { michael@0: trie->newTrie=cloneBuilder(other->newTrie); michael@0: } michael@0: michael@0: if(trie->memory==NULL && trie->newTrie==NULL) { michael@0: uprv_free(trie); michael@0: trie=NULL; michael@0: } michael@0: return trie; michael@0: } michael@0: michael@0: typedef struct NewTrieAndStatus { michael@0: UTrie2 *trie; michael@0: UErrorCode errorCode; michael@0: UBool exclusiveLimit; /* rather than inclusive range end */ michael@0: } NewTrieAndStatus; michael@0: michael@0: static UBool U_CALLCONV michael@0: copyEnumRange(const void *context, UChar32 start, UChar32 end, uint32_t value) { michael@0: NewTrieAndStatus *nt=(NewTrieAndStatus *)context; michael@0: if(value!=nt->trie->initialValue) { michael@0: if(nt->exclusiveLimit) { michael@0: --end; michael@0: } michael@0: if(start==end) { michael@0: utrie2_set32(nt->trie, start, value, &nt->errorCode); michael@0: } else { michael@0: utrie2_setRange32(nt->trie, start, end, value, TRUE, &nt->errorCode); michael@0: } michael@0: return U_SUCCESS(nt->errorCode); michael@0: } else { michael@0: return TRUE; michael@0: } michael@0: } michael@0: michael@0: #ifdef UTRIE2_DEBUG michael@0: static void michael@0: utrie_printLengths(const UTrie *trie) { michael@0: long indexLength=trie->indexLength; michael@0: long dataLength=(long)trie->dataLength; michael@0: long totalLength=(long)sizeof(UTrieHeader)+indexLength*2+dataLength*(trie->data32!=NULL ? 4 : 2); michael@0: printf("**UTrieLengths** index:%6ld data:%6ld serialized:%6ld\n", michael@0: indexLength, dataLength, totalLength); michael@0: } michael@0: michael@0: static void michael@0: utrie2_printLengths(const UTrie2 *trie, const char *which) { michael@0: long indexLength=trie->indexLength; michael@0: long dataLength=(long)trie->dataLength; michael@0: long totalLength=(long)sizeof(UTrie2Header)+indexLength*2+dataLength*(trie->data32!=NULL ? 4 : 2); michael@0: printf("**UTrie2Lengths(%s)** index:%6ld data:%6ld serialized:%6ld\n", michael@0: which, indexLength, dataLength, totalLength); michael@0: } michael@0: #endif michael@0: michael@0: U_CAPI UTrie2 * U_EXPORT2 michael@0: utrie2_cloneAsThawed(const UTrie2 *other, UErrorCode *pErrorCode) { michael@0: NewTrieAndStatus context; michael@0: UChar lead; michael@0: michael@0: if(U_FAILURE(*pErrorCode)) { michael@0: return NULL; michael@0: } michael@0: if(other==NULL || (other->memory==NULL && other->newTrie==NULL)) { michael@0: *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; michael@0: return NULL; michael@0: } michael@0: if(other->newTrie!=NULL && !other->newTrie->isCompacted) { michael@0: return utrie2_clone(other, pErrorCode); /* clone an unfrozen trie */ michael@0: } michael@0: michael@0: /* Clone the frozen trie by enumerating it and building a new one. */ michael@0: context.trie=utrie2_open(other->initialValue, other->errorValue, pErrorCode); michael@0: if(U_FAILURE(*pErrorCode)) { michael@0: return NULL; michael@0: } michael@0: context.exclusiveLimit=FALSE; michael@0: context.errorCode=*pErrorCode; michael@0: utrie2_enum(other, NULL, copyEnumRange, &context); michael@0: *pErrorCode=context.errorCode; michael@0: for(lead=0xd800; lead<0xdc00; ++lead) { michael@0: uint32_t value; michael@0: if(other->data32==NULL) { michael@0: value=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(other, lead); michael@0: } else { michael@0: value=UTRIE2_GET32_FROM_U16_SINGLE_LEAD(other, lead); michael@0: } michael@0: if(value!=other->initialValue) { michael@0: utrie2_set32ForLeadSurrogateCodeUnit(context.trie, lead, value, pErrorCode); michael@0: } michael@0: } michael@0: if(U_FAILURE(*pErrorCode)) { michael@0: utrie2_close(context.trie); michael@0: context.trie=NULL; michael@0: } michael@0: return context.trie; michael@0: } michael@0: michael@0: /* Almost the same as utrie2_cloneAsThawed() but copies a UTrie and freezes the clone. */ michael@0: U_CAPI UTrie2 * U_EXPORT2 michael@0: utrie2_fromUTrie(const UTrie *trie1, uint32_t errorValue, UErrorCode *pErrorCode) { michael@0: NewTrieAndStatus context; michael@0: UChar lead; michael@0: michael@0: if(U_FAILURE(*pErrorCode)) { michael@0: return NULL; michael@0: } michael@0: if(trie1==NULL) { michael@0: *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; michael@0: return NULL; michael@0: } michael@0: context.trie=utrie2_open(trie1->initialValue, errorValue, pErrorCode); michael@0: if(U_FAILURE(*pErrorCode)) { michael@0: return NULL; michael@0: } michael@0: context.exclusiveLimit=TRUE; michael@0: context.errorCode=*pErrorCode; michael@0: utrie_enum(trie1, NULL, copyEnumRange, &context); michael@0: *pErrorCode=context.errorCode; michael@0: for(lead=0xd800; lead<0xdc00; ++lead) { michael@0: uint32_t value; michael@0: if(trie1->data32==NULL) { michael@0: value=UTRIE_GET16_FROM_LEAD(trie1, lead); michael@0: } else { michael@0: value=UTRIE_GET32_FROM_LEAD(trie1, lead); michael@0: } michael@0: if(value!=trie1->initialValue) { michael@0: utrie2_set32ForLeadSurrogateCodeUnit(context.trie, lead, value, pErrorCode); michael@0: } michael@0: } michael@0: if(U_SUCCESS(*pErrorCode)) { michael@0: utrie2_freeze(context.trie, michael@0: trie1->data32!=NULL ? UTRIE2_32_VALUE_BITS : UTRIE2_16_VALUE_BITS, michael@0: pErrorCode); michael@0: } michael@0: #ifdef UTRIE2_DEBUG michael@0: if(U_SUCCESS(*pErrorCode)) { michael@0: utrie_printLengths(trie1); michael@0: utrie2_printLengths(context.trie, "fromUTrie"); michael@0: } michael@0: #endif michael@0: if(U_FAILURE(*pErrorCode)) { michael@0: utrie2_close(context.trie); michael@0: context.trie=NULL; michael@0: } michael@0: return context.trie; michael@0: } michael@0: michael@0: static inline UBool michael@0: isInNullBlock(UNewTrie2 *trie, UChar32 c, UBool forLSCP) { michael@0: int32_t i2, block; michael@0: michael@0: if(U_IS_LEAD(c) && forLSCP) { michael@0: i2=(UTRIE2_LSCP_INDEX_2_OFFSET-(0xd800>>UTRIE2_SHIFT_2))+ michael@0: (c>>UTRIE2_SHIFT_2); michael@0: } else { michael@0: i2=trie->index1[c>>UTRIE2_SHIFT_1]+ michael@0: ((c>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK); michael@0: } michael@0: block=trie->index2[i2]; michael@0: return (UBool)(block==trie->dataNullOffset); michael@0: } michael@0: michael@0: static int32_t michael@0: allocIndex2Block(UNewTrie2 *trie) { michael@0: int32_t newBlock, newTop; michael@0: michael@0: newBlock=trie->index2Length; michael@0: newTop=newBlock+UTRIE2_INDEX_2_BLOCK_LENGTH; michael@0: if(newTop>LENGTHOF(trie->index2)) { michael@0: /* michael@0: * Should never occur. michael@0: * Either UTRIE2_MAX_BUILD_TIME_INDEX_LENGTH is incorrect, michael@0: * or the code writes more values than should be possible. michael@0: */ michael@0: return -1; michael@0: } michael@0: trie->index2Length=newTop; michael@0: uprv_memcpy(trie->index2+newBlock, trie->index2+trie->index2NullOffset, UTRIE2_INDEX_2_BLOCK_LENGTH*4); michael@0: return newBlock; michael@0: } michael@0: michael@0: static int32_t michael@0: getIndex2Block(UNewTrie2 *trie, UChar32 c, UBool forLSCP) { michael@0: int32_t i1, i2; michael@0: michael@0: if(U_IS_LEAD(c) && forLSCP) { michael@0: return UTRIE2_LSCP_INDEX_2_OFFSET; michael@0: } michael@0: michael@0: i1=c>>UTRIE2_SHIFT_1; michael@0: i2=trie->index1[i1]; michael@0: if(i2==trie->index2NullOffset) { michael@0: i2=allocIndex2Block(trie); michael@0: if(i2<0) { michael@0: return -1; /* program error */ michael@0: } michael@0: trie->index1[i1]=i2; michael@0: } michael@0: return i2; michael@0: } michael@0: michael@0: static int32_t michael@0: allocDataBlock(UNewTrie2 *trie, int32_t copyBlock) { michael@0: int32_t newBlock, newTop; michael@0: michael@0: if(trie->firstFreeBlock!=0) { michael@0: /* get the first free block */ michael@0: newBlock=trie->firstFreeBlock; michael@0: trie->firstFreeBlock=-trie->map[newBlock>>UTRIE2_SHIFT_2]; michael@0: } else { michael@0: /* get a new block from the high end */ michael@0: newBlock=trie->dataLength; michael@0: newTop=newBlock+UTRIE2_DATA_BLOCK_LENGTH; michael@0: if(newTop>trie->dataCapacity) { michael@0: /* out of memory in the data array */ michael@0: int32_t capacity; michael@0: uint32_t *data; michael@0: michael@0: if(trie->dataCapacitydataCapacitydata, trie->dataLength*4); michael@0: uprv_free(trie->data); michael@0: trie->data=data; michael@0: trie->dataCapacity=capacity; michael@0: } michael@0: trie->dataLength=newTop; michael@0: } michael@0: uprv_memcpy(trie->data+newBlock, trie->data+copyBlock, UTRIE2_DATA_BLOCK_LENGTH*4); michael@0: trie->map[newBlock>>UTRIE2_SHIFT_2]=0; michael@0: return newBlock; michael@0: } michael@0: michael@0: /* call when the block's reference counter reaches 0 */ michael@0: static void michael@0: releaseDataBlock(UNewTrie2 *trie, int32_t block) { michael@0: /* put this block at the front of the free-block chain */ michael@0: trie->map[block>>UTRIE2_SHIFT_2]=-trie->firstFreeBlock; michael@0: trie->firstFreeBlock=block; michael@0: } michael@0: michael@0: static inline UBool michael@0: isWritableBlock(UNewTrie2 *trie, int32_t block) { michael@0: return (UBool)(block!=trie->dataNullOffset && 1==trie->map[block>>UTRIE2_SHIFT_2]); michael@0: } michael@0: michael@0: static inline void michael@0: setIndex2Entry(UNewTrie2 *trie, int32_t i2, int32_t block) { michael@0: int32_t oldBlock; michael@0: ++trie->map[block>>UTRIE2_SHIFT_2]; /* increment first, in case block==oldBlock! */ michael@0: oldBlock=trie->index2[i2]; michael@0: if(0 == --trie->map[oldBlock>>UTRIE2_SHIFT_2]) { michael@0: releaseDataBlock(trie, oldBlock); michael@0: } michael@0: trie->index2[i2]=block; michael@0: } michael@0: michael@0: /** michael@0: * No error checking for illegal arguments. michael@0: * michael@0: * @return -1 if no new data block available (out of memory in data array) michael@0: * @internal michael@0: */ michael@0: static int32_t michael@0: getDataBlock(UNewTrie2 *trie, UChar32 c, UBool forLSCP) { michael@0: int32_t i2, oldBlock, newBlock; michael@0: michael@0: i2=getIndex2Block(trie, c, forLSCP); michael@0: if(i2<0) { michael@0: return -1; /* program error */ michael@0: } michael@0: michael@0: i2+=(c>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK; michael@0: oldBlock=trie->index2[i2]; michael@0: if(isWritableBlock(trie, oldBlock)) { michael@0: return oldBlock; michael@0: } michael@0: michael@0: /* allocate a new data block */ michael@0: newBlock=allocDataBlock(trie, oldBlock); michael@0: if(newBlock<0) { michael@0: /* out of memory in the data array */ michael@0: return -1; michael@0: } michael@0: setIndex2Entry(trie, i2, newBlock); michael@0: return newBlock; michael@0: } michael@0: michael@0: /** michael@0: * @return TRUE if the value was successfully set michael@0: */ michael@0: static void michael@0: set32(UNewTrie2 *trie, michael@0: UChar32 c, UBool forLSCP, uint32_t value, michael@0: UErrorCode *pErrorCode) { michael@0: int32_t block; michael@0: michael@0: if(trie==NULL || trie->isCompacted) { michael@0: *pErrorCode=U_NO_WRITE_PERMISSION; michael@0: return; michael@0: } michael@0: michael@0: block=getDataBlock(trie, c, forLSCP); michael@0: if(block<0) { michael@0: *pErrorCode=U_MEMORY_ALLOCATION_ERROR; michael@0: return; michael@0: } michael@0: michael@0: trie->data[block+(c&UTRIE2_DATA_MASK)]=value; michael@0: } michael@0: michael@0: U_CAPI void U_EXPORT2 michael@0: utrie2_set32(UTrie2 *trie, UChar32 c, uint32_t value, UErrorCode *pErrorCode) { michael@0: if(U_FAILURE(*pErrorCode)) { michael@0: return; michael@0: } michael@0: if((uint32_t)c>0x10ffff) { michael@0: *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; michael@0: return; michael@0: } michael@0: set32(trie->newTrie, c, TRUE, value, pErrorCode); michael@0: } michael@0: michael@0: U_CAPI void U_EXPORT2 michael@0: utrie2_set32ForLeadSurrogateCodeUnit(UTrie2 *trie, michael@0: UChar32 c, uint32_t value, michael@0: UErrorCode *pErrorCode) { michael@0: if(U_FAILURE(*pErrorCode)) { michael@0: return; michael@0: } michael@0: if(!U_IS_LEAD(c)) { michael@0: *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; michael@0: return; michael@0: } michael@0: set32(trie->newTrie, c, FALSE, value, pErrorCode); michael@0: } michael@0: michael@0: static void michael@0: writeBlock(uint32_t *block, uint32_t value) { michael@0: uint32_t *limit=block+UTRIE2_DATA_BLOCK_LENGTH; michael@0: while(block0x10ffff || (uint32_t)end>0x10ffff || start>end) { michael@0: *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; michael@0: return; michael@0: } michael@0: newTrie=trie->newTrie; michael@0: if(newTrie==NULL || newTrie->isCompacted) { michael@0: *pErrorCode=U_NO_WRITE_PERMISSION; michael@0: return; michael@0: } michael@0: if(!overwrite && value==newTrie->initialValue) { michael@0: return; /* nothing to do */ michael@0: } michael@0: michael@0: limit=end+1; michael@0: if(start&UTRIE2_DATA_MASK) { michael@0: UChar32 nextStart; michael@0: michael@0: /* set partial block at [start..following block boundary[ */ michael@0: block=getDataBlock(newTrie, start, TRUE); michael@0: if(block<0) { michael@0: *pErrorCode=U_MEMORY_ALLOCATION_ERROR; michael@0: return; michael@0: } michael@0: michael@0: nextStart=(start+UTRIE2_DATA_BLOCK_LENGTH)&~UTRIE2_DATA_MASK; michael@0: if(nextStart<=limit) { michael@0: fillBlock(newTrie->data+block, start&UTRIE2_DATA_MASK, UTRIE2_DATA_BLOCK_LENGTH, michael@0: value, newTrie->initialValue, overwrite); michael@0: start=nextStart; michael@0: } else { michael@0: fillBlock(newTrie->data+block, start&UTRIE2_DATA_MASK, limit&UTRIE2_DATA_MASK, michael@0: value, newTrie->initialValue, overwrite); michael@0: return; michael@0: } michael@0: } michael@0: michael@0: /* number of positions in the last, partial block */ michael@0: rest=limit&UTRIE2_DATA_MASK; michael@0: michael@0: /* round down limit to a block boundary */ michael@0: limit&=~UTRIE2_DATA_MASK; michael@0: michael@0: /* iterate over all-value blocks */ michael@0: if(value==newTrie->initialValue) { michael@0: repeatBlock=newTrie->dataNullOffset; michael@0: } else { michael@0: repeatBlock=-1; michael@0: } michael@0: michael@0: while(startinitialValue && isInNullBlock(newTrie, start, TRUE)) { michael@0: start+=UTRIE2_DATA_BLOCK_LENGTH; /* nothing to do */ michael@0: continue; michael@0: } michael@0: michael@0: /* get index value */ michael@0: i2=getIndex2Block(newTrie, start, TRUE); michael@0: if(i2<0) { michael@0: *pErrorCode=U_INTERNAL_PROGRAM_ERROR; michael@0: return; michael@0: } michael@0: i2+=(start>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK; michael@0: block=newTrie->index2[i2]; michael@0: if(isWritableBlock(newTrie, block)) { michael@0: /* already allocated */ michael@0: if(overwrite && block>=UNEWTRIE2_DATA_0800_OFFSET) { michael@0: /* michael@0: * We overwrite all values, and it's not a michael@0: * protected (ASCII-linear or 2-byte UTF-8) block: michael@0: * replace with the repeatBlock. michael@0: */ michael@0: setRepeatBlock=TRUE; michael@0: } else { michael@0: /* !overwrite, or protected block: just write the values into this block */ michael@0: fillBlock(newTrie->data+block, michael@0: 0, UTRIE2_DATA_BLOCK_LENGTH, michael@0: value, newTrie->initialValue, overwrite); michael@0: } michael@0: } else if(newTrie->data[block]!=value && (overwrite || block==newTrie->dataNullOffset)) { michael@0: /* michael@0: * Set the repeatBlock instead of the null block or previous repeat block: michael@0: * michael@0: * If !isWritableBlock() then all entries in the block have the same value michael@0: * because it's the null block or a range block (the repeatBlock from a previous michael@0: * call to utrie2_setRange32()). michael@0: * No other blocks are used multiple times before compacting. michael@0: * michael@0: * The null block is the only non-writable block with the initialValue because michael@0: * of the repeatBlock initialization above. (If value==initialValue, then michael@0: * the repeatBlock will be the null data block.) michael@0: * michael@0: * We set our repeatBlock if the desired value differs from the block's value, michael@0: * and if we overwrite any data or if the data is all initial values michael@0: * (which is the same as the block being the null block, see above). michael@0: */ michael@0: setRepeatBlock=TRUE; michael@0: } michael@0: if(setRepeatBlock) { michael@0: if(repeatBlock>=0) { michael@0: setIndex2Entry(newTrie, i2, repeatBlock); michael@0: } else { michael@0: /* create and set and fill the repeatBlock */ michael@0: repeatBlock=getDataBlock(newTrie, start, TRUE); michael@0: if(repeatBlock<0) { michael@0: *pErrorCode=U_MEMORY_ALLOCATION_ERROR; michael@0: return; michael@0: } michael@0: writeBlock(newTrie->data+repeatBlock, value); michael@0: } michael@0: } michael@0: michael@0: start+=UTRIE2_DATA_BLOCK_LENGTH; michael@0: } michael@0: michael@0: if(rest>0) { michael@0: /* set partial block at [last block boundary..limit[ */ michael@0: block=getDataBlock(newTrie, start, TRUE); michael@0: if(block<0) { michael@0: *pErrorCode=U_MEMORY_ALLOCATION_ERROR; michael@0: return; michael@0: } michael@0: michael@0: fillBlock(newTrie->data+block, 0, rest, value, newTrie->initialValue, overwrite); michael@0: } michael@0: michael@0: return; michael@0: } michael@0: michael@0: /* compaction --------------------------------------------------------------- */ michael@0: michael@0: static inline UBool michael@0: equal_int32(const int32_t *s, const int32_t *t, int32_t length) { michael@0: while(length>0 && *s==*t) { michael@0: ++s; michael@0: ++t; michael@0: --length; michael@0: } michael@0: return (UBool)(length==0); michael@0: } michael@0: michael@0: static inline UBool michael@0: equal_uint32(const uint32_t *s, const uint32_t *t, int32_t length) { michael@0: while(length>0 && *s==*t) { michael@0: ++s; michael@0: ++t; michael@0: --length; michael@0: } michael@0: return (UBool)(length==0); michael@0: } michael@0: michael@0: static int32_t michael@0: findSameIndex2Block(const int32_t *idx, int32_t index2Length, int32_t otherBlock) { michael@0: int32_t block; michael@0: michael@0: /* ensure that we do not even partially get past index2Length */ michael@0: index2Length-=UTRIE2_INDEX_2_BLOCK_LENGTH; michael@0: michael@0: for(block=0; block<=index2Length; ++block) { michael@0: if(equal_int32(idx+block, idx+otherBlock, UTRIE2_INDEX_2_BLOCK_LENGTH)) { michael@0: return block; michael@0: } michael@0: } michael@0: return -1; michael@0: } michael@0: michael@0: static int32_t michael@0: findSameDataBlock(const uint32_t *data, int32_t dataLength, int32_t otherBlock, int32_t blockLength) { michael@0: int32_t block; michael@0: michael@0: /* ensure that we do not even partially get past dataLength */ michael@0: dataLength-=blockLength; michael@0: michael@0: for(block=0; block<=dataLength; block+=UTRIE2_DATA_GRANULARITY) { michael@0: if(equal_uint32(data+block, data+otherBlock, blockLength)) { michael@0: return block; michael@0: } michael@0: } michael@0: return -1; michael@0: } michael@0: michael@0: /* michael@0: * Find the start of the last range in the trie by enumerating backward. michael@0: * Indexes for supplementary code points higher than this will be omitted. michael@0: */ michael@0: static UChar32 michael@0: findHighStart(UNewTrie2 *trie, uint32_t highValue) { michael@0: const uint32_t *data32; michael@0: michael@0: uint32_t value, initialValue; michael@0: UChar32 c, prev; michael@0: int32_t i1, i2, j, i2Block, prevI2Block, index2NullOffset, block, prevBlock, nullBlock; michael@0: michael@0: data32=trie->data; michael@0: initialValue=trie->initialValue; michael@0: michael@0: index2NullOffset=trie->index2NullOffset; michael@0: nullBlock=trie->dataNullOffset; michael@0: michael@0: /* set variables for previous range */ michael@0: if(highValue==initialValue) { michael@0: prevI2Block=index2NullOffset; michael@0: prevBlock=nullBlock; michael@0: } else { michael@0: prevI2Block=-1; michael@0: prevBlock=-1; michael@0: } michael@0: prev=0x110000; michael@0: michael@0: /* enumerate index-2 blocks */ michael@0: i1=UNEWTRIE2_INDEX_1_LENGTH; michael@0: c=prev; michael@0: while(c>0) { michael@0: i2Block=trie->index1[--i1]; michael@0: if(i2Block==prevI2Block) { michael@0: /* the index-2 block is the same as the previous one, and filled with highValue */ michael@0: c-=UTRIE2_CP_PER_INDEX_1_ENTRY; michael@0: continue; michael@0: } michael@0: prevI2Block=i2Block; michael@0: if(i2Block==index2NullOffset) { michael@0: /* this is the null index-2 block */ michael@0: if(highValue!=initialValue) { michael@0: return c; michael@0: } michael@0: c-=UTRIE2_CP_PER_INDEX_1_ENTRY; michael@0: } else { michael@0: /* enumerate data blocks for one index-2 block */ michael@0: for(i2=UTRIE2_INDEX_2_BLOCK_LENGTH; i2>0;) { michael@0: block=trie->index2[i2Block+ --i2]; michael@0: if(block==prevBlock) { michael@0: /* the block is the same as the previous one, and filled with highValue */ michael@0: c-=UTRIE2_DATA_BLOCK_LENGTH; michael@0: continue; michael@0: } michael@0: prevBlock=block; michael@0: if(block==nullBlock) { michael@0: /* this is the null data block */ michael@0: if(highValue!=initialValue) { michael@0: return c; michael@0: } michael@0: c-=UTRIE2_DATA_BLOCK_LENGTH; michael@0: } else { michael@0: for(j=UTRIE2_DATA_BLOCK_LENGTH; j>0;) { michael@0: value=data32[block+ --j]; michael@0: if(value!=highValue) { michael@0: return c; michael@0: } michael@0: --c; michael@0: } michael@0: } michael@0: } michael@0: } michael@0: } michael@0: michael@0: /* deliver last range */ michael@0: return 0; michael@0: } michael@0: michael@0: /* michael@0: * Compact a build-time trie. michael@0: * michael@0: * The compaction michael@0: * - removes blocks that are identical with earlier ones michael@0: * - overlaps adjacent blocks as much as possible (if overlap==TRUE) michael@0: * - moves blocks in steps of the data granularity michael@0: * - moves and overlaps blocks that overlap with multiple values in the overlap region michael@0: * michael@0: * It does not michael@0: * - try to move and overlap blocks that are not already adjacent michael@0: */ michael@0: static void michael@0: compactData(UNewTrie2 *trie) { michael@0: int32_t start, newStart, movedStart; michael@0: int32_t blockLength, overlap; michael@0: int32_t i, mapIndex, blockCount; michael@0: michael@0: /* do not compact linear-ASCII data */ michael@0: newStart=UTRIE2_DATA_START_OFFSET; michael@0: for(start=0, i=0; startmap[i]=start; michael@0: } michael@0: michael@0: /* michael@0: * Start with a block length of 64 for 2-byte UTF-8, michael@0: * then switch to UTRIE2_DATA_BLOCK_LENGTH. michael@0: */ michael@0: blockLength=64; michael@0: blockCount=blockLength>>UTRIE2_SHIFT_2; michael@0: for(start=newStart; startdataLength;) { michael@0: /* michael@0: * start: index of first entry of current block michael@0: * newStart: index where the current block is to be moved michael@0: * (right after current end of already-compacted data) michael@0: */ michael@0: if(start==UNEWTRIE2_DATA_0800_OFFSET) { michael@0: blockLength=UTRIE2_DATA_BLOCK_LENGTH; michael@0: blockCount=1; michael@0: } michael@0: michael@0: /* skip blocks that are not used */ michael@0: if(trie->map[start>>UTRIE2_SHIFT_2]<=0) { michael@0: /* advance start to the next block */ michael@0: start+=blockLength; michael@0: michael@0: /* leave newStart with the previous block! */ michael@0: continue; michael@0: } michael@0: michael@0: /* search for an identical block */ michael@0: if( (movedStart=findSameDataBlock(trie->data, newStart, start, blockLength)) michael@0: >=0 michael@0: ) { michael@0: /* found an identical block, set the other block's index value for the current block */ michael@0: for(i=blockCount, mapIndex=start>>UTRIE2_SHIFT_2; i>0; --i) { michael@0: trie->map[mapIndex++]=movedStart; michael@0: movedStart+=UTRIE2_DATA_BLOCK_LENGTH; michael@0: } michael@0: michael@0: /* advance start to the next block */ michael@0: start+=blockLength; michael@0: michael@0: /* leave newStart with the previous block! */ michael@0: continue; michael@0: } michael@0: michael@0: /* see if the beginning of this block can be overlapped with the end of the previous block */ michael@0: /* look for maximum overlap (modulo granularity) with the previous, adjacent block */ michael@0: for(overlap=blockLength-UTRIE2_DATA_GRANULARITY; michael@0: overlap>0 && !equal_uint32(trie->data+(newStart-overlap), trie->data+start, overlap); michael@0: overlap-=UTRIE2_DATA_GRANULARITY) {} michael@0: michael@0: if(overlap>0 || newStart>UTRIE2_SHIFT_2; i>0; --i) { michael@0: trie->map[mapIndex++]=movedStart; michael@0: movedStart+=UTRIE2_DATA_BLOCK_LENGTH; michael@0: } michael@0: michael@0: /* move the non-overlapping indexes to their new positions */ michael@0: start+=overlap; michael@0: for(i=blockLength-overlap; i>0; --i) { michael@0: trie->data[newStart++]=trie->data[start++]; michael@0: } michael@0: } else /* no overlap && newStart==start */ { michael@0: for(i=blockCount, mapIndex=start>>UTRIE2_SHIFT_2; i>0; --i) { michael@0: trie->map[mapIndex++]=start; michael@0: start+=UTRIE2_DATA_BLOCK_LENGTH; michael@0: } michael@0: newStart=start; michael@0: } michael@0: } michael@0: michael@0: /* now adjust the index-2 table */ michael@0: for(i=0; iindex2Length; ++i) { michael@0: if(i==UNEWTRIE2_INDEX_GAP_OFFSET) { michael@0: /* Gap indexes are invalid (-1). Skip over the gap. */ michael@0: i+=UNEWTRIE2_INDEX_GAP_LENGTH; michael@0: } michael@0: trie->index2[i]=trie->map[trie->index2[i]>>UTRIE2_SHIFT_2]; michael@0: } michael@0: trie->dataNullOffset=trie->map[trie->dataNullOffset>>UTRIE2_SHIFT_2]; michael@0: michael@0: /* ensure dataLength alignment */ michael@0: while((newStart&(UTRIE2_DATA_GRANULARITY-1))!=0) { michael@0: trie->data[newStart++]=trie->initialValue; michael@0: } michael@0: michael@0: #ifdef UTRIE2_DEBUG michael@0: /* we saved some space */ michael@0: printf("compacting UTrie2: count of 32-bit data words %lu->%lu\n", michael@0: (long)trie->dataLength, (long)newStart); michael@0: #endif michael@0: michael@0: trie->dataLength=newStart; michael@0: } michael@0: michael@0: static void michael@0: compactIndex2(UNewTrie2 *trie) { michael@0: int32_t i, start, newStart, movedStart, overlap; michael@0: michael@0: /* do not compact linear-BMP index-2 blocks */ michael@0: newStart=UTRIE2_INDEX_2_BMP_LENGTH; michael@0: for(start=0, i=0; startmap[i]=start; michael@0: } michael@0: michael@0: /* Reduce the index table gap to what will be needed at runtime. */ michael@0: newStart+=UTRIE2_UTF8_2B_INDEX_2_LENGTH+((trie->highStart-0x10000)>>UTRIE2_SHIFT_1); michael@0: michael@0: for(start=UNEWTRIE2_INDEX_2_NULL_OFFSET; startindex2Length;) { michael@0: /* michael@0: * start: index of first entry of current block michael@0: * newStart: index where the current block is to be moved michael@0: * (right after current end of already-compacted data) michael@0: */ michael@0: michael@0: /* search for an identical block */ michael@0: if( (movedStart=findSameIndex2Block(trie->index2, newStart, start)) michael@0: >=0 michael@0: ) { michael@0: /* found an identical block, set the other block's index value for the current block */ michael@0: trie->map[start>>UTRIE2_SHIFT_1_2]=movedStart; michael@0: michael@0: /* advance start to the next block */ michael@0: start+=UTRIE2_INDEX_2_BLOCK_LENGTH; michael@0: michael@0: /* leave newStart with the previous block! */ michael@0: continue; michael@0: } michael@0: michael@0: /* see if the beginning of this block can be overlapped with the end of the previous block */ michael@0: /* look for maximum overlap with the previous, adjacent block */ michael@0: for(overlap=UTRIE2_INDEX_2_BLOCK_LENGTH-1; michael@0: overlap>0 && !equal_int32(trie->index2+(newStart-overlap), trie->index2+start, overlap); michael@0: --overlap) {} michael@0: michael@0: if(overlap>0 || newStartmap[start>>UTRIE2_SHIFT_1_2]=newStart-overlap; michael@0: michael@0: /* move the non-overlapping indexes to their new positions */ michael@0: start+=overlap; michael@0: for(i=UTRIE2_INDEX_2_BLOCK_LENGTH-overlap; i>0; --i) { michael@0: trie->index2[newStart++]=trie->index2[start++]; michael@0: } michael@0: } else /* no overlap && newStart==start */ { michael@0: trie->map[start>>UTRIE2_SHIFT_1_2]=start; michael@0: start+=UTRIE2_INDEX_2_BLOCK_LENGTH; michael@0: newStart=start; michael@0: } michael@0: } michael@0: michael@0: /* now adjust the index-1 table */ michael@0: for(i=0; iindex1[i]=trie->map[trie->index1[i]>>UTRIE2_SHIFT_1_2]; michael@0: } michael@0: trie->index2NullOffset=trie->map[trie->index2NullOffset>>UTRIE2_SHIFT_1_2]; michael@0: michael@0: /* michael@0: * Ensure data table alignment: michael@0: * Needs to be granularity-aligned for 16-bit trie michael@0: * (so that dataMove will be down-shiftable), michael@0: * and 2-aligned for uint32_t data. michael@0: */ michael@0: while((newStart&((UTRIE2_DATA_GRANULARITY-1)|1))!=0) { michael@0: /* Arbitrary value: 0x3fffc not possible for real data. */ michael@0: trie->index2[newStart++]=(int32_t)0xffff<%lu\n", michael@0: (long)trie->index2Length, (long)newStart); michael@0: #endif michael@0: michael@0: trie->index2Length=newStart; michael@0: } michael@0: michael@0: static void michael@0: compactTrie(UTrie2 *trie, UErrorCode *pErrorCode) { michael@0: UNewTrie2 *newTrie; michael@0: UChar32 highStart, suppHighStart; michael@0: uint32_t highValue; michael@0: michael@0: newTrie=trie->newTrie; michael@0: michael@0: /* find highStart and round it up */ michael@0: highValue=utrie2_get32(trie, 0x10ffff); michael@0: highStart=findHighStart(newTrie, highValue); michael@0: highStart=(highStart+(UTRIE2_CP_PER_INDEX_1_ENTRY-1))&~(UTRIE2_CP_PER_INDEX_1_ENTRY-1); michael@0: if(highStart==0x110000) { michael@0: highValue=trie->errorValue; michael@0: } michael@0: michael@0: /* michael@0: * Set trie->highStart only after utrie2_get32(trie, highStart). michael@0: * Otherwise utrie2_get32(trie, highStart) would try to read the highValue. michael@0: */ michael@0: trie->highStart=newTrie->highStart=highStart; michael@0: michael@0: #ifdef UTRIE2_DEBUG michael@0: printf("UTrie2: highStart U+%04lx highValue 0x%lx initialValue 0x%lx\n", michael@0: (long)highStart, (long)highValue, (long)trie->initialValue); michael@0: #endif michael@0: michael@0: if(highStart<0x110000) { michael@0: /* Blank out [highStart..10ffff] to release associated data blocks. */ michael@0: suppHighStart= highStart<=0x10000 ? 0x10000 : highStart; michael@0: utrie2_setRange32(trie, suppHighStart, 0x10ffff, trie->initialValue, TRUE, pErrorCode); michael@0: if(U_FAILURE(*pErrorCode)) { michael@0: return; michael@0: } michael@0: } michael@0: michael@0: compactData(newTrie); michael@0: if(highStart>0x10000) { michael@0: compactIndex2(newTrie); michael@0: #ifdef UTRIE2_DEBUG michael@0: } else { michael@0: printf("UTrie2: highStart U+%04lx count of 16-bit index-2 words %lu->%lu\n", michael@0: (long)highStart, (long)trie->newTrie->index2Length, (long)UTRIE2_INDEX_1_OFFSET); michael@0: #endif michael@0: } michael@0: michael@0: /* michael@0: * Store the highValue in the data array and round up the dataLength. michael@0: * Must be done after compactData() because that assumes that dataLength michael@0: * is a multiple of UTRIE2_DATA_BLOCK_LENGTH. michael@0: */ michael@0: newTrie->data[newTrie->dataLength++]=highValue; michael@0: while((newTrie->dataLength&(UTRIE2_DATA_GRANULARITY-1))!=0) { michael@0: newTrie->data[newTrie->dataLength++]=trie->initialValue; michael@0: } michael@0: michael@0: newTrie->isCompacted=TRUE; michael@0: } michael@0: michael@0: /* serialization ------------------------------------------------------------ */ michael@0: michael@0: /** michael@0: * Maximum length of the runtime index array. michael@0: * Limited by its own 16-bit index values, and by uint16_t UTrie2Header.indexLength. michael@0: * (The actual maximum length is lower, michael@0: * (0x110000>>UTRIE2_SHIFT_2)+UTRIE2_UTF8_2B_INDEX_2_LENGTH+UTRIE2_MAX_INDEX_1_LENGTH.) michael@0: */ michael@0: #define UTRIE2_MAX_INDEX_LENGTH 0xffff michael@0: michael@0: /** michael@0: * Maximum length of the runtime data array. michael@0: * Limited by 16-bit index values that are left-shifted by UTRIE2_INDEX_SHIFT, michael@0: * and by uint16_t UTrie2Header.shiftedDataLength. michael@0: */ michael@0: #define UTRIE2_MAX_DATA_LENGTH (0xffff<0 if the data is moved to the end of the index array */ michael@0: UChar32 highStart; michael@0: michael@0: /* argument check */ michael@0: if(U_FAILURE(*pErrorCode)) { michael@0: return; michael@0: } michael@0: if( trie==NULL || michael@0: valueBits<0 || UTRIE2_COUNT_VALUE_BITS<=valueBits michael@0: ) { michael@0: *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; michael@0: return; michael@0: } michael@0: newTrie=trie->newTrie; michael@0: if(newTrie==NULL) { michael@0: /* already frozen */ michael@0: UTrie2ValueBits frozenValueBits= michael@0: trie->data16!=NULL ? UTRIE2_16_VALUE_BITS : UTRIE2_32_VALUE_BITS; michael@0: if(valueBits!=frozenValueBits) { michael@0: *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; michael@0: } michael@0: return; michael@0: } michael@0: michael@0: /* compact if necessary */ michael@0: if(!newTrie->isCompacted) { michael@0: compactTrie(trie, pErrorCode); michael@0: if(U_FAILURE(*pErrorCode)) { michael@0: return; michael@0: } michael@0: } michael@0: highStart=trie->highStart; michael@0: michael@0: if(highStart<=0x10000) { michael@0: allIndexesLength=UTRIE2_INDEX_1_OFFSET; michael@0: } else { michael@0: allIndexesLength=newTrie->index2Length; michael@0: } michael@0: if(valueBits==UTRIE2_16_VALUE_BITS) { michael@0: dataMove=allIndexesLength; michael@0: } else { michael@0: dataMove=0; michael@0: } michael@0: michael@0: /* are indexLength and dataLength within limits? */ michael@0: if( /* for unshifted indexLength */ michael@0: allIndexesLength>UTRIE2_MAX_INDEX_LENGTH || michael@0: /* for unshifted dataNullOffset */ michael@0: (dataMove+newTrie->dataNullOffset)>0xffff || michael@0: /* for unshifted 2-byte UTF-8 index-2 values */ michael@0: (dataMove+UNEWTRIE2_DATA_0800_OFFSET)>0xffff || michael@0: /* for shiftedDataLength */ michael@0: (dataMove+newTrie->dataLength)>UTRIE2_MAX_DATA_LENGTH michael@0: ) { michael@0: *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; michael@0: return; michael@0: } michael@0: michael@0: /* calculate the total serialized length */ michael@0: length=sizeof(UTrie2Header)+allIndexesLength*2; michael@0: if(valueBits==UTRIE2_16_VALUE_BITS) { michael@0: length+=newTrie->dataLength*2; michael@0: } else { michael@0: length+=newTrie->dataLength*4; michael@0: } michael@0: michael@0: trie->memory=uprv_malloc(length); michael@0: if(trie->memory==NULL) { michael@0: *pErrorCode=U_MEMORY_ALLOCATION_ERROR; michael@0: return; michael@0: } michael@0: trie->length=length; michael@0: trie->isMemoryOwned=TRUE; michael@0: michael@0: trie->indexLength=allIndexesLength; michael@0: trie->dataLength=newTrie->dataLength; michael@0: if(highStart<=0x10000) { michael@0: trie->index2NullOffset=0xffff; michael@0: } else { michael@0: trie->index2NullOffset=UTRIE2_INDEX_2_OFFSET+newTrie->index2NullOffset; michael@0: } michael@0: trie->dataNullOffset=(uint16_t)(dataMove+newTrie->dataNullOffset); michael@0: trie->highValueIndex=dataMove+trie->dataLength-UTRIE2_DATA_GRANULARITY; michael@0: michael@0: /* set the header fields */ michael@0: header=(UTrie2Header *)trie->memory; michael@0: michael@0: header->signature=UTRIE2_SIG; /* "Tri2" */ michael@0: header->options=(uint16_t)valueBits; michael@0: michael@0: header->indexLength=(uint16_t)trie->indexLength; michael@0: header->shiftedDataLength=(uint16_t)(trie->dataLength>>UTRIE2_INDEX_SHIFT); michael@0: header->index2NullOffset=trie->index2NullOffset; michael@0: header->dataNullOffset=trie->dataNullOffset; michael@0: header->shiftedHighStart=(uint16_t)(highStart>>UTRIE2_SHIFT_1); michael@0: michael@0: /* fill the index and data arrays */ michael@0: dest16=(uint16_t *)(header+1); michael@0: trie->index=dest16; michael@0: michael@0: /* write the index-2 array values shifted right by UTRIE2_INDEX_SHIFT, after adding dataMove */ michael@0: p=(uint32_t *)newTrie->index2; michael@0: for(i=UTRIE2_INDEX_2_BMP_LENGTH; i>0; --i) { michael@0: *dest16++=(uint16_t)((dataMove + *p++)>>UTRIE2_INDEX_SHIFT); michael@0: } michael@0: michael@0: /* write UTF-8 2-byte index-2 values, not right-shifted */ michael@0: for(i=0; i<(0xc2-0xc0); ++i) { /* C0..C1 */ michael@0: *dest16++=(uint16_t)(dataMove+UTRIE2_BAD_UTF8_DATA_OFFSET); michael@0: } michael@0: for(; i<(0xe0-0xc0); ++i) { /* C2..DF */ michael@0: *dest16++=(uint16_t)(dataMove+newTrie->index2[i<<(6-UTRIE2_SHIFT_2)]); michael@0: } michael@0: michael@0: if(highStart>0x10000) { michael@0: int32_t index1Length=(highStart-0x10000)>>UTRIE2_SHIFT_1; michael@0: int32_t index2Offset=UTRIE2_INDEX_2_BMP_LENGTH+UTRIE2_UTF8_2B_INDEX_2_LENGTH+index1Length; michael@0: michael@0: /* write 16-bit index-1 values for supplementary code points */ michael@0: p=(uint32_t *)newTrie->index1+UTRIE2_OMITTED_BMP_INDEX_1_LENGTH; michael@0: for(i=index1Length; i>0; --i) { michael@0: *dest16++=(uint16_t)(UTRIE2_INDEX_2_OFFSET + *p++); michael@0: } michael@0: michael@0: /* michael@0: * write the index-2 array values for supplementary code points, michael@0: * shifted right by UTRIE2_INDEX_SHIFT, after adding dataMove michael@0: */ michael@0: p=(uint32_t *)newTrie->index2+index2Offset; michael@0: for(i=newTrie->index2Length-index2Offset; i>0; --i) { michael@0: *dest16++=(uint16_t)((dataMove + *p++)>>UTRIE2_INDEX_SHIFT); michael@0: } michael@0: } michael@0: michael@0: /* write the 16/32-bit data array */ michael@0: switch(valueBits) { michael@0: case UTRIE2_16_VALUE_BITS: michael@0: /* write 16-bit data values */ michael@0: trie->data16=dest16; michael@0: trie->data32=NULL; michael@0: p=newTrie->data; michael@0: for(i=newTrie->dataLength; i>0; --i) { michael@0: *dest16++=(uint16_t)*p++; michael@0: } michael@0: break; michael@0: case UTRIE2_32_VALUE_BITS: michael@0: /* write 32-bit data values */ michael@0: trie->data16=NULL; michael@0: trie->data32=(uint32_t *)dest16; michael@0: uprv_memcpy(dest16, newTrie->data, newTrie->dataLength*4); michael@0: break; michael@0: default: michael@0: *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; michael@0: return; michael@0: } michael@0: michael@0: /* Delete the UNewTrie2. */ michael@0: uprv_free(newTrie->data); michael@0: uprv_free(newTrie); michael@0: trie->newTrie=NULL; michael@0: } michael@0: michael@0: U_CAPI UBool U_EXPORT2 michael@0: utrie2_isFrozen(const UTrie2 *trie) { michael@0: return (UBool)(trie->newTrie==NULL); michael@0: } michael@0: michael@0: U_CAPI int32_t U_EXPORT2 michael@0: utrie2_serialize(UTrie2 *trie, michael@0: void *data, int32_t capacity, michael@0: UErrorCode *pErrorCode) { michael@0: /* argument check */ michael@0: if(U_FAILURE(*pErrorCode)) { michael@0: return 0; michael@0: } michael@0: michael@0: if( trie==NULL || trie->memory==NULL || trie->newTrie!=NULL || michael@0: capacity<0 || (capacity>0 && (data==NULL || (U_POINTER_MASK_LSB(data, 3)!=0))) michael@0: ) { michael@0: *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; michael@0: return 0; michael@0: } michael@0: michael@0: if(capacity>=trie->length) { michael@0: uprv_memcpy(data, trie->memory, trie->length); michael@0: } else { michael@0: *pErrorCode=U_BUFFER_OVERFLOW_ERROR; michael@0: } michael@0: return trie->length; michael@0: } michael@0: michael@0: /* michael@0: * This is here to avoid a dependency from utrie2.cpp on utrie.c. michael@0: * This file already depends on utrie.c. michael@0: * Otherwise, this should be in utrie2.cpp right after utrie2_swap(). michael@0: */ michael@0: U_CAPI int32_t U_EXPORT2 michael@0: utrie2_swapAnyVersion(const UDataSwapper *ds, michael@0: const void *inData, int32_t length, void *outData, michael@0: UErrorCode *pErrorCode) { michael@0: if(U_SUCCESS(*pErrorCode)) { michael@0: switch(utrie2_getVersion(inData, length, TRUE)) { michael@0: case 1: michael@0: return utrie_swap(ds, inData, length, outData, pErrorCode); michael@0: case 2: michael@0: return utrie2_swap(ds, inData, length, outData, pErrorCode); michael@0: default: michael@0: *pErrorCode=U_INVALID_FORMAT_ERROR; michael@0: return 0; michael@0: } michael@0: } michael@0: return 0; michael@0: }