michael@0: /* michael@0: ****************************************************************************** michael@0: * michael@0: * Copyright (C) 2001-2012, International Business Machines michael@0: * Corporation and others. All Rights Reserved. michael@0: * michael@0: ****************************************************************************** michael@0: * file name: utrie.cpp michael@0: * encoding: US-ASCII michael@0: * tab size: 8 (not used) michael@0: * indentation:4 michael@0: * michael@0: * created on: 2001oct20 michael@0: * created by: Markus W. Scherer michael@0: * michael@0: * This is a common implementation of a "folded" trie. michael@0: * It is a kind of compressed, serializable table of 16- or 32-bit values associated with michael@0: * Unicode code points (0..0x10ffff). michael@0: */ michael@0: michael@0: #ifdef UTRIE_DEBUG michael@0: # include michael@0: #endif michael@0: michael@0: #include "unicode/utypes.h" michael@0: #include "cmemory.h" michael@0: #include "utrie.h" michael@0: michael@0: /* miscellaneous ------------------------------------------------------------ */ michael@0: michael@0: #undef ABS michael@0: #define ABS(x) ((x)>=0 ? (x) : -(x)) michael@0: michael@0: static inline UBool michael@0: equal_uint32(const uint32_t *s, const uint32_t *t, int32_t length) { michael@0: while(length>0 && *s==*t) { michael@0: ++s; michael@0: ++t; michael@0: --length; michael@0: } michael@0: return (UBool)(length==0); michael@0: } michael@0: michael@0: /* Building a trie ----------------------------------------------------------*/ michael@0: michael@0: U_CAPI UNewTrie * U_EXPORT2 michael@0: utrie_open(UNewTrie *fillIn, michael@0: uint32_t *aliasData, int32_t maxDataLength, michael@0: uint32_t initialValue, uint32_t leadUnitValue, michael@0: UBool latin1Linear) { michael@0: UNewTrie *trie; michael@0: int32_t i, j; michael@0: michael@0: if( maxDataLengthisAllocated= (UBool)(fillIn==NULL); michael@0: michael@0: if(aliasData!=NULL) { michael@0: trie->data=aliasData; michael@0: trie->isDataAllocated=FALSE; michael@0: } else { michael@0: trie->data=(uint32_t *)uprv_malloc(maxDataLength*4); michael@0: if(trie->data==NULL) { michael@0: uprv_free(trie); michael@0: return NULL; michael@0: } michael@0: trie->isDataAllocated=TRUE; michael@0: } michael@0: michael@0: /* preallocate and reset the first data block (block index 0) */ michael@0: j=UTRIE_DATA_BLOCK_LENGTH; michael@0: michael@0: if(latin1Linear) { michael@0: /* preallocate and reset the first block (number 0) and Latin-1 (U+0000..U+00ff) after that */ michael@0: /* made sure above that maxDataLength>=1024 */ michael@0: michael@0: /* set indexes to point to consecutive data blocks */ michael@0: i=0; michael@0: do { michael@0: /* do this at least for trie->index[0] even if that block is only partly used for Latin-1 */ michael@0: trie->index[i++]=j; michael@0: j+=UTRIE_DATA_BLOCK_LENGTH; michael@0: } while(i<(256>>UTRIE_SHIFT)); michael@0: } michael@0: michael@0: /* reset the initially allocated blocks to the initial value */ michael@0: trie->dataLength=j; michael@0: while(j>0) { michael@0: trie->data[--j]=initialValue; michael@0: } michael@0: michael@0: trie->leadUnitValue=leadUnitValue; michael@0: trie->indexLength=UTRIE_MAX_INDEX_LENGTH; michael@0: trie->dataCapacity=maxDataLength; michael@0: trie->isLatin1Linear=latin1Linear; michael@0: trie->isCompacted=FALSE; michael@0: return trie; michael@0: } michael@0: michael@0: U_CAPI UNewTrie * U_EXPORT2 michael@0: utrie_clone(UNewTrie *fillIn, const UNewTrie *other, uint32_t *aliasData, int32_t aliasDataCapacity) { michael@0: UNewTrie *trie; michael@0: UBool isDataAllocated; michael@0: michael@0: /* do not clone if other is not valid or already compacted */ michael@0: if(other==NULL || other->data==NULL || other->isCompacted) { michael@0: return NULL; michael@0: } michael@0: michael@0: /* clone data */ michael@0: if(aliasData!=NULL && aliasDataCapacity>=other->dataCapacity) { michael@0: isDataAllocated=FALSE; michael@0: } else { michael@0: aliasDataCapacity=other->dataCapacity; michael@0: aliasData=(uint32_t *)uprv_malloc(other->dataCapacity*4); michael@0: if(aliasData==NULL) { michael@0: return NULL; michael@0: } michael@0: isDataAllocated=TRUE; michael@0: } michael@0: michael@0: trie=utrie_open(fillIn, aliasData, aliasDataCapacity, michael@0: other->data[0], other->leadUnitValue, michael@0: other->isLatin1Linear); michael@0: if(trie==NULL) { michael@0: uprv_free(aliasData); michael@0: } else { michael@0: uprv_memcpy(trie->index, other->index, sizeof(trie->index)); michael@0: uprv_memcpy(trie->data, other->data, other->dataLength*4); michael@0: trie->dataLength=other->dataLength; michael@0: trie->isDataAllocated=isDataAllocated; michael@0: } michael@0: michael@0: return trie; michael@0: } michael@0: michael@0: U_CAPI void U_EXPORT2 michael@0: utrie_close(UNewTrie *trie) { michael@0: if(trie!=NULL) { michael@0: if(trie->isDataAllocated) { michael@0: uprv_free(trie->data); michael@0: trie->data=NULL; michael@0: } michael@0: if(trie->isAllocated) { michael@0: uprv_free(trie); michael@0: } michael@0: } michael@0: } michael@0: michael@0: U_CAPI uint32_t * U_EXPORT2 michael@0: utrie_getData(UNewTrie *trie, int32_t *pLength) { michael@0: if(trie==NULL || pLength==NULL) { michael@0: return NULL; michael@0: } michael@0: michael@0: *pLength=trie->dataLength; michael@0: return trie->data; michael@0: } michael@0: michael@0: static int32_t michael@0: utrie_allocDataBlock(UNewTrie *trie) { michael@0: int32_t newBlock, newTop; michael@0: michael@0: newBlock=trie->dataLength; michael@0: newTop=newBlock+UTRIE_DATA_BLOCK_LENGTH; michael@0: if(newTop>trie->dataCapacity) { michael@0: /* out of memory in the data array */ michael@0: return -1; michael@0: } michael@0: trie->dataLength=newTop; michael@0: return newBlock; michael@0: } michael@0: michael@0: /** michael@0: * No error checking for illegal arguments. michael@0: * michael@0: * @return -1 if no new data block available (out of memory in data array) michael@0: * @internal michael@0: */ michael@0: static int32_t michael@0: utrie_getDataBlock(UNewTrie *trie, UChar32 c) { michael@0: int32_t indexValue, newBlock; michael@0: michael@0: c>>=UTRIE_SHIFT; michael@0: indexValue=trie->index[c]; michael@0: if(indexValue>0) { michael@0: return indexValue; michael@0: } michael@0: michael@0: /* allocate a new data block */ michael@0: newBlock=utrie_allocDataBlock(trie); michael@0: if(newBlock<0) { michael@0: /* out of memory in the data array */ michael@0: return -1; michael@0: } michael@0: trie->index[c]=newBlock; michael@0: michael@0: /* copy-on-write for a block from a setRange() */ michael@0: uprv_memcpy(trie->data+newBlock, trie->data-indexValue, 4*UTRIE_DATA_BLOCK_LENGTH); michael@0: return newBlock; michael@0: } michael@0: michael@0: /** michael@0: * @return TRUE if the value was successfully set michael@0: */ michael@0: U_CAPI UBool U_EXPORT2 michael@0: utrie_set32(UNewTrie *trie, UChar32 c, uint32_t value) { michael@0: int32_t block; michael@0: michael@0: /* valid, uncompacted trie and valid c? */ michael@0: if(trie==NULL || trie->isCompacted || (uint32_t)c>0x10ffff) { michael@0: return FALSE; michael@0: } michael@0: michael@0: block=utrie_getDataBlock(trie, c); michael@0: if(block<0) { michael@0: return FALSE; michael@0: } michael@0: michael@0: trie->data[block+(c&UTRIE_MASK)]=value; michael@0: return TRUE; michael@0: } michael@0: michael@0: U_CAPI uint32_t U_EXPORT2 michael@0: utrie_get32(UNewTrie *trie, UChar32 c, UBool *pInBlockZero) { michael@0: int32_t block; michael@0: michael@0: /* valid, uncompacted trie and valid c? */ michael@0: if(trie==NULL || trie->isCompacted || (uint32_t)c>0x10ffff) { michael@0: if(pInBlockZero!=NULL) { michael@0: *pInBlockZero=TRUE; michael@0: } michael@0: return 0; michael@0: } michael@0: michael@0: block=trie->index[c>>UTRIE_SHIFT]; michael@0: if(pInBlockZero!=NULL) { michael@0: *pInBlockZero= (UBool)(block==0); michael@0: } michael@0: michael@0: return trie->data[ABS(block)+(c&UTRIE_MASK)]; michael@0: } michael@0: michael@0: /** michael@0: * @internal michael@0: */ michael@0: static void michael@0: utrie_fillBlock(uint32_t *block, UChar32 start, UChar32 limit, michael@0: uint32_t value, uint32_t initialValue, UBool overwrite) { michael@0: uint32_t *pLimit; michael@0: michael@0: pLimit=block+limit; michael@0: block+=start; michael@0: if(overwrite) { michael@0: while(blockisCompacted || michael@0: (uint32_t)start>0x10ffff || (uint32_t)limit>0x110000 || start>limit michael@0: ) { michael@0: return FALSE; michael@0: } michael@0: if(start==limit) { michael@0: return TRUE; /* nothing to do */ michael@0: } michael@0: michael@0: initialValue=trie->data[0]; michael@0: if(start&UTRIE_MASK) { michael@0: UChar32 nextStart; michael@0: michael@0: /* set partial block at [start..following block boundary[ */ michael@0: block=utrie_getDataBlock(trie, start); michael@0: if(block<0) { michael@0: return FALSE; michael@0: } michael@0: michael@0: nextStart=(start+UTRIE_DATA_BLOCK_LENGTH)&~UTRIE_MASK; michael@0: if(nextStart<=limit) { michael@0: utrie_fillBlock(trie->data+block, start&UTRIE_MASK, UTRIE_DATA_BLOCK_LENGTH, michael@0: value, initialValue, overwrite); michael@0: start=nextStart; michael@0: } else { michael@0: utrie_fillBlock(trie->data+block, start&UTRIE_MASK, limit&UTRIE_MASK, michael@0: value, initialValue, overwrite); michael@0: return TRUE; michael@0: } michael@0: } michael@0: michael@0: /* number of positions in the last, partial block */ michael@0: rest=limit&UTRIE_MASK; michael@0: michael@0: /* round down limit to a block boundary */ michael@0: limit&=~UTRIE_MASK; michael@0: michael@0: /* iterate over all-value blocks */ michael@0: if(value==initialValue) { michael@0: repeatBlock=0; michael@0: } else { michael@0: repeatBlock=-1; michael@0: } michael@0: while(startindex[start>>UTRIE_SHIFT]; michael@0: if(block>0) { michael@0: /* already allocated, fill in value */ michael@0: utrie_fillBlock(trie->data+block, 0, UTRIE_DATA_BLOCK_LENGTH, value, initialValue, overwrite); michael@0: } else if(trie->data[-block]!=value && (block==0 || overwrite)) { michael@0: /* set the repeatBlock instead of the current block 0 or range block */ michael@0: if(repeatBlock>=0) { michael@0: trie->index[start>>UTRIE_SHIFT]=-repeatBlock; michael@0: } else { michael@0: /* create and set and fill the repeatBlock */ michael@0: repeatBlock=utrie_getDataBlock(trie, start); michael@0: if(repeatBlock<0) { michael@0: return FALSE; michael@0: } michael@0: michael@0: /* set the negative block number to indicate that it is a repeat block */ michael@0: trie->index[start>>UTRIE_SHIFT]=-repeatBlock; michael@0: utrie_fillBlock(trie->data+repeatBlock, 0, UTRIE_DATA_BLOCK_LENGTH, value, initialValue, TRUE); michael@0: } michael@0: } michael@0: michael@0: start+=UTRIE_DATA_BLOCK_LENGTH; michael@0: } michael@0: michael@0: if(rest>0) { michael@0: /* set partial block at [last block boundary..limit[ */ michael@0: block=utrie_getDataBlock(trie, start); michael@0: if(block<0) { michael@0: return FALSE; michael@0: } michael@0: michael@0: utrie_fillBlock(trie->data+block, 0, rest, value, initialValue, overwrite); michael@0: } michael@0: michael@0: return TRUE; michael@0: } michael@0: michael@0: static int32_t michael@0: _findSameIndexBlock(const int32_t *idx, int32_t indexLength, michael@0: int32_t otherBlock) { michael@0: int32_t block, i; michael@0: michael@0: for(block=UTRIE_BMP_INDEX_LENGTH; blockindex; michael@0: michael@0: /* copy the lead surrogate indexes into a temporary array */ michael@0: uprv_memcpy(leadIndexes, idx+(0xd800>>UTRIE_SHIFT), 4*UTRIE_SURROGATE_BLOCK_COUNT); michael@0: michael@0: /* michael@0: * set all values for lead surrogate code *units* to leadUnitValue michael@0: * so that, by default, runtime lookups will find no data for associated michael@0: * supplementary code points, unless there is data for such code points michael@0: * which will result in a non-zero folding value below that is set for michael@0: * the respective lead units michael@0: * michael@0: * the above saved the indexes for surrogate code *points* michael@0: * fill the indexes with simplified code from utrie_setRange32() michael@0: */ michael@0: if(trie->leadUnitValue==trie->data[0]) { michael@0: block=0; /* leadUnitValue==initialValue, use all-initial-value block */ michael@0: } else { michael@0: /* create and fill the repeatBlock */ michael@0: block=utrie_allocDataBlock(trie); michael@0: if(block<0) { michael@0: /* data table overflow */ michael@0: *pErrorCode=U_MEMORY_ALLOCATION_ERROR; michael@0: return; michael@0: } michael@0: utrie_fillBlock(trie->data+block, 0, UTRIE_DATA_BLOCK_LENGTH, trie->leadUnitValue, trie->data[0], TRUE); michael@0: block=-block; /* negative block number to indicate that it is a repeat block */ michael@0: } michael@0: for(c=(0xd800>>UTRIE_SHIFT); c<(0xdc00>>UTRIE_SHIFT); ++c) { michael@0: trie->index[c]=block; michael@0: } michael@0: michael@0: /* michael@0: * Fold significant index values into the area just after the BMP indexes. michael@0: * In case the first lead surrogate has significant data, michael@0: * its index block must be used first (in which case the folding is a no-op). michael@0: * Later all folded index blocks are moved up one to insert the copied michael@0: * lead surrogate indexes. michael@0: */ michael@0: indexLength=UTRIE_BMP_INDEX_LENGTH; michael@0: michael@0: /* search for any index (stage 1) entries for supplementary code points */ michael@0: for(c=0x10000; c<0x110000;) { michael@0: if(idx[c>>UTRIE_SHIFT]!=0) { michael@0: /* there is data, treat the full block for a lead surrogate */ michael@0: c&=~0x3ff; michael@0: michael@0: #ifdef UTRIE_DEBUG michael@0: ++countLeadCUWithData; michael@0: /* printf("supplementary data for lead surrogate U+%04lx\n", (long)(0xd7c0+(c>>10))); */ michael@0: #endif michael@0: michael@0: /* is there an identical index block? */ michael@0: block=_findSameIndexBlock(idx, indexLength, c>>UTRIE_SHIFT); michael@0: michael@0: /* michael@0: * get a folded value for [c..c+0x400[ and, michael@0: * if different from the value for the lead surrogate code point, michael@0: * set it for the lead surrogate code unit michael@0: */ michael@0: value=getFoldedValue(trie, c, block+UTRIE_SURROGATE_BLOCK_COUNT); michael@0: if(value!=utrie_get32(trie, U16_LEAD(c), NULL)) { michael@0: if(!utrie_set32(trie, U16_LEAD(c), value)) { michael@0: /* data table overflow */ michael@0: *pErrorCode=U_MEMORY_ALLOCATION_ERROR; michael@0: return; michael@0: } michael@0: michael@0: /* if we did not find an identical index block... */ michael@0: if(block==indexLength) { michael@0: /* move the actual index (stage 1) entries from the supplementary position to the new one */ michael@0: uprv_memmove(idx+indexLength, michael@0: idx+(c>>UTRIE_SHIFT), michael@0: 4*UTRIE_SURROGATE_BLOCK_COUNT); michael@0: indexLength+=UTRIE_SURROGATE_BLOCK_COUNT; michael@0: } michael@0: } michael@0: c+=0x400; michael@0: } else { michael@0: c+=UTRIE_DATA_BLOCK_LENGTH; michael@0: } michael@0: } michael@0: #ifdef UTRIE_DEBUG michael@0: if(countLeadCUWithData>0) { michael@0: printf("supplementary data for %d lead surrogates\n", countLeadCUWithData); michael@0: } michael@0: #endif michael@0: michael@0: /* michael@0: * index array overflow? michael@0: * This is to guarantee that a folding offset is of the form michael@0: * UTRIE_BMP_INDEX_LENGTH+n*UTRIE_SURROGATE_BLOCK_COUNT with n=0..1023. michael@0: * If the index is too large, then n>=1024 and more than 10 bits are necessary. michael@0: * michael@0: * In fact, it can only ever become n==1024 with completely unfoldable data and michael@0: * the additional block of duplicated values for lead surrogates. michael@0: */ michael@0: if(indexLength>=UTRIE_MAX_INDEX_LENGTH) { michael@0: *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; michael@0: return; michael@0: } michael@0: michael@0: /* michael@0: * make space for the lead surrogate index block and michael@0: * insert it between the BMP indexes and the folded ones michael@0: */ michael@0: uprv_memmove(idx+UTRIE_BMP_INDEX_LENGTH+UTRIE_SURROGATE_BLOCK_COUNT, michael@0: idx+UTRIE_BMP_INDEX_LENGTH, michael@0: 4*(indexLength-UTRIE_BMP_INDEX_LENGTH)); michael@0: uprv_memcpy(idx+UTRIE_BMP_INDEX_LENGTH, michael@0: leadIndexes, michael@0: 4*UTRIE_SURROGATE_BLOCK_COUNT); michael@0: indexLength+=UTRIE_SURROGATE_BLOCK_COUNT; michael@0: michael@0: #ifdef UTRIE_DEBUG michael@0: printf("trie index count: BMP %ld all Unicode %ld folded %ld\n", michael@0: UTRIE_BMP_INDEX_LENGTH, (long)UTRIE_MAX_INDEX_LENGTH, indexLength); michael@0: #endif michael@0: michael@0: trie->indexLength=indexLength; michael@0: } michael@0: michael@0: /* michael@0: * Set a value in the trie index map to indicate which data block michael@0: * is referenced and which one is not. michael@0: * utrie_compact() will remove data blocks that are not used at all. michael@0: * Set michael@0: * - 0 if it is used michael@0: * - -1 if it is not used michael@0: */ michael@0: static void michael@0: _findUnusedBlocks(UNewTrie *trie) { michael@0: int32_t i; michael@0: michael@0: /* fill the entire map with "not used" */ michael@0: uprv_memset(trie->map, 0xff, (UTRIE_MAX_BUILD_TIME_DATA_LENGTH>>UTRIE_SHIFT)*4); michael@0: michael@0: /* mark each block that _is_ used with 0 */ michael@0: for(i=0; iindexLength; ++i) { michael@0: trie->map[ABS(trie->index[i])>>UTRIE_SHIFT]=0; michael@0: } michael@0: michael@0: /* never move the all-initial-value block 0 */ michael@0: trie->map[0]=0; michael@0: } michael@0: michael@0: static int32_t michael@0: _findSameDataBlock(const uint32_t *data, int32_t dataLength, michael@0: int32_t otherBlock, int32_t step) { michael@0: int32_t block; michael@0: michael@0: /* ensure that we do not even partially get past dataLength */ michael@0: dataLength-=UTRIE_DATA_BLOCK_LENGTH; michael@0: michael@0: for(block=0; block<=dataLength; block+=step) { michael@0: if(equal_uint32(data+block, data+otherBlock, UTRIE_DATA_BLOCK_LENGTH)) { michael@0: return block; michael@0: } michael@0: } michael@0: return -1; michael@0: } michael@0: michael@0: /* michael@0: * Compact a folded build-time trie. michael@0: * michael@0: * The compaction michael@0: * - removes blocks that are identical with earlier ones michael@0: * - overlaps adjacent blocks as much as possible (if overlap==TRUE) michael@0: * - moves blocks in steps of the data granularity michael@0: * - moves and overlaps blocks that overlap with multiple values in the overlap region michael@0: * michael@0: * It does not michael@0: * - try to move and overlap blocks that are not already adjacent michael@0: */ michael@0: static void michael@0: utrie_compact(UNewTrie *trie, UBool overlap, UErrorCode *pErrorCode) { michael@0: int32_t i, start, newStart, overlapStart; michael@0: michael@0: if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { michael@0: return; michael@0: } michael@0: michael@0: /* valid, uncompacted trie? */ michael@0: if(trie==NULL) { michael@0: *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; michael@0: return; michael@0: } michael@0: if(trie->isCompacted) { michael@0: return; /* nothing left to do */ michael@0: } michael@0: michael@0: /* compaction */ michael@0: michael@0: /* initialize the index map with "block is used/unused" flags */ michael@0: _findUnusedBlocks(trie); michael@0: michael@0: /* if Latin-1 is preallocated and linear, then do not compact Latin-1 data */ michael@0: if(trie->isLatin1Linear && UTRIE_SHIFT<=8) { michael@0: overlapStart=UTRIE_DATA_BLOCK_LENGTH+256; michael@0: } else { michael@0: overlapStart=UTRIE_DATA_BLOCK_LENGTH; michael@0: } michael@0: michael@0: newStart=UTRIE_DATA_BLOCK_LENGTH; michael@0: for(start=newStart; startdataLength;) { michael@0: /* michael@0: * start: index of first entry of current block michael@0: * newStart: index where the current block is to be moved michael@0: * (right after current end of already-compacted data) michael@0: */ michael@0: michael@0: /* skip blocks that are not used */ michael@0: if(trie->map[start>>UTRIE_SHIFT]<0) { michael@0: /* advance start to the next block */ michael@0: start+=UTRIE_DATA_BLOCK_LENGTH; michael@0: michael@0: /* leave newStart with the previous block! */ michael@0: continue; michael@0: } michael@0: michael@0: /* search for an identical block */ michael@0: if( start>=overlapStart && michael@0: (i=_findSameDataBlock(trie->data, newStart, start, michael@0: overlap ? UTRIE_DATA_GRANULARITY : UTRIE_DATA_BLOCK_LENGTH)) michael@0: >=0 michael@0: ) { michael@0: /* found an identical block, set the other block's index value for the current block */ michael@0: trie->map[start>>UTRIE_SHIFT]=i; michael@0: michael@0: /* advance start to the next block */ michael@0: start+=UTRIE_DATA_BLOCK_LENGTH; michael@0: michael@0: /* leave newStart with the previous block! */ michael@0: continue; michael@0: } michael@0: michael@0: /* see if the beginning of this block can be overlapped with the end of the previous block */ michael@0: if(overlap && start>=overlapStart) { michael@0: /* look for maximum overlap (modulo granularity) with the previous, adjacent block */ michael@0: for(i=UTRIE_DATA_BLOCK_LENGTH-UTRIE_DATA_GRANULARITY; michael@0: i>0 && !equal_uint32(trie->data+(newStart-i), trie->data+start, i); michael@0: i-=UTRIE_DATA_GRANULARITY) {} michael@0: } else { michael@0: i=0; michael@0: } michael@0: michael@0: if(i>0) { michael@0: /* some overlap */ michael@0: trie->map[start>>UTRIE_SHIFT]=newStart-i; michael@0: michael@0: /* move the non-overlapping indexes to their new positions */ michael@0: start+=i; michael@0: for(i=UTRIE_DATA_BLOCK_LENGTH-i; i>0; --i) { michael@0: trie->data[newStart++]=trie->data[start++]; michael@0: } michael@0: } else if(newStartmap[start>>UTRIE_SHIFT]=newStart; michael@0: for(i=UTRIE_DATA_BLOCK_LENGTH; i>0; --i) { michael@0: trie->data[newStart++]=trie->data[start++]; michael@0: } michael@0: } else /* no overlap && newStart==start */ { michael@0: trie->map[start>>UTRIE_SHIFT]=start; michael@0: newStart+=UTRIE_DATA_BLOCK_LENGTH; michael@0: start=newStart; michael@0: } michael@0: } michael@0: michael@0: /* now adjust the index (stage 1) table */ michael@0: for(i=0; iindexLength; ++i) { michael@0: trie->index[i]=trie->map[ABS(trie->index[i])>>UTRIE_SHIFT]; michael@0: } michael@0: michael@0: #ifdef UTRIE_DEBUG michael@0: /* we saved some space */ michael@0: printf("compacting trie: count of 32-bit words %lu->%lu\n", michael@0: (long)trie->dataLength, (long)newStart); michael@0: #endif michael@0: michael@0: trie->dataLength=newStart; michael@0: } michael@0: michael@0: /* serialization ------------------------------------------------------------ */ michael@0: michael@0: /* michael@0: * Default function for the folding value: michael@0: * Just store the offset (16 bits) if there is any non-initial-value entry. michael@0: * michael@0: * The offset parameter is never 0. michael@0: * Returning the offset itself is safe for UTRIE_SHIFT>=5 because michael@0: * for UTRIE_SHIFT==5 the maximum index length is UTRIE_MAX_INDEX_LENGTH==0x8800 michael@0: * which fits into 16-bit trie values; michael@0: * for higher UTRIE_SHIFT, UTRIE_MAX_INDEX_LENGTH decreases. michael@0: * michael@0: * Theoretically, it would be safer for all possible UTRIE_SHIFT including michael@0: * those of 4 and lower to return offset>>UTRIE_SURROGATE_BLOCK_BITS michael@0: * which would always result in a value of 0x40..0x43f michael@0: * (start/end 1k blocks of supplementary Unicode code points). michael@0: * However, this would be uglier, and would not work for some existing michael@0: * binary data file formats. michael@0: * michael@0: * Also, we do not plan to change UTRIE_SHIFT because it would change binary michael@0: * data file formats, and we would probably not make it smaller because of michael@0: * the then even larger BMP index length even for empty tries. michael@0: */ michael@0: static uint32_t U_CALLCONV michael@0: defaultGetFoldedValue(UNewTrie *trie, UChar32 start, int32_t offset) { michael@0: uint32_t value, initialValue; michael@0: UChar32 limit; michael@0: UBool inBlockZero; michael@0: michael@0: initialValue=trie->data[0]; michael@0: limit=start+0x400; michael@0: while(start0 && dt==NULL)) { michael@0: *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; michael@0: return 0; michael@0: } michael@0: if(getFoldedValue==NULL) { michael@0: getFoldedValue=defaultGetFoldedValue; michael@0: } michael@0: michael@0: data = (uint8_t*)dt; michael@0: /* fold and compact if necessary, also checks that indexLength is within limits */ michael@0: if(!trie->isCompacted) { michael@0: /* compact once without overlap to improve folding */ michael@0: utrie_compact(trie, FALSE, pErrorCode); michael@0: michael@0: /* fold the supplementary part of the index array */ michael@0: utrie_fold(trie, getFoldedValue, pErrorCode); michael@0: michael@0: /* compact again with overlap for minimum data array length */ michael@0: utrie_compact(trie, TRUE, pErrorCode); michael@0: michael@0: trie->isCompacted=TRUE; michael@0: if(U_FAILURE(*pErrorCode)) { michael@0: return 0; michael@0: } michael@0: } michael@0: michael@0: /* is dataLength within limits? */ michael@0: if( (reduceTo16Bits ? (trie->dataLength+trie->indexLength) : trie->dataLength) >= UTRIE_MAX_DATA_LENGTH) { michael@0: *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; michael@0: } michael@0: michael@0: length=sizeof(UTrieHeader)+2*trie->indexLength; michael@0: if(reduceTo16Bits) { michael@0: length+=2*trie->dataLength; michael@0: } else { michael@0: length+=4*trie->dataLength; michael@0: } michael@0: michael@0: if(length>capacity) { michael@0: return length; /* preflighting */ michael@0: } michael@0: michael@0: #ifdef UTRIE_DEBUG michael@0: printf("**UTrieLengths(serialize)** index:%6ld data:%6ld serialized:%6ld\n", michael@0: (long)trie->indexLength, (long)trie->dataLength, (long)length); michael@0: #endif michael@0: michael@0: /* set the header fields */ michael@0: header=(UTrieHeader *)data; michael@0: data+=sizeof(UTrieHeader); michael@0: michael@0: header->signature=0x54726965; /* "Trie" */ michael@0: header->options=UTRIE_SHIFT | (UTRIE_INDEX_SHIFT<options|=UTRIE_OPTIONS_DATA_IS_32_BIT; michael@0: } michael@0: if(trie->isLatin1Linear) { michael@0: header->options|=UTRIE_OPTIONS_LATIN1_IS_LINEAR; michael@0: } michael@0: michael@0: header->indexLength=trie->indexLength; michael@0: header->dataLength=trie->dataLength; michael@0: michael@0: /* write the index (stage 1) array and the 16/32-bit data (stage 2) array */ michael@0: if(reduceTo16Bits) { michael@0: /* write 16-bit index values shifted right by UTRIE_INDEX_SHIFT, after adding indexLength */ michael@0: p=(uint32_t *)trie->index; michael@0: dest16=(uint16_t *)data; michael@0: for(i=trie->indexLength; i>0; --i) { michael@0: *dest16++=(uint16_t)((*p++ + trie->indexLength)>>UTRIE_INDEX_SHIFT); michael@0: } michael@0: michael@0: /* write 16-bit data values */ michael@0: p=trie->data; michael@0: for(i=trie->dataLength; i>0; --i) { michael@0: *dest16++=(uint16_t)*p++; michael@0: } michael@0: } else { michael@0: /* write 16-bit index values shifted right by UTRIE_INDEX_SHIFT */ michael@0: p=(uint32_t *)trie->index; michael@0: dest16=(uint16_t *)data; michael@0: for(i=trie->indexLength; i>0; --i) { michael@0: *dest16++=(uint16_t)(*p++ >> UTRIE_INDEX_SHIFT); michael@0: } michael@0: michael@0: /* write 32-bit data values */ michael@0: uprv_memcpy(dest16, trie->data, 4*trie->dataLength); michael@0: } michael@0: michael@0: return length; michael@0: } michael@0: michael@0: /* inverse to defaultGetFoldedValue() */ michael@0: U_CAPI int32_t U_EXPORT2 michael@0: utrie_defaultGetFoldingOffset(uint32_t data) { michael@0: return (int32_t)data; michael@0: } michael@0: michael@0: U_CAPI int32_t U_EXPORT2 michael@0: utrie_unserialize(UTrie *trie, const void *data, int32_t length, UErrorCode *pErrorCode) { michael@0: const UTrieHeader *header; michael@0: const uint16_t *p16; michael@0: uint32_t options; michael@0: michael@0: if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { michael@0: return -1; michael@0: } michael@0: michael@0: /* enough data for a trie header? */ michael@0: if(length<(int32_t)sizeof(UTrieHeader)) { michael@0: *pErrorCode=U_INVALID_FORMAT_ERROR; michael@0: return -1; michael@0: } michael@0: michael@0: /* check the signature */ michael@0: header=(const UTrieHeader *)data; michael@0: if(header->signature!=0x54726965) { michael@0: *pErrorCode=U_INVALID_FORMAT_ERROR; michael@0: return -1; michael@0: } michael@0: michael@0: /* get the options and check the shift values */ michael@0: options=header->options; michael@0: if( (options&UTRIE_OPTIONS_SHIFT_MASK)!=UTRIE_SHIFT || michael@0: ((options>>UTRIE_OPTIONS_INDEX_SHIFT)&UTRIE_OPTIONS_SHIFT_MASK)!=UTRIE_INDEX_SHIFT michael@0: ) { michael@0: *pErrorCode=U_INVALID_FORMAT_ERROR; michael@0: return -1; michael@0: } michael@0: trie->isLatin1Linear= (UBool)((options&UTRIE_OPTIONS_LATIN1_IS_LINEAR)!=0); michael@0: michael@0: /* get the length values */ michael@0: trie->indexLength=header->indexLength; michael@0: trie->dataLength=header->dataLength; michael@0: michael@0: length-=(int32_t)sizeof(UTrieHeader); michael@0: michael@0: /* enough data for the index? */ michael@0: if(length<2*trie->indexLength) { michael@0: *pErrorCode=U_INVALID_FORMAT_ERROR; michael@0: return -1; michael@0: } michael@0: p16=(const uint16_t *)(header+1); michael@0: trie->index=p16; michael@0: p16+=trie->indexLength; michael@0: length-=2*trie->indexLength; michael@0: michael@0: /* get the data */ michael@0: if(options&UTRIE_OPTIONS_DATA_IS_32_BIT) { michael@0: if(length<4*trie->dataLength) { michael@0: *pErrorCode=U_INVALID_FORMAT_ERROR; michael@0: return -1; michael@0: } michael@0: trie->data32=(const uint32_t *)p16; michael@0: trie->initialValue=trie->data32[0]; michael@0: length=(int32_t)sizeof(UTrieHeader)+2*trie->indexLength+4*trie->dataLength; michael@0: } else { michael@0: if(length<2*trie->dataLength) { michael@0: *pErrorCode=U_INVALID_FORMAT_ERROR; michael@0: return -1; michael@0: } michael@0: michael@0: /* the "data16" data is used via the index pointer */ michael@0: trie->data32=NULL; michael@0: trie->initialValue=trie->index[trie->indexLength]; michael@0: length=(int32_t)sizeof(UTrieHeader)+2*trie->indexLength+2*trie->dataLength; michael@0: } michael@0: michael@0: trie->getFoldingOffset=utrie_defaultGetFoldingOffset; michael@0: michael@0: return length; michael@0: } michael@0: michael@0: U_CAPI int32_t U_EXPORT2 michael@0: utrie_unserializeDummy(UTrie *trie, michael@0: void *data, int32_t length, michael@0: uint32_t initialValue, uint32_t leadUnitValue, michael@0: UBool make16BitTrie, michael@0: UErrorCode *pErrorCode) { michael@0: uint16_t *p16; michael@0: int32_t actualLength, latin1Length, i, limit; michael@0: uint16_t block; michael@0: michael@0: if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { michael@0: return -1; michael@0: } michael@0: michael@0: /* calculate the actual size of the dummy trie data */ michael@0: michael@0: /* max(Latin-1, block 0) */ michael@0: latin1Length= 256; /*UTRIE_SHIFT<=8 ? 256 : UTRIE_DATA_BLOCK_LENGTH;*/ michael@0: michael@0: trie->indexLength=UTRIE_BMP_INDEX_LENGTH+UTRIE_SURROGATE_BLOCK_COUNT; michael@0: trie->dataLength=latin1Length; michael@0: if(leadUnitValue!=initialValue) { michael@0: trie->dataLength+=UTRIE_DATA_BLOCK_LENGTH; michael@0: } michael@0: michael@0: actualLength=trie->indexLength*2; michael@0: if(make16BitTrie) { michael@0: actualLength+=trie->dataLength*2; michael@0: } else { michael@0: actualLength+=trie->dataLength*4; michael@0: } michael@0: michael@0: /* enough space for the dummy trie? */ michael@0: if(lengthisLatin1Linear=TRUE; michael@0: trie->initialValue=initialValue; michael@0: michael@0: /* fill the index and data arrays */ michael@0: p16=(uint16_t *)data; michael@0: trie->index=p16; michael@0: michael@0: if(make16BitTrie) { michael@0: /* indexes to block 0 */ michael@0: block=(uint16_t)(trie->indexLength>>UTRIE_INDEX_SHIFT); michael@0: limit=trie->indexLength; michael@0: for(i=0; i>UTRIE_INDEX_SHIFT); michael@0: i=0xd800>>UTRIE_SHIFT; michael@0: limit=0xdc00>>UTRIE_SHIFT; michael@0: for(; idata32=NULL; michael@0: michael@0: /* Latin-1 data */ michael@0: p16+=trie->indexLength; michael@0: for(i=0; iindexLength*2); michael@0: michael@0: if(leadUnitValue!=initialValue) { michael@0: /* indexes for lead surrogate code units to the block after Latin-1 */ michael@0: block=(uint16_t)(latin1Length>>UTRIE_INDEX_SHIFT); michael@0: i=0xd800>>UTRIE_SHIFT; michael@0: limit=0xdc00>>UTRIE_SHIFT; michael@0: for(; idata32=p32=(uint32_t *)(p16+trie->indexLength); michael@0: michael@0: /* Latin-1 data */ michael@0: for(i=0; igetFoldingOffset=utrie_defaultGetFoldingOffset; michael@0: michael@0: return actualLength; michael@0: } michael@0: michael@0: /* enumeration -------------------------------------------------------------- */ michael@0: michael@0: /* default UTrieEnumValue() returns the input value itself */ michael@0: static uint32_t U_CALLCONV michael@0: enumSameValue(const void * /*context*/, uint32_t value) { michael@0: return value; michael@0: } michael@0: michael@0: /** michael@0: * Enumerate all ranges of code points with the same relevant values. michael@0: * The values are transformed from the raw trie entries by the enumValue function. michael@0: */ michael@0: U_CAPI void U_EXPORT2 michael@0: utrie_enum(const UTrie *trie, michael@0: UTrieEnumValue *enumValue, UTrieEnumRange *enumRange, const void *context) { michael@0: const uint32_t *data32; michael@0: const uint16_t *idx; michael@0: michael@0: uint32_t value, prevValue, initialValue; michael@0: UChar32 c, prev; michael@0: int32_t l, i, j, block, prevBlock, nullBlock, offset; michael@0: michael@0: /* check arguments */ michael@0: if(trie==NULL || trie->index==NULL || enumRange==NULL) { michael@0: return; michael@0: } michael@0: if(enumValue==NULL) { michael@0: enumValue=enumSameValue; michael@0: } michael@0: michael@0: idx=trie->index; michael@0: data32=trie->data32; michael@0: michael@0: /* get the enumeration value that corresponds to an initial-value trie data entry */ michael@0: initialValue=enumValue(context, trie->initialValue); michael@0: michael@0: if(data32==NULL) { michael@0: nullBlock=trie->indexLength; michael@0: } else { michael@0: nullBlock=0; michael@0: } michael@0: michael@0: /* set variables for previous range */ michael@0: prevBlock=nullBlock; michael@0: prev=0; michael@0: prevValue=initialValue; michael@0: michael@0: /* enumerate BMP - the main loop enumerates data blocks */ michael@0: for(i=0, c=0; c<=0xffff; ++i) { michael@0: if(c==0xd800) { michael@0: /* skip lead surrogate code _units_, go to lead surr. code _points_ */ michael@0: i=UTRIE_BMP_INDEX_LENGTH; michael@0: } else if(c==0xdc00) { michael@0: /* go back to regular BMP code points */ michael@0: i=c>>UTRIE_SHIFT; michael@0: } michael@0: michael@0: block=idx[i]<0) { michael@0: /* the block is not filled with all the same value */ michael@0: prevBlock=-1; michael@0: } michael@0: prev=c; michael@0: prevValue=value; michael@0: } michael@0: ++c; michael@0: } michael@0: } michael@0: } michael@0: michael@0: /* enumerate supplementary code points */ michael@0: for(l=0xd800; l<0xdc00;) { michael@0: /* lead surrogate access */ michael@0: offset=idx[l>>UTRIE_SHIFT]<getFoldingOffset(value); michael@0: if(offset<=0) { michael@0: /* no data for this lead surrogate */ michael@0: if(prevValue!=initialValue) { michael@0: if(prev0) { michael@0: /* the block is not filled with all the same value */ michael@0: prevBlock=-1; michael@0: } michael@0: prev=c; michael@0: prevValue=value; michael@0: } michael@0: ++c; michael@0: } michael@0: } michael@0: } while(++i