michael@0: /* michael@0: ****************************************************************************** michael@0: * michael@0: * Copyright (C) 2001-2013, International Business Machines michael@0: * Corporation and others. All Rights Reserved. michael@0: * michael@0: ****************************************************************************** michael@0: * file name: utrie2.cpp michael@0: * encoding: US-ASCII michael@0: * tab size: 8 (not used) michael@0: * indentation:4 michael@0: * michael@0: * created on: 2008aug16 (starting from a copy of utrie.c) michael@0: * created by: Markus W. Scherer michael@0: * michael@0: * This is a common implementation of a Unicode trie. michael@0: * It is a kind of compressed, serializable table of 16- or 32-bit values associated with michael@0: * Unicode code points (0..0x10ffff). michael@0: * This is the second common version of a Unicode trie (hence the name UTrie2). michael@0: * See utrie2.h for a comparison. michael@0: * michael@0: * This file contains only the runtime and enumeration code, for read-only access. michael@0: * See utrie2_builder.c for the builder code. michael@0: */ michael@0: #ifdef UTRIE2_DEBUG michael@0: # include michael@0: #endif michael@0: michael@0: #include "unicode/utypes.h" michael@0: #include "unicode/utf.h" michael@0: #include "unicode/utf8.h" michael@0: #include "unicode/utf16.h" michael@0: #include "cmemory.h" michael@0: #include "utrie2.h" michael@0: #include "utrie2_impl.h" michael@0: #include "uassert.h" michael@0: michael@0: /* Public UTrie2 API implementation ----------------------------------------- */ michael@0: michael@0: static uint32_t michael@0: get32(const UNewTrie2 *trie, UChar32 c, UBool fromLSCP) { michael@0: int32_t i2, block; michael@0: michael@0: if(c>=trie->highStart && (!U_IS_LEAD(c) || fromLSCP)) { michael@0: return trie->data[trie->dataLength-UTRIE2_DATA_GRANULARITY]; michael@0: } michael@0: michael@0: if(U_IS_LEAD(c) && fromLSCP) { michael@0: i2=(UTRIE2_LSCP_INDEX_2_OFFSET-(0xd800>>UTRIE2_SHIFT_2))+ michael@0: (c>>UTRIE2_SHIFT_2); michael@0: } else { michael@0: i2=trie->index1[c>>UTRIE2_SHIFT_1]+ michael@0: ((c>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK); michael@0: } michael@0: block=trie->index2[i2]; michael@0: return trie->data[block+(c&UTRIE2_DATA_MASK)]; michael@0: } michael@0: michael@0: U_CAPI uint32_t U_EXPORT2 michael@0: utrie2_get32(const UTrie2 *trie, UChar32 c) { michael@0: if(trie->data16!=NULL) { michael@0: return UTRIE2_GET16(trie, c); michael@0: } else if(trie->data32!=NULL) { michael@0: return UTRIE2_GET32(trie, c); michael@0: } else if((uint32_t)c>0x10ffff) { michael@0: return trie->errorValue; michael@0: } else { michael@0: return get32(trie->newTrie, c, TRUE); michael@0: } michael@0: } michael@0: michael@0: U_CAPI uint32_t U_EXPORT2 michael@0: utrie2_get32FromLeadSurrogateCodeUnit(const UTrie2 *trie, UChar32 c) { michael@0: if(!U_IS_LEAD(c)) { michael@0: return trie->errorValue; michael@0: } michael@0: if(trie->data16!=NULL) { michael@0: return UTRIE2_GET16_FROM_U16_SINGLE_LEAD(trie, c); michael@0: } else if(trie->data32!=NULL) { michael@0: return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c); michael@0: } else { michael@0: return get32(trie->newTrie, c, FALSE); michael@0: } michael@0: } michael@0: michael@0: static inline int32_t michael@0: u8Index(const UTrie2 *trie, UChar32 c, int32_t i) { michael@0: int32_t idx= michael@0: _UTRIE2_INDEX_FROM_CP( michael@0: trie, michael@0: trie->data32==NULL ? trie->indexLength : 0, michael@0: c); michael@0: return (idx<<3)|i; michael@0: } michael@0: michael@0: U_CAPI int32_t U_EXPORT2 michael@0: utrie2_internalU8NextIndex(const UTrie2 *trie, UChar32 c, michael@0: const uint8_t *src, const uint8_t *limit) { michael@0: int32_t i, length; michael@0: i=0; michael@0: /* support 64-bit pointers by avoiding cast of arbitrary difference */ michael@0: if((limit-src)<=7) { michael@0: length=(int32_t)(limit-src); michael@0: } else { michael@0: length=7; michael@0: } michael@0: c=utf8_nextCharSafeBody(src, &i, length, c, -1); michael@0: return u8Index(trie, c, i); michael@0: } michael@0: michael@0: U_CAPI int32_t U_EXPORT2 michael@0: utrie2_internalU8PrevIndex(const UTrie2 *trie, UChar32 c, michael@0: const uint8_t *start, const uint8_t *src) { michael@0: int32_t i, length; michael@0: /* support 64-bit pointers by avoiding cast of arbitrary difference */ michael@0: if((src-start)<=7) { michael@0: i=length=(int32_t)(src-start); michael@0: } else { michael@0: i=length=7; michael@0: start=src-7; michael@0: } michael@0: c=utf8_prevCharSafeBody(start, 0, &i, c, -1); michael@0: i=length-i; /* number of bytes read backward from src */ michael@0: return u8Index(trie, c, i); michael@0: } michael@0: michael@0: U_CAPI UTrie2 * U_EXPORT2 michael@0: utrie2_openFromSerialized(UTrie2ValueBits valueBits, michael@0: const void *data, int32_t length, int32_t *pActualLength, michael@0: UErrorCode *pErrorCode) { michael@0: const UTrie2Header *header; michael@0: const uint16_t *p16; michael@0: int32_t actualLength; michael@0: michael@0: UTrie2 tempTrie; michael@0: UTrie2 *trie; michael@0: michael@0: if(U_FAILURE(*pErrorCode)) { michael@0: return 0; michael@0: } michael@0: michael@0: if( length<=0 || (U_POINTER_MASK_LSB(data, 3)!=0) || michael@0: valueBits<0 || UTRIE2_COUNT_VALUE_BITS<=valueBits michael@0: ) { michael@0: *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; michael@0: return 0; michael@0: } michael@0: michael@0: /* enough data for a trie header? */ michael@0: if(length<(int32_t)sizeof(UTrie2Header)) { michael@0: *pErrorCode=U_INVALID_FORMAT_ERROR; michael@0: return 0; michael@0: } michael@0: michael@0: /* check the signature */ michael@0: header=(const UTrie2Header *)data; michael@0: if(header->signature!=UTRIE2_SIG) { michael@0: *pErrorCode=U_INVALID_FORMAT_ERROR; michael@0: return 0; michael@0: } michael@0: michael@0: /* get the options */ michael@0: if(valueBits!=(UTrie2ValueBits)(header->options&UTRIE2_OPTIONS_VALUE_BITS_MASK)) { michael@0: *pErrorCode=U_INVALID_FORMAT_ERROR; michael@0: return 0; michael@0: } michael@0: michael@0: /* get the length values and offsets */ michael@0: uprv_memset(&tempTrie, 0, sizeof(tempTrie)); michael@0: tempTrie.indexLength=header->indexLength; michael@0: tempTrie.dataLength=header->shiftedDataLength<index2NullOffset; michael@0: tempTrie.dataNullOffset=header->dataNullOffset; michael@0: michael@0: tempTrie.highStart=header->shiftedHighStart<memory=(uint32_t *)data; michael@0: trie->length=actualLength; michael@0: trie->isMemoryOwned=FALSE; michael@0: michael@0: /* set the pointers to its index and data arrays */ michael@0: p16=(const uint16_t *)(header+1); michael@0: trie->index=p16; michael@0: p16+=trie->indexLength; michael@0: michael@0: /* get the data */ michael@0: switch(valueBits) { michael@0: case UTRIE2_16_VALUE_BITS: michael@0: trie->data16=p16; michael@0: trie->data32=NULL; michael@0: trie->initialValue=trie->index[trie->dataNullOffset]; michael@0: trie->errorValue=trie->data16[UTRIE2_BAD_UTF8_DATA_OFFSET]; michael@0: break; michael@0: case UTRIE2_32_VALUE_BITS: michael@0: trie->data16=NULL; michael@0: trie->data32=(const uint32_t *)p16; michael@0: trie->initialValue=trie->data32[trie->dataNullOffset]; michael@0: trie->errorValue=trie->data32[UTRIE2_BAD_UTF8_DATA_OFFSET]; michael@0: break; michael@0: default: michael@0: *pErrorCode=U_INVALID_FORMAT_ERROR; michael@0: return 0; michael@0: } michael@0: michael@0: if(pActualLength!=NULL) { michael@0: *pActualLength=actualLength; michael@0: } michael@0: return trie; michael@0: } michael@0: michael@0: U_CAPI UTrie2 * U_EXPORT2 michael@0: utrie2_openDummy(UTrie2ValueBits valueBits, michael@0: uint32_t initialValue, uint32_t errorValue, michael@0: UErrorCode *pErrorCode) { michael@0: UTrie2 *trie; michael@0: UTrie2Header *header; michael@0: uint32_t *p; michael@0: uint16_t *dest16; michael@0: int32_t indexLength, dataLength, length, i; michael@0: int32_t dataMove; /* >0 if the data is moved to the end of the index array */ michael@0: michael@0: if(U_FAILURE(*pErrorCode)) { michael@0: return 0; michael@0: } michael@0: michael@0: if(valueBits<0 || UTRIE2_COUNT_VALUE_BITS<=valueBits) { michael@0: *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; michael@0: return 0; michael@0: } michael@0: michael@0: /* calculate the total length of the dummy trie data */ michael@0: indexLength=UTRIE2_INDEX_1_OFFSET; michael@0: dataLength=UTRIE2_DATA_START_OFFSET+UTRIE2_DATA_GRANULARITY; michael@0: length=(int32_t)sizeof(UTrie2Header)+indexLength*2; michael@0: if(valueBits==UTRIE2_16_VALUE_BITS) { michael@0: length+=dataLength*2; michael@0: } else { michael@0: length+=dataLength*4; michael@0: } michael@0: michael@0: /* allocate the trie */ michael@0: trie=(UTrie2 *)uprv_malloc(sizeof(UTrie2)); michael@0: if(trie==NULL) { michael@0: *pErrorCode=U_MEMORY_ALLOCATION_ERROR; michael@0: return 0; michael@0: } michael@0: uprv_memset(trie, 0, sizeof(UTrie2)); michael@0: trie->memory=uprv_malloc(length); michael@0: if(trie->memory==NULL) { michael@0: uprv_free(trie); michael@0: *pErrorCode=U_MEMORY_ALLOCATION_ERROR; michael@0: return 0; michael@0: } michael@0: trie->length=length; michael@0: trie->isMemoryOwned=TRUE; michael@0: michael@0: /* set the UTrie2 fields */ michael@0: if(valueBits==UTRIE2_16_VALUE_BITS) { michael@0: dataMove=indexLength; michael@0: } else { michael@0: dataMove=0; michael@0: } michael@0: michael@0: trie->indexLength=indexLength; michael@0: trie->dataLength=dataLength; michael@0: trie->index2NullOffset=UTRIE2_INDEX_2_OFFSET; michael@0: trie->dataNullOffset=(uint16_t)dataMove; michael@0: trie->initialValue=initialValue; michael@0: trie->errorValue=errorValue; michael@0: trie->highStart=0; michael@0: trie->highValueIndex=dataMove+UTRIE2_DATA_START_OFFSET; michael@0: michael@0: /* set the header fields */ michael@0: header=(UTrie2Header *)trie->memory; michael@0: michael@0: header->signature=UTRIE2_SIG; /* "Tri2" */ michael@0: header->options=(uint16_t)valueBits; michael@0: michael@0: header->indexLength=(uint16_t)indexLength; michael@0: header->shiftedDataLength=(uint16_t)(dataLength>>UTRIE2_INDEX_SHIFT); michael@0: header->index2NullOffset=(uint16_t)UTRIE2_INDEX_2_OFFSET; michael@0: header->dataNullOffset=(uint16_t)dataMove; michael@0: header->shiftedHighStart=0; michael@0: michael@0: /* fill the index and data arrays */ michael@0: dest16=(uint16_t *)(header+1); michael@0: trie->index=dest16; michael@0: michael@0: /* write the index-2 array values shifted right by UTRIE2_INDEX_SHIFT */ michael@0: for(i=0; i>UTRIE2_INDEX_SHIFT); /* null data block */ michael@0: } michael@0: michael@0: /* write UTF-8 2-byte index-2 values, not right-shifted */ michael@0: for(i=0; i<(0xc2-0xc0); ++i) { /* C0..C1 */ michael@0: *dest16++=(uint16_t)(dataMove+UTRIE2_BAD_UTF8_DATA_OFFSET); michael@0: } michael@0: for(; i<(0xe0-0xc0); ++i) { /* C2..DF */ michael@0: *dest16++=(uint16_t)dataMove; michael@0: } michael@0: michael@0: /* write the 16/32-bit data array */ michael@0: switch(valueBits) { michael@0: case UTRIE2_16_VALUE_BITS: michael@0: /* write 16-bit data values */ michael@0: trie->data16=dest16; michael@0: trie->data32=NULL; michael@0: for(i=0; i<0x80; ++i) { michael@0: *dest16++=(uint16_t)initialValue; michael@0: } michael@0: for(; i<0xc0; ++i) { michael@0: *dest16++=(uint16_t)errorValue; michael@0: } michael@0: /* highValue and reserved values */ michael@0: for(i=0; idata16=NULL; michael@0: trie->data32=p; michael@0: for(i=0; i<0x80; ++i) { michael@0: *p++=initialValue; michael@0: } michael@0: for(; i<0xc0; ++i) { michael@0: *p++=errorValue; michael@0: } michael@0: /* highValue and reserved values */ michael@0: for(i=0; iisMemoryOwned) { michael@0: uprv_free(trie->memory); michael@0: } michael@0: if(trie->newTrie!=NULL) { michael@0: uprv_free(trie->newTrie->data); michael@0: uprv_free(trie->newTrie); michael@0: } michael@0: uprv_free(trie); michael@0: } michael@0: } michael@0: michael@0: U_CAPI int32_t U_EXPORT2 michael@0: utrie2_getVersion(const void *data, int32_t length, UBool anyEndianOk) { michael@0: uint32_t signature; michael@0: if(length<16 || data==NULL || (U_POINTER_MASK_LSB(data, 3)!=0)) { michael@0: return 0; michael@0: } michael@0: signature=*(const uint32_t *)data; michael@0: if(signature==UTRIE2_SIG) { michael@0: return 2; michael@0: } michael@0: if(anyEndianOk && signature==UTRIE2_OE_SIG) { michael@0: return 2; michael@0: } michael@0: if(signature==UTRIE_SIG) { michael@0: return 1; michael@0: } michael@0: if(anyEndianOk && signature==UTRIE_OE_SIG) { michael@0: return 1; michael@0: } michael@0: return 0; michael@0: } michael@0: michael@0: U_CAPI int32_t U_EXPORT2 michael@0: utrie2_swap(const UDataSwapper *ds, michael@0: const void *inData, int32_t length, void *outData, michael@0: UErrorCode *pErrorCode) { michael@0: const UTrie2Header *inTrie; michael@0: UTrie2Header trie; michael@0: int32_t dataLength, size; michael@0: UTrie2ValueBits valueBits; michael@0: michael@0: if(U_FAILURE(*pErrorCode)) { michael@0: return 0; michael@0: } michael@0: if(ds==NULL || inData==NULL || (length>=0 && outData==NULL)) { michael@0: *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; michael@0: return 0; michael@0: } michael@0: michael@0: /* setup and swapping */ michael@0: if(length>=0 && length<(int32_t)sizeof(UTrie2Header)) { michael@0: *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; michael@0: return 0; michael@0: } michael@0: michael@0: inTrie=(const UTrie2Header *)inData; michael@0: trie.signature=ds->readUInt32(inTrie->signature); michael@0: trie.options=ds->readUInt16(inTrie->options); michael@0: trie.indexLength=ds->readUInt16(inTrie->indexLength); michael@0: trie.shiftedDataLength=ds->readUInt16(inTrie->shiftedDataLength); michael@0: michael@0: valueBits=(UTrie2ValueBits)(trie.options&UTRIE2_OPTIONS_VALUE_BITS_MASK); michael@0: dataLength=(int32_t)trie.shiftedDataLength<=0) { michael@0: UTrie2Header *outTrie; michael@0: michael@0: if(lengthswapArray32(ds, &inTrie->signature, 4, &outTrie->signature, pErrorCode); michael@0: ds->swapArray16(ds, &inTrie->options, 12, &outTrie->options, pErrorCode); michael@0: michael@0: /* swap the index and the data */ michael@0: switch(valueBits) { michael@0: case UTRIE2_16_VALUE_BITS: michael@0: ds->swapArray16(ds, inTrie+1, (trie.indexLength+dataLength)*2, outTrie+1, pErrorCode); michael@0: break; michael@0: case UTRIE2_32_VALUE_BITS: michael@0: ds->swapArray16(ds, inTrie+1, trie.indexLength*2, outTrie+1, pErrorCode); michael@0: ds->swapArray32(ds, (const uint16_t *)(inTrie+1)+trie.indexLength, dataLength*4, michael@0: (uint16_t *)(outTrie+1)+trie.indexLength, pErrorCode); michael@0: break; michael@0: default: michael@0: *pErrorCode=U_INVALID_FORMAT_ERROR; michael@0: return 0; michael@0: } michael@0: } michael@0: michael@0: return size; michael@0: } michael@0: michael@0: // utrie2_swapAnyVersion() should be defined here but lives in utrie2_builder.c michael@0: // to avoid a dependency from utrie2.cpp on utrie.c. michael@0: michael@0: /* enumeration -------------------------------------------------------------- */ michael@0: michael@0: #define MIN_VALUE(a, b) ((a)<(b) ? (a) : (b)) michael@0: michael@0: /* default UTrie2EnumValue() returns the input value itself */ michael@0: static uint32_t U_CALLCONV michael@0: enumSameValue(const void * /*context*/, uint32_t value) { michael@0: return value; michael@0: } michael@0: michael@0: /** michael@0: * Enumerate all ranges of code points with the same relevant values. michael@0: * The values are transformed from the raw trie entries by the enumValue function. michael@0: * michael@0: * Currently requires startnewTrie==NULL) { michael@0: /* frozen trie */ michael@0: idx=trie->index; michael@0: U_ASSERT(idx!=NULL); /* the following code assumes trie->newTrie is not NULL when idx is NULL */ michael@0: data32=trie->data32; michael@0: michael@0: index2NullOffset=trie->index2NullOffset; michael@0: nullBlock=trie->dataNullOffset; michael@0: } else { michael@0: /* unfrozen, mutable trie */ michael@0: idx=NULL; michael@0: data32=trie->newTrie->data; michael@0: U_ASSERT(data32!=NULL); /* the following code assumes idx is not NULL when data32 is NULL */ michael@0: michael@0: index2NullOffset=trie->newTrie->index2NullOffset; michael@0: nullBlock=trie->newTrie->dataNullOffset; michael@0: } michael@0: michael@0: highStart=trie->highStart; michael@0: michael@0: /* get the enumeration value that corresponds to an initial-value trie data entry */ michael@0: initialValue=enumValue(context, trie->initialValue); michael@0: michael@0: /* set variables for previous range */ michael@0: prevI2Block=-1; michael@0: prevBlock=-1; michael@0: prev=start; michael@0: prevValue=0; michael@0: michael@0: /* enumerate index-2 blocks */ michael@0: for(c=start; c>UTRIE2_SHIFT_2; michael@0: } else if(U_IS_SURROGATE_LEAD(c)) { michael@0: /* michael@0: * Enumerate values for lead surrogate code points, not code units: michael@0: * This special block has half the normal length. michael@0: */ michael@0: i2Block=UTRIE2_LSCP_INDEX_2_OFFSET; michael@0: tempLimit=MIN_VALUE(0xdc00, limit); michael@0: } else { michael@0: /* michael@0: * Switch back to the normal part of the index-2 table. michael@0: * Enumerate the second half of the surrogates block. michael@0: */ michael@0: i2Block=0xd800>>UTRIE2_SHIFT_2; michael@0: tempLimit=MIN_VALUE(0xe000, limit); michael@0: } michael@0: } else { michael@0: /* supplementary code points */ michael@0: if(idx!=NULL) { michael@0: i2Block=idx[(UTRIE2_INDEX_1_OFFSET-UTRIE2_OMITTED_BMP_INDEX_1_LENGTH)+ michael@0: (c>>UTRIE2_SHIFT_1)]; michael@0: } else { michael@0: i2Block=trie->newTrie->index1[c>>UTRIE2_SHIFT_1]; michael@0: } michael@0: if(i2Block==prevI2Block && (c-prev)>=UTRIE2_CP_PER_INDEX_1_ENTRY) { michael@0: /* michael@0: * The index-2 block is the same as the previous one, and filled with prevValue. michael@0: * Only possible for supplementary code points because the linear-BMP index-2 michael@0: * table creates unique i2Block values. michael@0: */ michael@0: c+=UTRIE2_CP_PER_INDEX_1_ENTRY; michael@0: continue; michael@0: } michael@0: } michael@0: prevI2Block=i2Block; michael@0: if(i2Block==index2NullOffset) { michael@0: /* this is the null index-2 block */ michael@0: if(prevValue!=initialValue) { michael@0: if(prev>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK; michael@0: if((c>>UTRIE2_SHIFT_1)==(tempLimit>>UTRIE2_SHIFT_1)) { michael@0: i2Limit=(tempLimit>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK; michael@0: } else { michael@0: i2Limit=UTRIE2_INDEX_2_BLOCK_LENGTH; michael@0: } michael@0: for(; i2newTrie->index2[i2Block+i2]; michael@0: } michael@0: if(block==prevBlock && (c-prev)>=UTRIE2_DATA_BLOCK_LENGTH) { michael@0: /* the block is the same as the previous one, and filled with prevValue */ michael@0: c+=UTRIE2_DATA_BLOCK_LENGTH; michael@0: continue; michael@0: } michael@0: prevBlock=block; michael@0: if(block==nullBlock) { michael@0: /* this is the null data block */ michael@0: if(prevValue!=initialValue) { michael@0: if(prevlimit) { michael@0: c=limit; /* could be higher if in the index2NullOffset */ michael@0: } else if(chighValueIndex] : michael@0: idx[trie->highValueIndex]; michael@0: } else { michael@0: highValue=trie->newTrie->data[trie->newTrie->dataLength-UTRIE2_DATA_GRANULARITY]; michael@0: } michael@0: value=enumValue(context, highValue); michael@0: if(value!=prevValue) { michael@0: if(prev=codePointStart) { michael@0: codePoint=U_SENTINEL; michael@0: return 0; michael@0: } michael@0: uint16_t result; michael@0: UTRIE2_U16_PREV16(trie, start, codePointStart, codePoint, result); michael@0: return result; michael@0: } michael@0: michael@0: uint16_t ForwardUTrie2StringIterator::next16() { michael@0: codePointStart=codePointLimit; michael@0: if(codePointLimit==limit) { michael@0: codePoint=U_SENTINEL; michael@0: return 0; michael@0: } michael@0: uint16_t result; michael@0: UTRIE2_U16_NEXT16(trie, codePointLimit, limit, codePoint, result); michael@0: return result; michael@0: } michael@0: michael@0: U_NAMESPACE_END