Wed, 31 Dec 2014 07:22:50 +0100
Correct previous dual key logic pending first delivery installment.
michael@0 | 1 | /* |
michael@0 | 2 | ****************************************************************************** |
michael@0 | 3 | * |
michael@0 | 4 | * Copyright (C) 2001-2013, International Business Machines |
michael@0 | 5 | * Corporation and others. All Rights Reserved. |
michael@0 | 6 | * |
michael@0 | 7 | ****************************************************************************** |
michael@0 | 8 | * file name: utrie2.cpp |
michael@0 | 9 | * encoding: US-ASCII |
michael@0 | 10 | * tab size: 8 (not used) |
michael@0 | 11 | * indentation:4 |
michael@0 | 12 | * |
michael@0 | 13 | * created on: 2008aug16 (starting from a copy of utrie.c) |
michael@0 | 14 | * created by: Markus W. Scherer |
michael@0 | 15 | * |
michael@0 | 16 | * This is a common implementation of a Unicode trie. |
michael@0 | 17 | * It is a kind of compressed, serializable table of 16- or 32-bit values associated with |
michael@0 | 18 | * Unicode code points (0..0x10ffff). |
michael@0 | 19 | * This is the second common version of a Unicode trie (hence the name UTrie2). |
michael@0 | 20 | * See utrie2.h for a comparison. |
michael@0 | 21 | * |
michael@0 | 22 | * This file contains only the runtime and enumeration code, for read-only access. |
michael@0 | 23 | * See utrie2_builder.c for the builder code. |
michael@0 | 24 | */ |
michael@0 | 25 | #ifdef UTRIE2_DEBUG |
michael@0 | 26 | # include <stdio.h> |
michael@0 | 27 | #endif |
michael@0 | 28 | |
michael@0 | 29 | #include "unicode/utypes.h" |
michael@0 | 30 | #include "unicode/utf.h" |
michael@0 | 31 | #include "unicode/utf8.h" |
michael@0 | 32 | #include "unicode/utf16.h" |
michael@0 | 33 | #include "cmemory.h" |
michael@0 | 34 | #include "utrie2.h" |
michael@0 | 35 | #include "utrie2_impl.h" |
michael@0 | 36 | #include "uassert.h" |
michael@0 | 37 | |
michael@0 | 38 | /* Public UTrie2 API implementation ----------------------------------------- */ |
michael@0 | 39 | |
michael@0 | 40 | static uint32_t |
michael@0 | 41 | get32(const UNewTrie2 *trie, UChar32 c, UBool fromLSCP) { |
michael@0 | 42 | int32_t i2, block; |
michael@0 | 43 | |
michael@0 | 44 | if(c>=trie->highStart && (!U_IS_LEAD(c) || fromLSCP)) { |
michael@0 | 45 | return trie->data[trie->dataLength-UTRIE2_DATA_GRANULARITY]; |
michael@0 | 46 | } |
michael@0 | 47 | |
michael@0 | 48 | if(U_IS_LEAD(c) && fromLSCP) { |
michael@0 | 49 | i2=(UTRIE2_LSCP_INDEX_2_OFFSET-(0xd800>>UTRIE2_SHIFT_2))+ |
michael@0 | 50 | (c>>UTRIE2_SHIFT_2); |
michael@0 | 51 | } else { |
michael@0 | 52 | i2=trie->index1[c>>UTRIE2_SHIFT_1]+ |
michael@0 | 53 | ((c>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK); |
michael@0 | 54 | } |
michael@0 | 55 | block=trie->index2[i2]; |
michael@0 | 56 | return trie->data[block+(c&UTRIE2_DATA_MASK)]; |
michael@0 | 57 | } |
michael@0 | 58 | |
michael@0 | 59 | U_CAPI uint32_t U_EXPORT2 |
michael@0 | 60 | utrie2_get32(const UTrie2 *trie, UChar32 c) { |
michael@0 | 61 | if(trie->data16!=NULL) { |
michael@0 | 62 | return UTRIE2_GET16(trie, c); |
michael@0 | 63 | } else if(trie->data32!=NULL) { |
michael@0 | 64 | return UTRIE2_GET32(trie, c); |
michael@0 | 65 | } else if((uint32_t)c>0x10ffff) { |
michael@0 | 66 | return trie->errorValue; |
michael@0 | 67 | } else { |
michael@0 | 68 | return get32(trie->newTrie, c, TRUE); |
michael@0 | 69 | } |
michael@0 | 70 | } |
michael@0 | 71 | |
michael@0 | 72 | U_CAPI uint32_t U_EXPORT2 |
michael@0 | 73 | utrie2_get32FromLeadSurrogateCodeUnit(const UTrie2 *trie, UChar32 c) { |
michael@0 | 74 | if(!U_IS_LEAD(c)) { |
michael@0 | 75 | return trie->errorValue; |
michael@0 | 76 | } |
michael@0 | 77 | if(trie->data16!=NULL) { |
michael@0 | 78 | return UTRIE2_GET16_FROM_U16_SINGLE_LEAD(trie, c); |
michael@0 | 79 | } else if(trie->data32!=NULL) { |
michael@0 | 80 | return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c); |
michael@0 | 81 | } else { |
michael@0 | 82 | return get32(trie->newTrie, c, FALSE); |
michael@0 | 83 | } |
michael@0 | 84 | } |
michael@0 | 85 | |
michael@0 | 86 | static inline int32_t |
michael@0 | 87 | u8Index(const UTrie2 *trie, UChar32 c, int32_t i) { |
michael@0 | 88 | int32_t idx= |
michael@0 | 89 | _UTRIE2_INDEX_FROM_CP( |
michael@0 | 90 | trie, |
michael@0 | 91 | trie->data32==NULL ? trie->indexLength : 0, |
michael@0 | 92 | c); |
michael@0 | 93 | return (idx<<3)|i; |
michael@0 | 94 | } |
michael@0 | 95 | |
michael@0 | 96 | U_CAPI int32_t U_EXPORT2 |
michael@0 | 97 | utrie2_internalU8NextIndex(const UTrie2 *trie, UChar32 c, |
michael@0 | 98 | const uint8_t *src, const uint8_t *limit) { |
michael@0 | 99 | int32_t i, length; |
michael@0 | 100 | i=0; |
michael@0 | 101 | /* support 64-bit pointers by avoiding cast of arbitrary difference */ |
michael@0 | 102 | if((limit-src)<=7) { |
michael@0 | 103 | length=(int32_t)(limit-src); |
michael@0 | 104 | } else { |
michael@0 | 105 | length=7; |
michael@0 | 106 | } |
michael@0 | 107 | c=utf8_nextCharSafeBody(src, &i, length, c, -1); |
michael@0 | 108 | return u8Index(trie, c, i); |
michael@0 | 109 | } |
michael@0 | 110 | |
michael@0 | 111 | U_CAPI int32_t U_EXPORT2 |
michael@0 | 112 | utrie2_internalU8PrevIndex(const UTrie2 *trie, UChar32 c, |
michael@0 | 113 | const uint8_t *start, const uint8_t *src) { |
michael@0 | 114 | int32_t i, length; |
michael@0 | 115 | /* support 64-bit pointers by avoiding cast of arbitrary difference */ |
michael@0 | 116 | if((src-start)<=7) { |
michael@0 | 117 | i=length=(int32_t)(src-start); |
michael@0 | 118 | } else { |
michael@0 | 119 | i=length=7; |
michael@0 | 120 | start=src-7; |
michael@0 | 121 | } |
michael@0 | 122 | c=utf8_prevCharSafeBody(start, 0, &i, c, -1); |
michael@0 | 123 | i=length-i; /* number of bytes read backward from src */ |
michael@0 | 124 | return u8Index(trie, c, i); |
michael@0 | 125 | } |
michael@0 | 126 | |
michael@0 | 127 | U_CAPI UTrie2 * U_EXPORT2 |
michael@0 | 128 | utrie2_openFromSerialized(UTrie2ValueBits valueBits, |
michael@0 | 129 | const void *data, int32_t length, int32_t *pActualLength, |
michael@0 | 130 | UErrorCode *pErrorCode) { |
michael@0 | 131 | const UTrie2Header *header; |
michael@0 | 132 | const uint16_t *p16; |
michael@0 | 133 | int32_t actualLength; |
michael@0 | 134 | |
michael@0 | 135 | UTrie2 tempTrie; |
michael@0 | 136 | UTrie2 *trie; |
michael@0 | 137 | |
michael@0 | 138 | if(U_FAILURE(*pErrorCode)) { |
michael@0 | 139 | return 0; |
michael@0 | 140 | } |
michael@0 | 141 | |
michael@0 | 142 | if( length<=0 || (U_POINTER_MASK_LSB(data, 3)!=0) || |
michael@0 | 143 | valueBits<0 || UTRIE2_COUNT_VALUE_BITS<=valueBits |
michael@0 | 144 | ) { |
michael@0 | 145 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 146 | return 0; |
michael@0 | 147 | } |
michael@0 | 148 | |
michael@0 | 149 | /* enough data for a trie header? */ |
michael@0 | 150 | if(length<(int32_t)sizeof(UTrie2Header)) { |
michael@0 | 151 | *pErrorCode=U_INVALID_FORMAT_ERROR; |
michael@0 | 152 | return 0; |
michael@0 | 153 | } |
michael@0 | 154 | |
michael@0 | 155 | /* check the signature */ |
michael@0 | 156 | header=(const UTrie2Header *)data; |
michael@0 | 157 | if(header->signature!=UTRIE2_SIG) { |
michael@0 | 158 | *pErrorCode=U_INVALID_FORMAT_ERROR; |
michael@0 | 159 | return 0; |
michael@0 | 160 | } |
michael@0 | 161 | |
michael@0 | 162 | /* get the options */ |
michael@0 | 163 | if(valueBits!=(UTrie2ValueBits)(header->options&UTRIE2_OPTIONS_VALUE_BITS_MASK)) { |
michael@0 | 164 | *pErrorCode=U_INVALID_FORMAT_ERROR; |
michael@0 | 165 | return 0; |
michael@0 | 166 | } |
michael@0 | 167 | |
michael@0 | 168 | /* get the length values and offsets */ |
michael@0 | 169 | uprv_memset(&tempTrie, 0, sizeof(tempTrie)); |
michael@0 | 170 | tempTrie.indexLength=header->indexLength; |
michael@0 | 171 | tempTrie.dataLength=header->shiftedDataLength<<UTRIE2_INDEX_SHIFT; |
michael@0 | 172 | tempTrie.index2NullOffset=header->index2NullOffset; |
michael@0 | 173 | tempTrie.dataNullOffset=header->dataNullOffset; |
michael@0 | 174 | |
michael@0 | 175 | tempTrie.highStart=header->shiftedHighStart<<UTRIE2_SHIFT_1; |
michael@0 | 176 | tempTrie.highValueIndex=tempTrie.dataLength-UTRIE2_DATA_GRANULARITY; |
michael@0 | 177 | if(valueBits==UTRIE2_16_VALUE_BITS) { |
michael@0 | 178 | tempTrie.highValueIndex+=tempTrie.indexLength; |
michael@0 | 179 | } |
michael@0 | 180 | |
michael@0 | 181 | /* calculate the actual length */ |
michael@0 | 182 | actualLength=(int32_t)sizeof(UTrie2Header)+tempTrie.indexLength*2; |
michael@0 | 183 | if(valueBits==UTRIE2_16_VALUE_BITS) { |
michael@0 | 184 | actualLength+=tempTrie.dataLength*2; |
michael@0 | 185 | } else { |
michael@0 | 186 | actualLength+=tempTrie.dataLength*4; |
michael@0 | 187 | } |
michael@0 | 188 | if(length<actualLength) { |
michael@0 | 189 | *pErrorCode=U_INVALID_FORMAT_ERROR; /* not enough bytes */ |
michael@0 | 190 | return 0; |
michael@0 | 191 | } |
michael@0 | 192 | |
michael@0 | 193 | /* allocate the trie */ |
michael@0 | 194 | trie=(UTrie2 *)uprv_malloc(sizeof(UTrie2)); |
michael@0 | 195 | if(trie==NULL) { |
michael@0 | 196 | *pErrorCode=U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 197 | return 0; |
michael@0 | 198 | } |
michael@0 | 199 | uprv_memcpy(trie, &tempTrie, sizeof(tempTrie)); |
michael@0 | 200 | trie->memory=(uint32_t *)data; |
michael@0 | 201 | trie->length=actualLength; |
michael@0 | 202 | trie->isMemoryOwned=FALSE; |
michael@0 | 203 | |
michael@0 | 204 | /* set the pointers to its index and data arrays */ |
michael@0 | 205 | p16=(const uint16_t *)(header+1); |
michael@0 | 206 | trie->index=p16; |
michael@0 | 207 | p16+=trie->indexLength; |
michael@0 | 208 | |
michael@0 | 209 | /* get the data */ |
michael@0 | 210 | switch(valueBits) { |
michael@0 | 211 | case UTRIE2_16_VALUE_BITS: |
michael@0 | 212 | trie->data16=p16; |
michael@0 | 213 | trie->data32=NULL; |
michael@0 | 214 | trie->initialValue=trie->index[trie->dataNullOffset]; |
michael@0 | 215 | trie->errorValue=trie->data16[UTRIE2_BAD_UTF8_DATA_OFFSET]; |
michael@0 | 216 | break; |
michael@0 | 217 | case UTRIE2_32_VALUE_BITS: |
michael@0 | 218 | trie->data16=NULL; |
michael@0 | 219 | trie->data32=(const uint32_t *)p16; |
michael@0 | 220 | trie->initialValue=trie->data32[trie->dataNullOffset]; |
michael@0 | 221 | trie->errorValue=trie->data32[UTRIE2_BAD_UTF8_DATA_OFFSET]; |
michael@0 | 222 | break; |
michael@0 | 223 | default: |
michael@0 | 224 | *pErrorCode=U_INVALID_FORMAT_ERROR; |
michael@0 | 225 | return 0; |
michael@0 | 226 | } |
michael@0 | 227 | |
michael@0 | 228 | if(pActualLength!=NULL) { |
michael@0 | 229 | *pActualLength=actualLength; |
michael@0 | 230 | } |
michael@0 | 231 | return trie; |
michael@0 | 232 | } |
michael@0 | 233 | |
michael@0 | 234 | U_CAPI UTrie2 * U_EXPORT2 |
michael@0 | 235 | utrie2_openDummy(UTrie2ValueBits valueBits, |
michael@0 | 236 | uint32_t initialValue, uint32_t errorValue, |
michael@0 | 237 | UErrorCode *pErrorCode) { |
michael@0 | 238 | UTrie2 *trie; |
michael@0 | 239 | UTrie2Header *header; |
michael@0 | 240 | uint32_t *p; |
michael@0 | 241 | uint16_t *dest16; |
michael@0 | 242 | int32_t indexLength, dataLength, length, i; |
michael@0 | 243 | int32_t dataMove; /* >0 if the data is moved to the end of the index array */ |
michael@0 | 244 | |
michael@0 | 245 | if(U_FAILURE(*pErrorCode)) { |
michael@0 | 246 | return 0; |
michael@0 | 247 | } |
michael@0 | 248 | |
michael@0 | 249 | if(valueBits<0 || UTRIE2_COUNT_VALUE_BITS<=valueBits) { |
michael@0 | 250 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 251 | return 0; |
michael@0 | 252 | } |
michael@0 | 253 | |
michael@0 | 254 | /* calculate the total length of the dummy trie data */ |
michael@0 | 255 | indexLength=UTRIE2_INDEX_1_OFFSET; |
michael@0 | 256 | dataLength=UTRIE2_DATA_START_OFFSET+UTRIE2_DATA_GRANULARITY; |
michael@0 | 257 | length=(int32_t)sizeof(UTrie2Header)+indexLength*2; |
michael@0 | 258 | if(valueBits==UTRIE2_16_VALUE_BITS) { |
michael@0 | 259 | length+=dataLength*2; |
michael@0 | 260 | } else { |
michael@0 | 261 | length+=dataLength*4; |
michael@0 | 262 | } |
michael@0 | 263 | |
michael@0 | 264 | /* allocate the trie */ |
michael@0 | 265 | trie=(UTrie2 *)uprv_malloc(sizeof(UTrie2)); |
michael@0 | 266 | if(trie==NULL) { |
michael@0 | 267 | *pErrorCode=U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 268 | return 0; |
michael@0 | 269 | } |
michael@0 | 270 | uprv_memset(trie, 0, sizeof(UTrie2)); |
michael@0 | 271 | trie->memory=uprv_malloc(length); |
michael@0 | 272 | if(trie->memory==NULL) { |
michael@0 | 273 | uprv_free(trie); |
michael@0 | 274 | *pErrorCode=U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 275 | return 0; |
michael@0 | 276 | } |
michael@0 | 277 | trie->length=length; |
michael@0 | 278 | trie->isMemoryOwned=TRUE; |
michael@0 | 279 | |
michael@0 | 280 | /* set the UTrie2 fields */ |
michael@0 | 281 | if(valueBits==UTRIE2_16_VALUE_BITS) { |
michael@0 | 282 | dataMove=indexLength; |
michael@0 | 283 | } else { |
michael@0 | 284 | dataMove=0; |
michael@0 | 285 | } |
michael@0 | 286 | |
michael@0 | 287 | trie->indexLength=indexLength; |
michael@0 | 288 | trie->dataLength=dataLength; |
michael@0 | 289 | trie->index2NullOffset=UTRIE2_INDEX_2_OFFSET; |
michael@0 | 290 | trie->dataNullOffset=(uint16_t)dataMove; |
michael@0 | 291 | trie->initialValue=initialValue; |
michael@0 | 292 | trie->errorValue=errorValue; |
michael@0 | 293 | trie->highStart=0; |
michael@0 | 294 | trie->highValueIndex=dataMove+UTRIE2_DATA_START_OFFSET; |
michael@0 | 295 | |
michael@0 | 296 | /* set the header fields */ |
michael@0 | 297 | header=(UTrie2Header *)trie->memory; |
michael@0 | 298 | |
michael@0 | 299 | header->signature=UTRIE2_SIG; /* "Tri2" */ |
michael@0 | 300 | header->options=(uint16_t)valueBits; |
michael@0 | 301 | |
michael@0 | 302 | header->indexLength=(uint16_t)indexLength; |
michael@0 | 303 | header->shiftedDataLength=(uint16_t)(dataLength>>UTRIE2_INDEX_SHIFT); |
michael@0 | 304 | header->index2NullOffset=(uint16_t)UTRIE2_INDEX_2_OFFSET; |
michael@0 | 305 | header->dataNullOffset=(uint16_t)dataMove; |
michael@0 | 306 | header->shiftedHighStart=0; |
michael@0 | 307 | |
michael@0 | 308 | /* fill the index and data arrays */ |
michael@0 | 309 | dest16=(uint16_t *)(header+1); |
michael@0 | 310 | trie->index=dest16; |
michael@0 | 311 | |
michael@0 | 312 | /* write the index-2 array values shifted right by UTRIE2_INDEX_SHIFT */ |
michael@0 | 313 | for(i=0; i<UTRIE2_INDEX_2_BMP_LENGTH; ++i) { |
michael@0 | 314 | *dest16++=(uint16_t)(dataMove>>UTRIE2_INDEX_SHIFT); /* null data block */ |
michael@0 | 315 | } |
michael@0 | 316 | |
michael@0 | 317 | /* write UTF-8 2-byte index-2 values, not right-shifted */ |
michael@0 | 318 | for(i=0; i<(0xc2-0xc0); ++i) { /* C0..C1 */ |
michael@0 | 319 | *dest16++=(uint16_t)(dataMove+UTRIE2_BAD_UTF8_DATA_OFFSET); |
michael@0 | 320 | } |
michael@0 | 321 | for(; i<(0xe0-0xc0); ++i) { /* C2..DF */ |
michael@0 | 322 | *dest16++=(uint16_t)dataMove; |
michael@0 | 323 | } |
michael@0 | 324 | |
michael@0 | 325 | /* write the 16/32-bit data array */ |
michael@0 | 326 | switch(valueBits) { |
michael@0 | 327 | case UTRIE2_16_VALUE_BITS: |
michael@0 | 328 | /* write 16-bit data values */ |
michael@0 | 329 | trie->data16=dest16; |
michael@0 | 330 | trie->data32=NULL; |
michael@0 | 331 | for(i=0; i<0x80; ++i) { |
michael@0 | 332 | *dest16++=(uint16_t)initialValue; |
michael@0 | 333 | } |
michael@0 | 334 | for(; i<0xc0; ++i) { |
michael@0 | 335 | *dest16++=(uint16_t)errorValue; |
michael@0 | 336 | } |
michael@0 | 337 | /* highValue and reserved values */ |
michael@0 | 338 | for(i=0; i<UTRIE2_DATA_GRANULARITY; ++i) { |
michael@0 | 339 | *dest16++=(uint16_t)initialValue; |
michael@0 | 340 | } |
michael@0 | 341 | break; |
michael@0 | 342 | case UTRIE2_32_VALUE_BITS: |
michael@0 | 343 | /* write 32-bit data values */ |
michael@0 | 344 | p=(uint32_t *)dest16; |
michael@0 | 345 | trie->data16=NULL; |
michael@0 | 346 | trie->data32=p; |
michael@0 | 347 | for(i=0; i<0x80; ++i) { |
michael@0 | 348 | *p++=initialValue; |
michael@0 | 349 | } |
michael@0 | 350 | for(; i<0xc0; ++i) { |
michael@0 | 351 | *p++=errorValue; |
michael@0 | 352 | } |
michael@0 | 353 | /* highValue and reserved values */ |
michael@0 | 354 | for(i=0; i<UTRIE2_DATA_GRANULARITY; ++i) { |
michael@0 | 355 | *p++=initialValue; |
michael@0 | 356 | } |
michael@0 | 357 | break; |
michael@0 | 358 | default: |
michael@0 | 359 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 360 | return 0; |
michael@0 | 361 | } |
michael@0 | 362 | |
michael@0 | 363 | return trie; |
michael@0 | 364 | } |
michael@0 | 365 | |
michael@0 | 366 | U_CAPI void U_EXPORT2 |
michael@0 | 367 | utrie2_close(UTrie2 *trie) { |
michael@0 | 368 | if(trie!=NULL) { |
michael@0 | 369 | if(trie->isMemoryOwned) { |
michael@0 | 370 | uprv_free(trie->memory); |
michael@0 | 371 | } |
michael@0 | 372 | if(trie->newTrie!=NULL) { |
michael@0 | 373 | uprv_free(trie->newTrie->data); |
michael@0 | 374 | uprv_free(trie->newTrie); |
michael@0 | 375 | } |
michael@0 | 376 | uprv_free(trie); |
michael@0 | 377 | } |
michael@0 | 378 | } |
michael@0 | 379 | |
michael@0 | 380 | U_CAPI int32_t U_EXPORT2 |
michael@0 | 381 | utrie2_getVersion(const void *data, int32_t length, UBool anyEndianOk) { |
michael@0 | 382 | uint32_t signature; |
michael@0 | 383 | if(length<16 || data==NULL || (U_POINTER_MASK_LSB(data, 3)!=0)) { |
michael@0 | 384 | return 0; |
michael@0 | 385 | } |
michael@0 | 386 | signature=*(const uint32_t *)data; |
michael@0 | 387 | if(signature==UTRIE2_SIG) { |
michael@0 | 388 | return 2; |
michael@0 | 389 | } |
michael@0 | 390 | if(anyEndianOk && signature==UTRIE2_OE_SIG) { |
michael@0 | 391 | return 2; |
michael@0 | 392 | } |
michael@0 | 393 | if(signature==UTRIE_SIG) { |
michael@0 | 394 | return 1; |
michael@0 | 395 | } |
michael@0 | 396 | if(anyEndianOk && signature==UTRIE_OE_SIG) { |
michael@0 | 397 | return 1; |
michael@0 | 398 | } |
michael@0 | 399 | return 0; |
michael@0 | 400 | } |
michael@0 | 401 | |
michael@0 | 402 | U_CAPI int32_t U_EXPORT2 |
michael@0 | 403 | utrie2_swap(const UDataSwapper *ds, |
michael@0 | 404 | const void *inData, int32_t length, void *outData, |
michael@0 | 405 | UErrorCode *pErrorCode) { |
michael@0 | 406 | const UTrie2Header *inTrie; |
michael@0 | 407 | UTrie2Header trie; |
michael@0 | 408 | int32_t dataLength, size; |
michael@0 | 409 | UTrie2ValueBits valueBits; |
michael@0 | 410 | |
michael@0 | 411 | if(U_FAILURE(*pErrorCode)) { |
michael@0 | 412 | return 0; |
michael@0 | 413 | } |
michael@0 | 414 | if(ds==NULL || inData==NULL || (length>=0 && outData==NULL)) { |
michael@0 | 415 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 416 | return 0; |
michael@0 | 417 | } |
michael@0 | 418 | |
michael@0 | 419 | /* setup and swapping */ |
michael@0 | 420 | if(length>=0 && length<(int32_t)sizeof(UTrie2Header)) { |
michael@0 | 421 | *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; |
michael@0 | 422 | return 0; |
michael@0 | 423 | } |
michael@0 | 424 | |
michael@0 | 425 | inTrie=(const UTrie2Header *)inData; |
michael@0 | 426 | trie.signature=ds->readUInt32(inTrie->signature); |
michael@0 | 427 | trie.options=ds->readUInt16(inTrie->options); |
michael@0 | 428 | trie.indexLength=ds->readUInt16(inTrie->indexLength); |
michael@0 | 429 | trie.shiftedDataLength=ds->readUInt16(inTrie->shiftedDataLength); |
michael@0 | 430 | |
michael@0 | 431 | valueBits=(UTrie2ValueBits)(trie.options&UTRIE2_OPTIONS_VALUE_BITS_MASK); |
michael@0 | 432 | dataLength=(int32_t)trie.shiftedDataLength<<UTRIE2_INDEX_SHIFT; |
michael@0 | 433 | |
michael@0 | 434 | if( trie.signature!=UTRIE2_SIG || |
michael@0 | 435 | valueBits<0 || UTRIE2_COUNT_VALUE_BITS<=valueBits || |
michael@0 | 436 | trie.indexLength<UTRIE2_INDEX_1_OFFSET || |
michael@0 | 437 | dataLength<UTRIE2_DATA_START_OFFSET |
michael@0 | 438 | ) { |
michael@0 | 439 | *pErrorCode=U_INVALID_FORMAT_ERROR; /* not a UTrie */ |
michael@0 | 440 | return 0; |
michael@0 | 441 | } |
michael@0 | 442 | |
michael@0 | 443 | size=sizeof(UTrie2Header)+trie.indexLength*2; |
michael@0 | 444 | switch(valueBits) { |
michael@0 | 445 | case UTRIE2_16_VALUE_BITS: |
michael@0 | 446 | size+=dataLength*2; |
michael@0 | 447 | break; |
michael@0 | 448 | case UTRIE2_32_VALUE_BITS: |
michael@0 | 449 | size+=dataLength*4; |
michael@0 | 450 | break; |
michael@0 | 451 | default: |
michael@0 | 452 | *pErrorCode=U_INVALID_FORMAT_ERROR; |
michael@0 | 453 | return 0; |
michael@0 | 454 | } |
michael@0 | 455 | |
michael@0 | 456 | if(length>=0) { |
michael@0 | 457 | UTrie2Header *outTrie; |
michael@0 | 458 | |
michael@0 | 459 | if(length<size) { |
michael@0 | 460 | *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; |
michael@0 | 461 | return 0; |
michael@0 | 462 | } |
michael@0 | 463 | |
michael@0 | 464 | outTrie=(UTrie2Header *)outData; |
michael@0 | 465 | |
michael@0 | 466 | /* swap the header */ |
michael@0 | 467 | ds->swapArray32(ds, &inTrie->signature, 4, &outTrie->signature, pErrorCode); |
michael@0 | 468 | ds->swapArray16(ds, &inTrie->options, 12, &outTrie->options, pErrorCode); |
michael@0 | 469 | |
michael@0 | 470 | /* swap the index and the data */ |
michael@0 | 471 | switch(valueBits) { |
michael@0 | 472 | case UTRIE2_16_VALUE_BITS: |
michael@0 | 473 | ds->swapArray16(ds, inTrie+1, (trie.indexLength+dataLength)*2, outTrie+1, pErrorCode); |
michael@0 | 474 | break; |
michael@0 | 475 | case UTRIE2_32_VALUE_BITS: |
michael@0 | 476 | ds->swapArray16(ds, inTrie+1, trie.indexLength*2, outTrie+1, pErrorCode); |
michael@0 | 477 | ds->swapArray32(ds, (const uint16_t *)(inTrie+1)+trie.indexLength, dataLength*4, |
michael@0 | 478 | (uint16_t *)(outTrie+1)+trie.indexLength, pErrorCode); |
michael@0 | 479 | break; |
michael@0 | 480 | default: |
michael@0 | 481 | *pErrorCode=U_INVALID_FORMAT_ERROR; |
michael@0 | 482 | return 0; |
michael@0 | 483 | } |
michael@0 | 484 | } |
michael@0 | 485 | |
michael@0 | 486 | return size; |
michael@0 | 487 | } |
michael@0 | 488 | |
michael@0 | 489 | // utrie2_swapAnyVersion() should be defined here but lives in utrie2_builder.c |
michael@0 | 490 | // to avoid a dependency from utrie2.cpp on utrie.c. |
michael@0 | 491 | |
michael@0 | 492 | /* enumeration -------------------------------------------------------------- */ |
michael@0 | 493 | |
michael@0 | 494 | #define MIN_VALUE(a, b) ((a)<(b) ? (a) : (b)) |
michael@0 | 495 | |
michael@0 | 496 | /* default UTrie2EnumValue() returns the input value itself */ |
michael@0 | 497 | static uint32_t U_CALLCONV |
michael@0 | 498 | enumSameValue(const void * /*context*/, uint32_t value) { |
michael@0 | 499 | return value; |
michael@0 | 500 | } |
michael@0 | 501 | |
michael@0 | 502 | /** |
michael@0 | 503 | * Enumerate all ranges of code points with the same relevant values. |
michael@0 | 504 | * The values are transformed from the raw trie entries by the enumValue function. |
michael@0 | 505 | * |
michael@0 | 506 | * Currently requires start<limit and both start and limit must be multiples |
michael@0 | 507 | * of UTRIE2_DATA_BLOCK_LENGTH. |
michael@0 | 508 | * |
michael@0 | 509 | * Optimizations: |
michael@0 | 510 | * - Skip a whole block if we know that it is filled with a single value, |
michael@0 | 511 | * and it is the same as we visited just before. |
michael@0 | 512 | * - Handle the null block specially because we know a priori that it is filled |
michael@0 | 513 | * with a single value. |
michael@0 | 514 | */ |
michael@0 | 515 | static void |
michael@0 | 516 | enumEitherTrie(const UTrie2 *trie, |
michael@0 | 517 | UChar32 start, UChar32 limit, |
michael@0 | 518 | UTrie2EnumValue *enumValue, UTrie2EnumRange *enumRange, const void *context) { |
michael@0 | 519 | const uint32_t *data32; |
michael@0 | 520 | const uint16_t *idx; |
michael@0 | 521 | |
michael@0 | 522 | uint32_t value, prevValue, initialValue; |
michael@0 | 523 | UChar32 c, prev, highStart; |
michael@0 | 524 | int32_t j, i2Block, prevI2Block, index2NullOffset, block, prevBlock, nullBlock; |
michael@0 | 525 | |
michael@0 | 526 | if(enumRange==NULL) { |
michael@0 | 527 | return; |
michael@0 | 528 | } |
michael@0 | 529 | if(enumValue==NULL) { |
michael@0 | 530 | enumValue=enumSameValue; |
michael@0 | 531 | } |
michael@0 | 532 | |
michael@0 | 533 | if(trie->newTrie==NULL) { |
michael@0 | 534 | /* frozen trie */ |
michael@0 | 535 | idx=trie->index; |
michael@0 | 536 | U_ASSERT(idx!=NULL); /* the following code assumes trie->newTrie is not NULL when idx is NULL */ |
michael@0 | 537 | data32=trie->data32; |
michael@0 | 538 | |
michael@0 | 539 | index2NullOffset=trie->index2NullOffset; |
michael@0 | 540 | nullBlock=trie->dataNullOffset; |
michael@0 | 541 | } else { |
michael@0 | 542 | /* unfrozen, mutable trie */ |
michael@0 | 543 | idx=NULL; |
michael@0 | 544 | data32=trie->newTrie->data; |
michael@0 | 545 | U_ASSERT(data32!=NULL); /* the following code assumes idx is not NULL when data32 is NULL */ |
michael@0 | 546 | |
michael@0 | 547 | index2NullOffset=trie->newTrie->index2NullOffset; |
michael@0 | 548 | nullBlock=trie->newTrie->dataNullOffset; |
michael@0 | 549 | } |
michael@0 | 550 | |
michael@0 | 551 | highStart=trie->highStart; |
michael@0 | 552 | |
michael@0 | 553 | /* get the enumeration value that corresponds to an initial-value trie data entry */ |
michael@0 | 554 | initialValue=enumValue(context, trie->initialValue); |
michael@0 | 555 | |
michael@0 | 556 | /* set variables for previous range */ |
michael@0 | 557 | prevI2Block=-1; |
michael@0 | 558 | prevBlock=-1; |
michael@0 | 559 | prev=start; |
michael@0 | 560 | prevValue=0; |
michael@0 | 561 | |
michael@0 | 562 | /* enumerate index-2 blocks */ |
michael@0 | 563 | for(c=start; c<limit && c<highStart;) { |
michael@0 | 564 | /* Code point limit for iterating inside this i2Block. */ |
michael@0 | 565 | UChar32 tempLimit=c+UTRIE2_CP_PER_INDEX_1_ENTRY; |
michael@0 | 566 | if(limit<tempLimit) { |
michael@0 | 567 | tempLimit=limit; |
michael@0 | 568 | } |
michael@0 | 569 | if(c<=0xffff) { |
michael@0 | 570 | if(!U_IS_SURROGATE(c)) { |
michael@0 | 571 | i2Block=c>>UTRIE2_SHIFT_2; |
michael@0 | 572 | } else if(U_IS_SURROGATE_LEAD(c)) { |
michael@0 | 573 | /* |
michael@0 | 574 | * Enumerate values for lead surrogate code points, not code units: |
michael@0 | 575 | * This special block has half the normal length. |
michael@0 | 576 | */ |
michael@0 | 577 | i2Block=UTRIE2_LSCP_INDEX_2_OFFSET; |
michael@0 | 578 | tempLimit=MIN_VALUE(0xdc00, limit); |
michael@0 | 579 | } else { |
michael@0 | 580 | /* |
michael@0 | 581 | * Switch back to the normal part of the index-2 table. |
michael@0 | 582 | * Enumerate the second half of the surrogates block. |
michael@0 | 583 | */ |
michael@0 | 584 | i2Block=0xd800>>UTRIE2_SHIFT_2; |
michael@0 | 585 | tempLimit=MIN_VALUE(0xe000, limit); |
michael@0 | 586 | } |
michael@0 | 587 | } else { |
michael@0 | 588 | /* supplementary code points */ |
michael@0 | 589 | if(idx!=NULL) { |
michael@0 | 590 | i2Block=idx[(UTRIE2_INDEX_1_OFFSET-UTRIE2_OMITTED_BMP_INDEX_1_LENGTH)+ |
michael@0 | 591 | (c>>UTRIE2_SHIFT_1)]; |
michael@0 | 592 | } else { |
michael@0 | 593 | i2Block=trie->newTrie->index1[c>>UTRIE2_SHIFT_1]; |
michael@0 | 594 | } |
michael@0 | 595 | if(i2Block==prevI2Block && (c-prev)>=UTRIE2_CP_PER_INDEX_1_ENTRY) { |
michael@0 | 596 | /* |
michael@0 | 597 | * The index-2 block is the same as the previous one, and filled with prevValue. |
michael@0 | 598 | * Only possible for supplementary code points because the linear-BMP index-2 |
michael@0 | 599 | * table creates unique i2Block values. |
michael@0 | 600 | */ |
michael@0 | 601 | c+=UTRIE2_CP_PER_INDEX_1_ENTRY; |
michael@0 | 602 | continue; |
michael@0 | 603 | } |
michael@0 | 604 | } |
michael@0 | 605 | prevI2Block=i2Block; |
michael@0 | 606 | if(i2Block==index2NullOffset) { |
michael@0 | 607 | /* this is the null index-2 block */ |
michael@0 | 608 | if(prevValue!=initialValue) { |
michael@0 | 609 | if(prev<c && !enumRange(context, prev, c-1, prevValue)) { |
michael@0 | 610 | return; |
michael@0 | 611 | } |
michael@0 | 612 | prevBlock=nullBlock; |
michael@0 | 613 | prev=c; |
michael@0 | 614 | prevValue=initialValue; |
michael@0 | 615 | } |
michael@0 | 616 | c+=UTRIE2_CP_PER_INDEX_1_ENTRY; |
michael@0 | 617 | } else { |
michael@0 | 618 | /* enumerate data blocks for one index-2 block */ |
michael@0 | 619 | int32_t i2, i2Limit; |
michael@0 | 620 | i2=(c>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK; |
michael@0 | 621 | if((c>>UTRIE2_SHIFT_1)==(tempLimit>>UTRIE2_SHIFT_1)) { |
michael@0 | 622 | i2Limit=(tempLimit>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK; |
michael@0 | 623 | } else { |
michael@0 | 624 | i2Limit=UTRIE2_INDEX_2_BLOCK_LENGTH; |
michael@0 | 625 | } |
michael@0 | 626 | for(; i2<i2Limit; ++i2) { |
michael@0 | 627 | if(idx!=NULL) { |
michael@0 | 628 | block=(int32_t)idx[i2Block+i2]<<UTRIE2_INDEX_SHIFT; |
michael@0 | 629 | } else { |
michael@0 | 630 | block=trie->newTrie->index2[i2Block+i2]; |
michael@0 | 631 | } |
michael@0 | 632 | if(block==prevBlock && (c-prev)>=UTRIE2_DATA_BLOCK_LENGTH) { |
michael@0 | 633 | /* the block is the same as the previous one, and filled with prevValue */ |
michael@0 | 634 | c+=UTRIE2_DATA_BLOCK_LENGTH; |
michael@0 | 635 | continue; |
michael@0 | 636 | } |
michael@0 | 637 | prevBlock=block; |
michael@0 | 638 | if(block==nullBlock) { |
michael@0 | 639 | /* this is the null data block */ |
michael@0 | 640 | if(prevValue!=initialValue) { |
michael@0 | 641 | if(prev<c && !enumRange(context, prev, c-1, prevValue)) { |
michael@0 | 642 | return; |
michael@0 | 643 | } |
michael@0 | 644 | prev=c; |
michael@0 | 645 | prevValue=initialValue; |
michael@0 | 646 | } |
michael@0 | 647 | c+=UTRIE2_DATA_BLOCK_LENGTH; |
michael@0 | 648 | } else { |
michael@0 | 649 | for(j=0; j<UTRIE2_DATA_BLOCK_LENGTH; ++j) { |
michael@0 | 650 | value=enumValue(context, data32!=NULL ? data32[block+j] : idx[block+j]); |
michael@0 | 651 | if(value!=prevValue) { |
michael@0 | 652 | if(prev<c && !enumRange(context, prev, c-1, prevValue)) { |
michael@0 | 653 | return; |
michael@0 | 654 | } |
michael@0 | 655 | prev=c; |
michael@0 | 656 | prevValue=value; |
michael@0 | 657 | } |
michael@0 | 658 | ++c; |
michael@0 | 659 | } |
michael@0 | 660 | } |
michael@0 | 661 | } |
michael@0 | 662 | } |
michael@0 | 663 | } |
michael@0 | 664 | |
michael@0 | 665 | if(c>limit) { |
michael@0 | 666 | c=limit; /* could be higher if in the index2NullOffset */ |
michael@0 | 667 | } else if(c<limit) { |
michael@0 | 668 | /* c==highStart<limit */ |
michael@0 | 669 | uint32_t highValue; |
michael@0 | 670 | if(idx!=NULL) { |
michael@0 | 671 | highValue= |
michael@0 | 672 | data32!=NULL ? |
michael@0 | 673 | data32[trie->highValueIndex] : |
michael@0 | 674 | idx[trie->highValueIndex]; |
michael@0 | 675 | } else { |
michael@0 | 676 | highValue=trie->newTrie->data[trie->newTrie->dataLength-UTRIE2_DATA_GRANULARITY]; |
michael@0 | 677 | } |
michael@0 | 678 | value=enumValue(context, highValue); |
michael@0 | 679 | if(value!=prevValue) { |
michael@0 | 680 | if(prev<c && !enumRange(context, prev, c-1, prevValue)) { |
michael@0 | 681 | return; |
michael@0 | 682 | } |
michael@0 | 683 | prev=c; |
michael@0 | 684 | prevValue=value; |
michael@0 | 685 | } |
michael@0 | 686 | c=limit; |
michael@0 | 687 | } |
michael@0 | 688 | |
michael@0 | 689 | /* deliver last range */ |
michael@0 | 690 | enumRange(context, prev, c-1, prevValue); |
michael@0 | 691 | } |
michael@0 | 692 | |
michael@0 | 693 | U_CAPI void U_EXPORT2 |
michael@0 | 694 | utrie2_enum(const UTrie2 *trie, |
michael@0 | 695 | UTrie2EnumValue *enumValue, UTrie2EnumRange *enumRange, const void *context) { |
michael@0 | 696 | enumEitherTrie(trie, 0, 0x110000, enumValue, enumRange, context); |
michael@0 | 697 | } |
michael@0 | 698 | |
michael@0 | 699 | U_CAPI void U_EXPORT2 |
michael@0 | 700 | utrie2_enumForLeadSurrogate(const UTrie2 *trie, UChar32 lead, |
michael@0 | 701 | UTrie2EnumValue *enumValue, UTrie2EnumRange *enumRange, |
michael@0 | 702 | const void *context) { |
michael@0 | 703 | if(!U16_IS_LEAD(lead)) { |
michael@0 | 704 | return; |
michael@0 | 705 | } |
michael@0 | 706 | lead=(lead-0xd7c0)<<10; /* start code point */ |
michael@0 | 707 | enumEitherTrie(trie, lead, lead+0x400, enumValue, enumRange, context); |
michael@0 | 708 | } |
michael@0 | 709 | |
michael@0 | 710 | /* C++ convenience wrappers ------------------------------------------------- */ |
michael@0 | 711 | |
michael@0 | 712 | U_NAMESPACE_BEGIN |
michael@0 | 713 | |
michael@0 | 714 | uint16_t BackwardUTrie2StringIterator::previous16() { |
michael@0 | 715 | codePointLimit=codePointStart; |
michael@0 | 716 | if(start>=codePointStart) { |
michael@0 | 717 | codePoint=U_SENTINEL; |
michael@0 | 718 | return 0; |
michael@0 | 719 | } |
michael@0 | 720 | uint16_t result; |
michael@0 | 721 | UTRIE2_U16_PREV16(trie, start, codePointStart, codePoint, result); |
michael@0 | 722 | return result; |
michael@0 | 723 | } |
michael@0 | 724 | |
michael@0 | 725 | uint16_t ForwardUTrie2StringIterator::next16() { |
michael@0 | 726 | codePointStart=codePointLimit; |
michael@0 | 727 | if(codePointLimit==limit) { |
michael@0 | 728 | codePoint=U_SENTINEL; |
michael@0 | 729 | return 0; |
michael@0 | 730 | } |
michael@0 | 731 | uint16_t result; |
michael@0 | 732 | UTRIE2_U16_NEXT16(trie, codePointLimit, limit, codePoint, result); |
michael@0 | 733 | return result; |
michael@0 | 734 | } |
michael@0 | 735 | |
michael@0 | 736 | U_NAMESPACE_END |