1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/common/utrie.h Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,795 @@ 1.4 +/* 1.5 +****************************************************************************** 1.6 +* 1.7 +* Copyright (C) 2001-2011, International Business Machines 1.8 +* Corporation and others. All Rights Reserved. 1.9 +* 1.10 +****************************************************************************** 1.11 +* file name: utrie.h 1.12 +* encoding: US-ASCII 1.13 +* tab size: 8 (not used) 1.14 +* indentation:4 1.15 +* 1.16 +* created on: 2001nov08 1.17 +* created by: Markus W. Scherer 1.18 +*/ 1.19 + 1.20 +#ifndef __UTRIE_H__ 1.21 +#define __UTRIE_H__ 1.22 + 1.23 +#include "unicode/utypes.h" 1.24 +#include "unicode/utf16.h" 1.25 +#include "udataswp.h" 1.26 + 1.27 +U_CDECL_BEGIN 1.28 + 1.29 +/** 1.30 + * \file 1.31 + * 1.32 + * This is a common implementation of a "folded" trie. 1.33 + * It is a kind of compressed, serializable table of 16- or 32-bit values associated with 1.34 + * Unicode code points (0..0x10ffff). 1.35 + * 1.36 + * This implementation is optimized for getting values while walking forward 1.37 + * through a UTF-16 string. 1.38 + * Therefore, the simplest and fastest access macros are the 1.39 + * _FROM_LEAD() and _FROM_OFFSET_TRAIL() macros. 1.40 + * 1.41 + * The _FROM_BMP() macros are a little more complicated; they get values 1.42 + * even for lead surrogate code _points_, while the _FROM_LEAD() macros 1.43 + * get special "folded" values for lead surrogate code _units_ if 1.44 + * there is relevant data associated with them. 1.45 + * From such a folded value, an offset needs to be extracted to supply 1.46 + * to the _FROM_OFFSET_TRAIL() macros. 1.47 + * 1.48 + * Most of the more complex (and more convenient) functions/macros call a callback function 1.49 + * to get that offset from the folded value for a lead surrogate unit. 1.50 + */ 1.51 + 1.52 +/** 1.53 + * Trie constants, defining shift widths, index array lengths, etc. 1.54 + */ 1.55 +enum { 1.56 + /** Shift size for shifting right the input index. 1..9 */ 1.57 + UTRIE_SHIFT=5, 1.58 + 1.59 + /** Number of data values in a stage 2 (data array) block. 2, 4, 8, .., 0x200 */ 1.60 + UTRIE_DATA_BLOCK_LENGTH=1<<UTRIE_SHIFT, 1.61 + 1.62 + /** Mask for getting the lower bits from the input index. */ 1.63 + UTRIE_MASK=UTRIE_DATA_BLOCK_LENGTH-1, 1.64 + 1.65 + /** 1.66 + * Lead surrogate code points' index displacement in the index array. 1.67 + * 0x10000-0xd800=0x2800 1.68 + */ 1.69 + UTRIE_LEAD_INDEX_DISP=0x2800>>UTRIE_SHIFT, 1.70 + 1.71 + /** 1.72 + * Shift size for shifting left the index array values. 1.73 + * Increases possible data size with 16-bit index values at the cost 1.74 + * of compactability. 1.75 + * This requires blocks of stage 2 data to be aligned by UTRIE_DATA_GRANULARITY. 1.76 + * 0..UTRIE_SHIFT 1.77 + */ 1.78 + UTRIE_INDEX_SHIFT=2, 1.79 + 1.80 + /** The alignment size of a stage 2 data block. Also the granularity for compaction. */ 1.81 + UTRIE_DATA_GRANULARITY=1<<UTRIE_INDEX_SHIFT, 1.82 + 1.83 + /** Number of bits of a trail surrogate that are used in index table lookups. */ 1.84 + UTRIE_SURROGATE_BLOCK_BITS=10-UTRIE_SHIFT, 1.85 + 1.86 + /** 1.87 + * Number of index (stage 1) entries per lead surrogate. 1.88 + * Same as number of index entries for 1024 trail surrogates, 1.89 + * ==0x400>>UTRIE_SHIFT 1.90 + */ 1.91 + UTRIE_SURROGATE_BLOCK_COUNT=(1<<UTRIE_SURROGATE_BLOCK_BITS), 1.92 + 1.93 + /** Length of the BMP portion of the index (stage 1) array. */ 1.94 + UTRIE_BMP_INDEX_LENGTH=0x10000>>UTRIE_SHIFT 1.95 +}; 1.96 + 1.97 +/** 1.98 + * Length of the index (stage 1) array before folding. 1.99 + * Maximum number of Unicode code points (0x110000) shifted right by UTRIE_SHIFT. 1.100 + */ 1.101 +#define UTRIE_MAX_INDEX_LENGTH (0x110000>>UTRIE_SHIFT) 1.102 + 1.103 +/** 1.104 + * Maximum length of the runtime data (stage 2) array. 1.105 + * Limited by 16-bit index values that are left-shifted by UTRIE_INDEX_SHIFT. 1.106 + */ 1.107 +#define UTRIE_MAX_DATA_LENGTH (0x10000<<UTRIE_INDEX_SHIFT) 1.108 + 1.109 +/** 1.110 + * Maximum length of the build-time data (stage 2) array. 1.111 + * The maximum length is 0x110000+UTRIE_DATA_BLOCK_LENGTH+0x400. 1.112 + * (Number of Unicode code points + one all-initial-value block + 1.113 + * possible duplicate entries for 1024 lead surrogates.) 1.114 + */ 1.115 +#define UTRIE_MAX_BUILD_TIME_DATA_LENGTH (0x110000+UTRIE_DATA_BLOCK_LENGTH+0x400) 1.116 + 1.117 +/** 1.118 + * Number of bytes for a dummy trie. 1.119 + * A dummy trie is an empty runtime trie, used when a real data trie cannot 1.120 + * be loaded. 1.121 + * The number of bytes works for Latin-1-linear tries with 32-bit data 1.122 + * (worst case). 1.123 + * 1.124 + * Calculation: 1.125 + * BMP index + 1 index block for lead surrogate code points + 1.126 + * Latin-1-linear array + 1 data block for lead surrogate code points 1.127 + * 1.128 + * Latin-1: if(UTRIE_SHIFT<=8) { 256 } else { included in first data block } 1.129 + * 1.130 + * @see utrie_unserializeDummy 1.131 + */ 1.132 +#define UTRIE_DUMMY_SIZE ((UTRIE_BMP_INDEX_LENGTH+UTRIE_SURROGATE_BLOCK_COUNT)*2+(UTRIE_SHIFT<=8?256:UTRIE_DATA_BLOCK_LENGTH)*4+UTRIE_DATA_BLOCK_LENGTH*4) 1.133 + 1.134 +/** 1.135 + * Runtime UTrie callback function. 1.136 + * Extract from a lead surrogate's data the 1.137 + * index array offset of the indexes for that lead surrogate. 1.138 + * 1.139 + * @param data data value for a surrogate from the trie, including the folding offset 1.140 + * @return offset>=UTRIE_BMP_INDEX_LENGTH, or 0 if there is no data for the lead surrogate 1.141 + */ 1.142 +typedef int32_t U_CALLCONV 1.143 +UTrieGetFoldingOffset(uint32_t data); 1.144 + 1.145 +/** 1.146 + * Run-time Trie structure. 1.147 + * 1.148 + * Either the data table is 16 bits wide and accessed via the index 1.149 + * pointer, with each index item increased by indexLength; 1.150 + * in this case, data32==NULL. 1.151 + * 1.152 + * Or the data table is 32 bits wide and accessed via the data32 pointer. 1.153 + */ 1.154 +struct UTrie { 1.155 + const uint16_t *index; 1.156 + const uint32_t *data32; /* NULL if 16b data is used via index */ 1.157 + 1.158 + /** 1.159 + * This function is not used in _FROM_LEAD, _FROM_BMP, and _FROM_OFFSET_TRAIL macros. 1.160 + * If convenience macros like _GET16 or _NEXT32 are used, this function must be set. 1.161 + * 1.162 + * utrie_unserialize() sets a default function which simply returns 1.163 + * the lead surrogate's value itself - which is the inverse of the default 1.164 + * folding function used by utrie_serialize(). 1.165 + * 1.166 + * @see UTrieGetFoldingOffset 1.167 + */ 1.168 + UTrieGetFoldingOffset *getFoldingOffset; 1.169 + 1.170 + int32_t indexLength, dataLength; 1.171 + uint32_t initialValue; 1.172 + UBool isLatin1Linear; 1.173 +}; 1.174 + 1.175 +#ifndef __UTRIE2_H__ 1.176 +typedef struct UTrie UTrie; 1.177 +#endif 1.178 + 1.179 +/** Internal trie getter from an offset (0 if c16 is a BMP/lead units) and a 16-bit unit */ 1.180 +#define _UTRIE_GET_RAW(trie, data, offset, c16) \ 1.181 + (trie)->data[ \ 1.182 + ((int32_t)((trie)->index[(offset)+((c16)>>UTRIE_SHIFT)])<<UTRIE_INDEX_SHIFT)+ \ 1.183 + ((c16)&UTRIE_MASK) \ 1.184 + ] 1.185 + 1.186 +/** Internal trie getter from a pair of surrogates */ 1.187 +#define _UTRIE_GET_FROM_PAIR(trie, data, c, c2, result, resultType) { \ 1.188 + int32_t __offset; \ 1.189 +\ 1.190 + /* get data for lead surrogate */ \ 1.191 + (result)=_UTRIE_GET_RAW((trie), data, 0, (c)); \ 1.192 + __offset=(trie)->getFoldingOffset(result); \ 1.193 +\ 1.194 + /* get the real data from the folded lead/trail units */ \ 1.195 + if(__offset>0) { \ 1.196 + (result)=_UTRIE_GET_RAW((trie), data, __offset, (c2)&0x3ff); \ 1.197 + } else { \ 1.198 + (result)=(resultType)((trie)->initialValue); \ 1.199 + } \ 1.200 +} 1.201 + 1.202 +/** Internal trie getter from a BMP code point, treating a lead surrogate as a normal code point */ 1.203 +#define _UTRIE_GET_FROM_BMP(trie, data, c16) \ 1.204 + _UTRIE_GET_RAW(trie, data, 0xd800<=(c16) && (c16)<=0xdbff ? UTRIE_LEAD_INDEX_DISP : 0, c16); 1.205 + 1.206 +/** 1.207 + * Internal trie getter from a code point. 1.208 + * Could be faster(?) but longer with 1.209 + * if((c32)<=0xd7ff) { (result)=_UTRIE_GET_RAW(trie, data, 0, c32); } 1.210 + */ 1.211 +#define _UTRIE_GET(trie, data, c32, result, resultType) \ 1.212 + if((uint32_t)(c32)<=0xffff) { \ 1.213 + /* BMP code points */ \ 1.214 + (result)=_UTRIE_GET_FROM_BMP(trie, data, c32); \ 1.215 + } else if((uint32_t)(c32)<=0x10ffff) { \ 1.216 + /* supplementary code point */ \ 1.217 + UChar __lead16=U16_LEAD(c32); \ 1.218 + _UTRIE_GET_FROM_PAIR(trie, data, __lead16, c32, result, resultType); \ 1.219 + } else { \ 1.220 + /* out of range */ \ 1.221 + (result)=(resultType)((trie)->initialValue); \ 1.222 + } 1.223 + 1.224 +/** Internal next-post-increment: get the next code point (c, c2) and its data */ 1.225 +#define _UTRIE_NEXT(trie, data, src, limit, c, c2, result, resultType) { \ 1.226 + (c)=*(src)++; \ 1.227 + if(!U16_IS_LEAD(c)) { \ 1.228 + (c2)=0; \ 1.229 + (result)=_UTRIE_GET_RAW((trie), data, 0, (c)); \ 1.230 + } else if((src)!=(limit) && U16_IS_TRAIL((c2)=*(src))) { \ 1.231 + ++(src); \ 1.232 + _UTRIE_GET_FROM_PAIR((trie), data, (c), (c2), (result), resultType); \ 1.233 + } else { \ 1.234 + /* unpaired lead surrogate code point */ \ 1.235 + (c2)=0; \ 1.236 + (result)=_UTRIE_GET_RAW((trie), data, UTRIE_LEAD_INDEX_DISP, (c)); \ 1.237 + } \ 1.238 +} 1.239 + 1.240 +/** Internal previous: get the previous code point (c, c2) and its data */ 1.241 +#define _UTRIE_PREVIOUS(trie, data, start, src, c, c2, result, resultType) { \ 1.242 + (c)=*--(src); \ 1.243 + if(!U16_IS_SURROGATE(c)) { \ 1.244 + (c2)=0; \ 1.245 + (result)=_UTRIE_GET_RAW((trie), data, 0, (c)); \ 1.246 + } else if(!U16_IS_SURROGATE_LEAD(c)) { \ 1.247 + /* trail surrogate */ \ 1.248 + if((start)!=(src) && U16_IS_LEAD((c2)=*((src)-1))) { \ 1.249 + --(src); \ 1.250 + (result)=(c); (c)=(c2); (c2)=(UChar)(result); /* swap c, c2 */ \ 1.251 + _UTRIE_GET_FROM_PAIR((trie), data, (c), (c2), (result), resultType); \ 1.252 + } else { \ 1.253 + /* unpaired trail surrogate code point */ \ 1.254 + (c2)=0; \ 1.255 + (result)=_UTRIE_GET_RAW((trie), data, 0, (c)); \ 1.256 + } \ 1.257 + } else { \ 1.258 + /* unpaired lead surrogate code point */ \ 1.259 + (c2)=0; \ 1.260 + (result)=_UTRIE_GET_RAW((trie), data, UTRIE_LEAD_INDEX_DISP, (c)); \ 1.261 + } \ 1.262 +} 1.263 + 1.264 +/* Public UTrie API ---------------------------------------------------------*/ 1.265 + 1.266 +/** 1.267 + * Get a pointer to the contiguous part of the data array 1.268 + * for the Latin-1 range (U+0000..U+00ff). 1.269 + * Must be used only if the Latin-1 range is in fact linear 1.270 + * (trie->isLatin1Linear). 1.271 + * 1.272 + * @param trie (const UTrie *, in) a pointer to the runtime trie structure 1.273 + * @return (const uint16_t *) pointer to values for Latin-1 code points 1.274 + */ 1.275 +#define UTRIE_GET16_LATIN1(trie) ((trie)->index+(trie)->indexLength+UTRIE_DATA_BLOCK_LENGTH) 1.276 + 1.277 +/** 1.278 + * Get a pointer to the contiguous part of the data array 1.279 + * for the Latin-1 range (U+0000..U+00ff). 1.280 + * Must be used only if the Latin-1 range is in fact linear 1.281 + * (trie->isLatin1Linear). 1.282 + * 1.283 + * @param trie (const UTrie *, in) a pointer to the runtime trie structure 1.284 + * @return (const uint32_t *) pointer to values for Latin-1 code points 1.285 + */ 1.286 +#define UTRIE_GET32_LATIN1(trie) ((trie)->data32+UTRIE_DATA_BLOCK_LENGTH) 1.287 + 1.288 +/** 1.289 + * Get a 16-bit trie value from a BMP code point (UChar, <=U+ffff). 1.290 + * c16 may be a lead surrogate, which may have a value including a folding offset. 1.291 + * 1.292 + * @param trie (const UTrie *, in) a pointer to the runtime trie structure 1.293 + * @param c16 (UChar, in) the input BMP code point 1.294 + * @return (uint16_t) trie lookup result 1.295 + */ 1.296 +#define UTRIE_GET16_FROM_LEAD(trie, c16) _UTRIE_GET_RAW(trie, index, 0, c16) 1.297 + 1.298 +/** 1.299 + * Get a 32-bit trie value from a BMP code point (UChar, <=U+ffff). 1.300 + * c16 may be a lead surrogate, which may have a value including a folding offset. 1.301 + * 1.302 + * @param trie (const UTrie *, in) a pointer to the runtime trie structure 1.303 + * @param c16 (UChar, in) the input BMP code point 1.304 + * @return (uint32_t) trie lookup result 1.305 + */ 1.306 +#define UTRIE_GET32_FROM_LEAD(trie, c16) _UTRIE_GET_RAW(trie, data32, 0, c16) 1.307 + 1.308 +/** 1.309 + * Get a 16-bit trie value from a BMP code point (UChar, <=U+ffff). 1.310 + * Even lead surrogate code points are treated as normal code points, 1.311 + * with unfolded values that may differ from _FROM_LEAD() macro results for them. 1.312 + * 1.313 + * @param trie (const UTrie *, in) a pointer to the runtime trie structure 1.314 + * @param c16 (UChar, in) the input BMP code point 1.315 + * @return (uint16_t) trie lookup result 1.316 + */ 1.317 +#define UTRIE_GET16_FROM_BMP(trie, c16) _UTRIE_GET_FROM_BMP(trie, index, c16) 1.318 + 1.319 +/** 1.320 + * Get a 32-bit trie value from a BMP code point (UChar, <=U+ffff). 1.321 + * Even lead surrogate code points are treated as normal code points, 1.322 + * with unfolded values that may differ from _FROM_LEAD() macro results for them. 1.323 + * 1.324 + * @param trie (const UTrie *, in) a pointer to the runtime trie structure 1.325 + * @param c16 (UChar, in) the input BMP code point 1.326 + * @return (uint32_t) trie lookup result 1.327 + */ 1.328 +#define UTRIE_GET32_FROM_BMP(trie, c16) _UTRIE_GET_FROM_BMP(trie, data32, c16) 1.329 + 1.330 +/** 1.331 + * Get a 16-bit trie value from a code point. 1.332 + * Even lead surrogate code points are treated as normal code points, 1.333 + * with unfolded values that may differ from _FROM_LEAD() macro results for them. 1.334 + * 1.335 + * @param trie (const UTrie *, in) a pointer to the runtime trie structure 1.336 + * @param c32 (UChar32, in) the input code point 1.337 + * @param result (uint16_t, out) uint16_t variable for the trie lookup result 1.338 + */ 1.339 +#define UTRIE_GET16(trie, c32, result) _UTRIE_GET(trie, index, c32, result, uint16_t) 1.340 + 1.341 +/** 1.342 + * Get a 32-bit trie value from a code point. 1.343 + * Even lead surrogate code points are treated as normal code points, 1.344 + * with unfolded values that may differ from _FROM_LEAD() macro results for them. 1.345 + * 1.346 + * @param trie (const UTrie *, in) a pointer to the runtime trie structure 1.347 + * @param c32 (UChar32, in) the input code point 1.348 + * @param result (uint32_t, out) uint32_t variable for the trie lookup result 1.349 + */ 1.350 +#define UTRIE_GET32(trie, c32, result) _UTRIE_GET(trie, data32, c32, result, uint32_t) 1.351 + 1.352 +/** 1.353 + * Get the next code point (c, c2), post-increment src, 1.354 + * and get a 16-bit value from the trie. 1.355 + * 1.356 + * @param trie (const UTrie *, in) a pointer to the runtime trie structure 1.357 + * @param src (const UChar *, in/out) the source text pointer 1.358 + * @param limit (const UChar *, in) the limit pointer for the text, or NULL 1.359 + * @param c (UChar, out) variable for the BMP or lead code unit 1.360 + * @param c2 (UChar, out) variable for 0 or the trail code unit 1.361 + * @param result (uint16_t, out) uint16_t variable for the trie lookup result 1.362 + */ 1.363 +#define UTRIE_NEXT16(trie, src, limit, c, c2, result) _UTRIE_NEXT(trie, index, src, limit, c, c2, result, uint16_t) 1.364 + 1.365 +/** 1.366 + * Get the next code point (c, c2), post-increment src, 1.367 + * and get a 32-bit value from the trie. 1.368 + * 1.369 + * @param trie (const UTrie *, in) a pointer to the runtime trie structure 1.370 + * @param src (const UChar *, in/out) the source text pointer 1.371 + * @param limit (const UChar *, in) the limit pointer for the text, or NULL 1.372 + * @param c (UChar, out) variable for the BMP or lead code unit 1.373 + * @param c2 (UChar, out) variable for 0 or the trail code unit 1.374 + * @param result (uint32_t, out) uint32_t variable for the trie lookup result 1.375 + */ 1.376 +#define UTRIE_NEXT32(trie, src, limit, c, c2, result) _UTRIE_NEXT(trie, data32, src, limit, c, c2, result, uint32_t) 1.377 + 1.378 +/** 1.379 + * Get the previous code point (c, c2), pre-decrement src, 1.380 + * and get a 16-bit value from the trie. 1.381 + * 1.382 + * @param trie (const UTrie *, in) a pointer to the runtime trie structure 1.383 + * @param start (const UChar *, in) the start pointer for the text, or NULL 1.384 + * @param src (const UChar *, in/out) the source text pointer 1.385 + * @param c (UChar, out) variable for the BMP or lead code unit 1.386 + * @param c2 (UChar, out) variable for 0 or the trail code unit 1.387 + * @param result (uint16_t, out) uint16_t variable for the trie lookup result 1.388 + */ 1.389 +#define UTRIE_PREVIOUS16(trie, start, src, c, c2, result) _UTRIE_PREVIOUS(trie, index, start, src, c, c2, result, uint16_t) 1.390 + 1.391 +/** 1.392 + * Get the previous code point (c, c2), pre-decrement src, 1.393 + * and get a 32-bit value from the trie. 1.394 + * 1.395 + * @param trie (const UTrie *, in) a pointer to the runtime trie structure 1.396 + * @param start (const UChar *, in) the start pointer for the text, or NULL 1.397 + * @param src (const UChar *, in/out) the source text pointer 1.398 + * @param c (UChar, out) variable for the BMP or lead code unit 1.399 + * @param c2 (UChar, out) variable for 0 or the trail code unit 1.400 + * @param result (uint32_t, out) uint32_t variable for the trie lookup result 1.401 + */ 1.402 +#define UTRIE_PREVIOUS32(trie, start, src, c, c2, result) _UTRIE_PREVIOUS(trie, data32, start, src, c, c2, result, uint32_t) 1.403 + 1.404 +/** 1.405 + * Get a 16-bit trie value from a pair of surrogates. 1.406 + * 1.407 + * @param trie (const UTrie *, in) a pointer to the runtime trie structure 1.408 + * @param c (UChar, in) a lead surrogate 1.409 + * @param c2 (UChar, in) a trail surrogate 1.410 + * @param result (uint16_t, out) uint16_t variable for the trie lookup result 1.411 + */ 1.412 +#define UTRIE_GET16_FROM_PAIR(trie, c, c2, result) _UTRIE_GET_FROM_PAIR(trie, index, c, c2, result, uint16_t) 1.413 + 1.414 +/** 1.415 + * Get a 32-bit trie value from a pair of surrogates. 1.416 + * 1.417 + * @param trie (const UTrie *, in) a pointer to the runtime trie structure 1.418 + * @param c (UChar, in) a lead surrogate 1.419 + * @param c2 (UChar, in) a trail surrogate 1.420 + * @param result (uint32_t, out) uint32_t variable for the trie lookup result 1.421 + */ 1.422 +#define UTRIE_GET32_FROM_PAIR(trie, c, c2, result) _UTRIE_GET_FROM_PAIR(trie, data32, c, c2, result, uint32_t) 1.423 + 1.424 +/** 1.425 + * Get a 16-bit trie value from a folding offset (from the value of a lead surrogate) 1.426 + * and a trail surrogate. 1.427 + * 1.428 + * @param trie (const UTrie *, in) a pointer to the runtime trie structure 1.429 + * @param offset (int32_t, in) the folding offset from the value of a lead surrogate 1.430 + * @param c2 (UChar, in) a trail surrogate (only the 10 low bits are significant) 1.431 + * @return (uint16_t) trie lookup result 1.432 + */ 1.433 +#define UTRIE_GET16_FROM_OFFSET_TRAIL(trie, offset, c2) _UTRIE_GET_RAW(trie, index, offset, (c2)&0x3ff) 1.434 + 1.435 +/** 1.436 + * Get a 32-bit trie value from a folding offset (from the value of a lead surrogate) 1.437 + * and a trail surrogate. 1.438 + * 1.439 + * @param trie (const UTrie *, in) a pointer to the runtime trie structure 1.440 + * @param offset (int32_t, in) the folding offset from the value of a lead surrogate 1.441 + * @param c2 (UChar, in) a trail surrogate (only the 10 low bits are significant) 1.442 + * @return (uint32_t) trie lookup result 1.443 + */ 1.444 +#define UTRIE_GET32_FROM_OFFSET_TRAIL(trie, offset, c2) _UTRIE_GET_RAW(trie, data32, offset, (c2)&0x3ff) 1.445 + 1.446 +/* enumeration callback types */ 1.447 + 1.448 +/** 1.449 + * Callback from utrie_enum(), extracts a uint32_t value from a 1.450 + * trie value. This value will be passed on to the UTrieEnumRange function. 1.451 + * 1.452 + * @param context an opaque pointer, as passed into utrie_enum() 1.453 + * @param value a value from the trie 1.454 + * @return the value that is to be passed on to the UTrieEnumRange function 1.455 + */ 1.456 +typedef uint32_t U_CALLCONV 1.457 +UTrieEnumValue(const void *context, uint32_t value); 1.458 + 1.459 +/** 1.460 + * Callback from utrie_enum(), is called for each contiguous range 1.461 + * of code points with the same value as retrieved from the trie and 1.462 + * transformed by the UTrieEnumValue function. 1.463 + * 1.464 + * The callback function can stop the enumeration by returning FALSE. 1.465 + * 1.466 + * @param context an opaque pointer, as passed into utrie_enum() 1.467 + * @param start the first code point in a contiguous range with value 1.468 + * @param limit one past the last code point in a contiguous range with value 1.469 + * @param value the value that is set for all code points in [start..limit[ 1.470 + * @return FALSE to stop the enumeration 1.471 + */ 1.472 +typedef UBool U_CALLCONV 1.473 +UTrieEnumRange(const void *context, UChar32 start, UChar32 limit, uint32_t value); 1.474 + 1.475 +/** 1.476 + * Enumerate efficiently all values in a trie. 1.477 + * For each entry in the trie, the value to be delivered is passed through 1.478 + * the UTrieEnumValue function. 1.479 + * The value is unchanged if that function pointer is NULL. 1.480 + * 1.481 + * For each contiguous range of code points with a given value, 1.482 + * the UTrieEnumRange function is called. 1.483 + * 1.484 + * @param trie a pointer to the runtime trie structure 1.485 + * @param enumValue a pointer to a function that may transform the trie entry value, 1.486 + * or NULL if the values from the trie are to be used directly 1.487 + * @param enumRange a pointer to a function that is called for each contiguous range 1.488 + * of code points with the same value 1.489 + * @param context an opaque pointer that is passed on to the callback functions 1.490 + */ 1.491 +U_CAPI void U_EXPORT2 1.492 +utrie_enum(const UTrie *trie, 1.493 + UTrieEnumValue *enumValue, UTrieEnumRange *enumRange, const void *context); 1.494 + 1.495 +/** 1.496 + * Unserialize a trie from 32-bit-aligned memory. 1.497 + * Inverse of utrie_serialize(). 1.498 + * Fills the UTrie runtime trie structure with the settings for the trie data. 1.499 + * 1.500 + * @param trie a pointer to the runtime trie structure 1.501 + * @param data a pointer to 32-bit-aligned memory containing trie data 1.502 + * @param length the number of bytes available at data 1.503 + * @param pErrorCode an in/out ICU UErrorCode 1.504 + * @return the number of bytes at data taken up by the trie data 1.505 + */ 1.506 +U_CAPI int32_t U_EXPORT2 1.507 +utrie_unserialize(UTrie *trie, const void *data, int32_t length, UErrorCode *pErrorCode); 1.508 + 1.509 +/** 1.510 + * "Unserialize" a dummy trie. 1.511 + * A dummy trie is an empty runtime trie, used when a real data trie cannot 1.512 + * be loaded. 1.513 + * 1.514 + * The input memory is filled so that the trie always returns the initialValue, 1.515 + * or the leadUnitValue for lead surrogate code points. 1.516 + * The Latin-1 part is always set up to be linear. 1.517 + * 1.518 + * @param trie a pointer to the runtime trie structure 1.519 + * @param data a pointer to 32-bit-aligned memory to be filled with the dummy trie data 1.520 + * @param length the number of bytes available at data (recommended to use UTRIE_DUMMY_SIZE) 1.521 + * @param initialValue the initial value that is set for all code points 1.522 + * @param leadUnitValue the value for lead surrogate code _units_ that do not 1.523 + * have associated supplementary data 1.524 + * @param pErrorCode an in/out ICU UErrorCode 1.525 + * 1.526 + * @see UTRIE_DUMMY_SIZE 1.527 + * @see utrie_open 1.528 + */ 1.529 +U_CAPI int32_t U_EXPORT2 1.530 +utrie_unserializeDummy(UTrie *trie, 1.531 + void *data, int32_t length, 1.532 + uint32_t initialValue, uint32_t leadUnitValue, 1.533 + UBool make16BitTrie, 1.534 + UErrorCode *pErrorCode); 1.535 + 1.536 +/** 1.537 + * Default implementation for UTrie.getFoldingOffset, set automatically by 1.538 + * utrie_unserialize(). 1.539 + * Simply returns the lead surrogate's value itself - which is the inverse 1.540 + * of the default folding function used by utrie_serialize(). 1.541 + * Exported for static const UTrie structures. 1.542 + * 1.543 + * @see UTrieGetFoldingOffset 1.544 + */ 1.545 +U_CAPI int32_t U_EXPORT2 1.546 +utrie_defaultGetFoldingOffset(uint32_t data); 1.547 + 1.548 +/* Building a trie ----------------------------------------------------------*/ 1.549 + 1.550 +/** 1.551 + * Build-time trie structure. 1.552 + * Opaque definition, here only to make fillIn parameters possible 1.553 + * for utrie_open() and utrie_clone(). 1.554 + */ 1.555 +struct UNewTrie { 1.556 + /** 1.557 + * Index values at build-time are 32 bits wide for easier processing. 1.558 + * Bit 31 is set if the data block is used by multiple index values (from utrie_setRange()). 1.559 + */ 1.560 + int32_t index[UTRIE_MAX_INDEX_LENGTH]; 1.561 + uint32_t *data; 1.562 + 1.563 + uint32_t leadUnitValue; 1.564 + int32_t indexLength, dataCapacity, dataLength; 1.565 + UBool isAllocated, isDataAllocated; 1.566 + UBool isLatin1Linear, isCompacted; 1.567 + 1.568 + /** 1.569 + * Map of adjusted indexes, used in utrie_compact(). 1.570 + * Maps from original indexes to new ones. 1.571 + */ 1.572 + int32_t map[UTRIE_MAX_BUILD_TIME_DATA_LENGTH>>UTRIE_SHIFT]; 1.573 +}; 1.574 + 1.575 +typedef struct UNewTrie UNewTrie; 1.576 + 1.577 +/** 1.578 + * Build-time trie callback function, used with utrie_serialize(). 1.579 + * This function calculates a lead surrogate's value including a folding offset 1.580 + * from the 1024 supplementary code points [start..start+1024[ . 1.581 + * It is U+10000 <= start <= U+10fc00 and (start&0x3ff)==0. 1.582 + * 1.583 + * The folding offset is provided by the caller. 1.584 + * It is offset=UTRIE_BMP_INDEX_LENGTH+n*UTRIE_SURROGATE_BLOCK_COUNT with n=0..1023. 1.585 + * Instead of the offset itself, n can be stored in 10 bits - 1.586 + * or fewer if it can be assumed that few lead surrogates have associated data. 1.587 + * 1.588 + * The returned value must be 1.589 + * - not zero if and only if there is relevant data 1.590 + * for the corresponding 1024 supplementary code points 1.591 + * - such that UTrie.getFoldingOffset(UNewTrieGetFoldedValue(..., offset))==offset 1.592 + * 1.593 + * @return a folded value, or 0 if there is no relevant data for the lead surrogate. 1.594 + */ 1.595 +typedef uint32_t U_CALLCONV 1.596 +UNewTrieGetFoldedValue(UNewTrie *trie, UChar32 start, int32_t offset); 1.597 + 1.598 +/** 1.599 + * Open a build-time trie structure. 1.600 + * The size of the build-time data array is specified to avoid allocating a large 1.601 + * array in all cases. The array itself can also be passed in. 1.602 + * 1.603 + * Although the trie is never fully expanded to a linear array, especially when 1.604 + * utrie_setRange32() is used, the data array could be large during build time. 1.605 + * The maximum length is 1.606 + * UTRIE_MAX_BUILD_TIME_DATA_LENGTH=0x110000+UTRIE_DATA_BLOCK_LENGTH+0x400. 1.607 + * (Number of Unicode code points + one all-initial-value block + 1.608 + * possible duplicate entries for 1024 lead surrogates.) 1.609 + * (UTRIE_DATA_BLOCK_LENGTH<=0x200 in all cases.) 1.610 + * 1.611 + * @param fillIn a pointer to a UNewTrie structure to be initialized (will not be released), or 1.612 + * NULL if one is to be allocated 1.613 + * @param aliasData a pointer to a data array to be used (will not be released), or 1.614 + * NULL if one is to be allocated 1.615 + * @param maxDataLength the capacity of aliasData (if not NULL) or 1.616 + * the length of the data array to be allocated 1.617 + * @param initialValue the initial value that is set for all code points 1.618 + * @param leadUnitValue the value for lead surrogate code _units_ that do not 1.619 + * have associated supplementary data 1.620 + * @param latin1Linear a flag indicating whether the Latin-1 range is to be allocated and 1.621 + * kept in a linear, contiguous part of the data array 1.622 + * @return a pointer to the initialized fillIn or the allocated and initialized new UNewTrie 1.623 + */ 1.624 +U_CAPI UNewTrie * U_EXPORT2 1.625 +utrie_open(UNewTrie *fillIn, 1.626 + uint32_t *aliasData, int32_t maxDataLength, 1.627 + uint32_t initialValue, uint32_t leadUnitValue, 1.628 + UBool latin1Linear); 1.629 + 1.630 +/** 1.631 + * Clone a build-time trie structure with all entries. 1.632 + * 1.633 + * @param fillIn like in utrie_open() 1.634 + * @param other the build-time trie structure to clone 1.635 + * @param aliasData like in utrie_open(), 1.636 + * used if aliasDataLength>=(capacity of other's data array) 1.637 + * @param aliasDataLength the length of aliasData 1.638 + * @return a pointer to the initialized fillIn or the allocated and initialized new UNewTrie 1.639 + */ 1.640 +U_CAPI UNewTrie * U_EXPORT2 1.641 +utrie_clone(UNewTrie *fillIn, const UNewTrie *other, uint32_t *aliasData, int32_t aliasDataLength); 1.642 + 1.643 +/** 1.644 + * Close a build-time trie structure, and release memory 1.645 + * that was allocated by utrie_open() or utrie_clone(). 1.646 + * 1.647 + * @param trie the build-time trie 1.648 + */ 1.649 +U_CAPI void U_EXPORT2 1.650 +utrie_close(UNewTrie *trie); 1.651 + 1.652 +/** 1.653 + * Get the data array of a build-time trie. 1.654 + * The data may be modified, but entries that are equal before 1.655 + * must still be equal after modification. 1.656 + * 1.657 + * @param trie the build-time trie 1.658 + * @param pLength (out) a pointer to a variable that receives the number 1.659 + * of entries in the data array 1.660 + * @return the data array 1.661 + */ 1.662 +U_CAPI uint32_t * U_EXPORT2 1.663 +utrie_getData(UNewTrie *trie, int32_t *pLength); 1.664 + 1.665 +/** 1.666 + * Set a value for a code point. 1.667 + * 1.668 + * @param trie the build-time trie 1.669 + * @param c the code point 1.670 + * @param value the value 1.671 + * @return FALSE if a failure occurred (illegal argument or data array overrun) 1.672 + */ 1.673 +U_CAPI UBool U_EXPORT2 1.674 +utrie_set32(UNewTrie *trie, UChar32 c, uint32_t value); 1.675 + 1.676 +/** 1.677 + * Get a value from a code point as stored in the build-time trie. 1.678 + * 1.679 + * @param trie the build-time trie 1.680 + * @param c the code point 1.681 + * @param pInBlockZero if not NULL, then *pInBlockZero is set to TRUE 1.682 + * iff the value is retrieved from block 0; 1.683 + * block 0 is the all-initial-value initial block 1.684 + * @return the value 1.685 + */ 1.686 +U_CAPI uint32_t U_EXPORT2 1.687 +utrie_get32(UNewTrie *trie, UChar32 c, UBool *pInBlockZero); 1.688 + 1.689 +/** 1.690 + * Set a value in a range of code points [start..limit[. 1.691 + * All code points c with start<=c<limit will get the value if 1.692 + * overwrite is TRUE or if the old value is 0. 1.693 + * 1.694 + * @param trie the build-time trie 1.695 + * @param start the first code point to get the value 1.696 + * @param limit one past the last code point to get the value 1.697 + * @param value the value 1.698 + * @param overwrite flag for whether old non-initial values are to be overwritten 1.699 + * @return FALSE if a failure occurred (illegal argument or data array overrun) 1.700 + */ 1.701 +U_CAPI UBool U_EXPORT2 1.702 +utrie_setRange32(UNewTrie *trie, UChar32 start, UChar32 limit, uint32_t value, UBool overwrite); 1.703 + 1.704 +/** 1.705 + * Compact the build-time trie after all values are set, and then 1.706 + * serialize it into 32-bit aligned memory. 1.707 + * 1.708 + * After this, the trie can only be serizalized again and/or closed; 1.709 + * no further values can be added. 1.710 + * 1.711 + * @see utrie_unserialize() 1.712 + * 1.713 + * @param trie the build-time trie 1.714 + * @param data a pointer to 32-bit-aligned memory for the trie data 1.715 + * @param capacity the number of bytes available at data 1.716 + * @param getFoldedValue a callback function that calculates the value for 1.717 + * a lead surrogate from all of its supplementary code points 1.718 + * and the folding offset; 1.719 + * if NULL, then a default function is used which returns just 1.720 + * the input offset when there are any non-initial-value entries 1.721 + * @param reduceTo16Bits flag for whether the values are to be reduced to a 1.722 + * width of 16 bits for serialization and runtime 1.723 + * @param pErrorCode a UErrorCode argument; among other possible error codes: 1.724 + * - U_BUFFER_OVERFLOW_ERROR if the data storage block is too small for serialization 1.725 + * - U_MEMORY_ALLOCATION_ERROR if the trie data array is too small 1.726 + * - U_INDEX_OUTOFBOUNDS_ERROR if the index or data arrays are too long after compaction for serialization 1.727 + * 1.728 + * @return the number of bytes written for the trie 1.729 + */ 1.730 +U_CAPI int32_t U_EXPORT2 1.731 +utrie_serialize(UNewTrie *trie, void *data, int32_t capacity, 1.732 + UNewTrieGetFoldedValue *getFoldedValue, 1.733 + UBool reduceTo16Bits, 1.734 + UErrorCode *pErrorCode); 1.735 + 1.736 +/** 1.737 + * Swap a serialized UTrie. 1.738 + * @internal 1.739 + */ 1.740 +U_CAPI int32_t U_EXPORT2 1.741 +utrie_swap(const UDataSwapper *ds, 1.742 + const void *inData, int32_t length, void *outData, 1.743 + UErrorCode *pErrorCode); 1.744 + 1.745 +/* serialization ------------------------------------------------------------ */ 1.746 + 1.747 +/** 1.748 + * Trie data structure in serialized form: 1.749 + * 1.750 + * UTrieHeader header; 1.751 + * uint16_t index[header.indexLength]; 1.752 + * uint16_t data[header.dataLength]; 1.753 + * @internal 1.754 + */ 1.755 +typedef struct UTrieHeader { 1.756 + /** "Trie" in big-endian US-ASCII (0x54726965) */ 1.757 + uint32_t signature; 1.758 + 1.759 + /** 1.760 + * options bit field: 1.761 + * 9 1=Latin-1 data is stored linearly at data+UTRIE_DATA_BLOCK_LENGTH 1.762 + * 8 0=16-bit data, 1=32-bit data 1.763 + * 7..4 UTRIE_INDEX_SHIFT // 0..UTRIE_SHIFT 1.764 + * 3..0 UTRIE_SHIFT // 1..9 1.765 + */ 1.766 + uint32_t options; 1.767 + 1.768 + /** indexLength is a multiple of UTRIE_SURROGATE_BLOCK_COUNT */ 1.769 + int32_t indexLength; 1.770 + 1.771 + /** dataLength>=UTRIE_DATA_BLOCK_LENGTH */ 1.772 + int32_t dataLength; 1.773 +} UTrieHeader; 1.774 + 1.775 +/** 1.776 + * Constants for use with UTrieHeader.options. 1.777 + * @internal 1.778 + */ 1.779 +enum { 1.780 + /** Mask to get the UTRIE_SHIFT value from options. */ 1.781 + UTRIE_OPTIONS_SHIFT_MASK=0xf, 1.782 + 1.783 + /** Shift options right this much to get the UTRIE_INDEX_SHIFT value. */ 1.784 + UTRIE_OPTIONS_INDEX_SHIFT=4, 1.785 + 1.786 + /** If set, then the data (stage 2) array is 32 bits wide. */ 1.787 + UTRIE_OPTIONS_DATA_IS_32_BIT=0x100, 1.788 + 1.789 + /** 1.790 + * If set, then Latin-1 data (for U+0000..U+00ff) is stored in the data (stage 2) array 1.791 + * as a simple, linear array at data+UTRIE_DATA_BLOCK_LENGTH. 1.792 + */ 1.793 + UTRIE_OPTIONS_LATIN1_IS_LINEAR=0x200 1.794 +}; 1.795 + 1.796 +U_CDECL_END 1.797 + 1.798 +#endif