intl/icu/source/common/utrie.h

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

michael@0 1 /*
michael@0 2 ******************************************************************************
michael@0 3 *
michael@0 4 * Copyright (C) 2001-2011, International Business Machines
michael@0 5 * Corporation and others. All Rights Reserved.
michael@0 6 *
michael@0 7 ******************************************************************************
michael@0 8 * file name: utrie.h
michael@0 9 * encoding: US-ASCII
michael@0 10 * tab size: 8 (not used)
michael@0 11 * indentation:4
michael@0 12 *
michael@0 13 * created on: 2001nov08
michael@0 14 * created by: Markus W. Scherer
michael@0 15 */
michael@0 16
michael@0 17 #ifndef __UTRIE_H__
michael@0 18 #define __UTRIE_H__
michael@0 19
michael@0 20 #include "unicode/utypes.h"
michael@0 21 #include "unicode/utf16.h"
michael@0 22 #include "udataswp.h"
michael@0 23
michael@0 24 U_CDECL_BEGIN
michael@0 25
michael@0 26 /**
michael@0 27 * \file
michael@0 28 *
michael@0 29 * This is a common implementation of a "folded" trie.
michael@0 30 * It is a kind of compressed, serializable table of 16- or 32-bit values associated with
michael@0 31 * Unicode code points (0..0x10ffff).
michael@0 32 *
michael@0 33 * This implementation is optimized for getting values while walking forward
michael@0 34 * through a UTF-16 string.
michael@0 35 * Therefore, the simplest and fastest access macros are the
michael@0 36 * _FROM_LEAD() and _FROM_OFFSET_TRAIL() macros.
michael@0 37 *
michael@0 38 * The _FROM_BMP() macros are a little more complicated; they get values
michael@0 39 * even for lead surrogate code _points_, while the _FROM_LEAD() macros
michael@0 40 * get special "folded" values for lead surrogate code _units_ if
michael@0 41 * there is relevant data associated with them.
michael@0 42 * From such a folded value, an offset needs to be extracted to supply
michael@0 43 * to the _FROM_OFFSET_TRAIL() macros.
michael@0 44 *
michael@0 45 * Most of the more complex (and more convenient) functions/macros call a callback function
michael@0 46 * to get that offset from the folded value for a lead surrogate unit.
michael@0 47 */
michael@0 48
michael@0 49 /**
michael@0 50 * Trie constants, defining shift widths, index array lengths, etc.
michael@0 51 */
michael@0 52 enum {
michael@0 53 /** Shift size for shifting right the input index. 1..9 */
michael@0 54 UTRIE_SHIFT=5,
michael@0 55
michael@0 56 /** Number of data values in a stage 2 (data array) block. 2, 4, 8, .., 0x200 */
michael@0 57 UTRIE_DATA_BLOCK_LENGTH=1<<UTRIE_SHIFT,
michael@0 58
michael@0 59 /** Mask for getting the lower bits from the input index. */
michael@0 60 UTRIE_MASK=UTRIE_DATA_BLOCK_LENGTH-1,
michael@0 61
michael@0 62 /**
michael@0 63 * Lead surrogate code points' index displacement in the index array.
michael@0 64 * 0x10000-0xd800=0x2800
michael@0 65 */
michael@0 66 UTRIE_LEAD_INDEX_DISP=0x2800>>UTRIE_SHIFT,
michael@0 67
michael@0 68 /**
michael@0 69 * Shift size for shifting left the index array values.
michael@0 70 * Increases possible data size with 16-bit index values at the cost
michael@0 71 * of compactability.
michael@0 72 * This requires blocks of stage 2 data to be aligned by UTRIE_DATA_GRANULARITY.
michael@0 73 * 0..UTRIE_SHIFT
michael@0 74 */
michael@0 75 UTRIE_INDEX_SHIFT=2,
michael@0 76
michael@0 77 /** The alignment size of a stage 2 data block. Also the granularity for compaction. */
michael@0 78 UTRIE_DATA_GRANULARITY=1<<UTRIE_INDEX_SHIFT,
michael@0 79
michael@0 80 /** Number of bits of a trail surrogate that are used in index table lookups. */
michael@0 81 UTRIE_SURROGATE_BLOCK_BITS=10-UTRIE_SHIFT,
michael@0 82
michael@0 83 /**
michael@0 84 * Number of index (stage 1) entries per lead surrogate.
michael@0 85 * Same as number of index entries for 1024 trail surrogates,
michael@0 86 * ==0x400>>UTRIE_SHIFT
michael@0 87 */
michael@0 88 UTRIE_SURROGATE_BLOCK_COUNT=(1<<UTRIE_SURROGATE_BLOCK_BITS),
michael@0 89
michael@0 90 /** Length of the BMP portion of the index (stage 1) array. */
michael@0 91 UTRIE_BMP_INDEX_LENGTH=0x10000>>UTRIE_SHIFT
michael@0 92 };
michael@0 93
michael@0 94 /**
michael@0 95 * Length of the index (stage 1) array before folding.
michael@0 96 * Maximum number of Unicode code points (0x110000) shifted right by UTRIE_SHIFT.
michael@0 97 */
michael@0 98 #define UTRIE_MAX_INDEX_LENGTH (0x110000>>UTRIE_SHIFT)
michael@0 99
michael@0 100 /**
michael@0 101 * Maximum length of the runtime data (stage 2) array.
michael@0 102 * Limited by 16-bit index values that are left-shifted by UTRIE_INDEX_SHIFT.
michael@0 103 */
michael@0 104 #define UTRIE_MAX_DATA_LENGTH (0x10000<<UTRIE_INDEX_SHIFT)
michael@0 105
michael@0 106 /**
michael@0 107 * Maximum length of the build-time data (stage 2) array.
michael@0 108 * The maximum length is 0x110000+UTRIE_DATA_BLOCK_LENGTH+0x400.
michael@0 109 * (Number of Unicode code points + one all-initial-value block +
michael@0 110 * possible duplicate entries for 1024 lead surrogates.)
michael@0 111 */
michael@0 112 #define UTRIE_MAX_BUILD_TIME_DATA_LENGTH (0x110000+UTRIE_DATA_BLOCK_LENGTH+0x400)
michael@0 113
michael@0 114 /**
michael@0 115 * Number of bytes for a dummy trie.
michael@0 116 * A dummy trie is an empty runtime trie, used when a real data trie cannot
michael@0 117 * be loaded.
michael@0 118 * The number of bytes works for Latin-1-linear tries with 32-bit data
michael@0 119 * (worst case).
michael@0 120 *
michael@0 121 * Calculation:
michael@0 122 * BMP index + 1 index block for lead surrogate code points +
michael@0 123 * Latin-1-linear array + 1 data block for lead surrogate code points
michael@0 124 *
michael@0 125 * Latin-1: if(UTRIE_SHIFT<=8) { 256 } else { included in first data block }
michael@0 126 *
michael@0 127 * @see utrie_unserializeDummy
michael@0 128 */
michael@0 129 #define UTRIE_DUMMY_SIZE ((UTRIE_BMP_INDEX_LENGTH+UTRIE_SURROGATE_BLOCK_COUNT)*2+(UTRIE_SHIFT<=8?256:UTRIE_DATA_BLOCK_LENGTH)*4+UTRIE_DATA_BLOCK_LENGTH*4)
michael@0 130
michael@0 131 /**
michael@0 132 * Runtime UTrie callback function.
michael@0 133 * Extract from a lead surrogate's data the
michael@0 134 * index array offset of the indexes for that lead surrogate.
michael@0 135 *
michael@0 136 * @param data data value for a surrogate from the trie, including the folding offset
michael@0 137 * @return offset>=UTRIE_BMP_INDEX_LENGTH, or 0 if there is no data for the lead surrogate
michael@0 138 */
michael@0 139 typedef int32_t U_CALLCONV
michael@0 140 UTrieGetFoldingOffset(uint32_t data);
michael@0 141
michael@0 142 /**
michael@0 143 * Run-time Trie structure.
michael@0 144 *
michael@0 145 * Either the data table is 16 bits wide and accessed via the index
michael@0 146 * pointer, with each index item increased by indexLength;
michael@0 147 * in this case, data32==NULL.
michael@0 148 *
michael@0 149 * Or the data table is 32 bits wide and accessed via the data32 pointer.
michael@0 150 */
michael@0 151 struct UTrie {
michael@0 152 const uint16_t *index;
michael@0 153 const uint32_t *data32; /* NULL if 16b data is used via index */
michael@0 154
michael@0 155 /**
michael@0 156 * This function is not used in _FROM_LEAD, _FROM_BMP, and _FROM_OFFSET_TRAIL macros.
michael@0 157 * If convenience macros like _GET16 or _NEXT32 are used, this function must be set.
michael@0 158 *
michael@0 159 * utrie_unserialize() sets a default function which simply returns
michael@0 160 * the lead surrogate's value itself - which is the inverse of the default
michael@0 161 * folding function used by utrie_serialize().
michael@0 162 *
michael@0 163 * @see UTrieGetFoldingOffset
michael@0 164 */
michael@0 165 UTrieGetFoldingOffset *getFoldingOffset;
michael@0 166
michael@0 167 int32_t indexLength, dataLength;
michael@0 168 uint32_t initialValue;
michael@0 169 UBool isLatin1Linear;
michael@0 170 };
michael@0 171
michael@0 172 #ifndef __UTRIE2_H__
michael@0 173 typedef struct UTrie UTrie;
michael@0 174 #endif
michael@0 175
michael@0 176 /** Internal trie getter from an offset (0 if c16 is a BMP/lead units) and a 16-bit unit */
michael@0 177 #define _UTRIE_GET_RAW(trie, data, offset, c16) \
michael@0 178 (trie)->data[ \
michael@0 179 ((int32_t)((trie)->index[(offset)+((c16)>>UTRIE_SHIFT)])<<UTRIE_INDEX_SHIFT)+ \
michael@0 180 ((c16)&UTRIE_MASK) \
michael@0 181 ]
michael@0 182
michael@0 183 /** Internal trie getter from a pair of surrogates */
michael@0 184 #define _UTRIE_GET_FROM_PAIR(trie, data, c, c2, result, resultType) { \
michael@0 185 int32_t __offset; \
michael@0 186 \
michael@0 187 /* get data for lead surrogate */ \
michael@0 188 (result)=_UTRIE_GET_RAW((trie), data, 0, (c)); \
michael@0 189 __offset=(trie)->getFoldingOffset(result); \
michael@0 190 \
michael@0 191 /* get the real data from the folded lead/trail units */ \
michael@0 192 if(__offset>0) { \
michael@0 193 (result)=_UTRIE_GET_RAW((trie), data, __offset, (c2)&0x3ff); \
michael@0 194 } else { \
michael@0 195 (result)=(resultType)((trie)->initialValue); \
michael@0 196 } \
michael@0 197 }
michael@0 198
michael@0 199 /** Internal trie getter from a BMP code point, treating a lead surrogate as a normal code point */
michael@0 200 #define _UTRIE_GET_FROM_BMP(trie, data, c16) \
michael@0 201 _UTRIE_GET_RAW(trie, data, 0xd800<=(c16) && (c16)<=0xdbff ? UTRIE_LEAD_INDEX_DISP : 0, c16);
michael@0 202
michael@0 203 /**
michael@0 204 * Internal trie getter from a code point.
michael@0 205 * Could be faster(?) but longer with
michael@0 206 * if((c32)<=0xd7ff) { (result)=_UTRIE_GET_RAW(trie, data, 0, c32); }
michael@0 207 */
michael@0 208 #define _UTRIE_GET(trie, data, c32, result, resultType) \
michael@0 209 if((uint32_t)(c32)<=0xffff) { \
michael@0 210 /* BMP code points */ \
michael@0 211 (result)=_UTRIE_GET_FROM_BMP(trie, data, c32); \
michael@0 212 } else if((uint32_t)(c32)<=0x10ffff) { \
michael@0 213 /* supplementary code point */ \
michael@0 214 UChar __lead16=U16_LEAD(c32); \
michael@0 215 _UTRIE_GET_FROM_PAIR(trie, data, __lead16, c32, result, resultType); \
michael@0 216 } else { \
michael@0 217 /* out of range */ \
michael@0 218 (result)=(resultType)((trie)->initialValue); \
michael@0 219 }
michael@0 220
michael@0 221 /** Internal next-post-increment: get the next code point (c, c2) and its data */
michael@0 222 #define _UTRIE_NEXT(trie, data, src, limit, c, c2, result, resultType) { \
michael@0 223 (c)=*(src)++; \
michael@0 224 if(!U16_IS_LEAD(c)) { \
michael@0 225 (c2)=0; \
michael@0 226 (result)=_UTRIE_GET_RAW((trie), data, 0, (c)); \
michael@0 227 } else if((src)!=(limit) && U16_IS_TRAIL((c2)=*(src))) { \
michael@0 228 ++(src); \
michael@0 229 _UTRIE_GET_FROM_PAIR((trie), data, (c), (c2), (result), resultType); \
michael@0 230 } else { \
michael@0 231 /* unpaired lead surrogate code point */ \
michael@0 232 (c2)=0; \
michael@0 233 (result)=_UTRIE_GET_RAW((trie), data, UTRIE_LEAD_INDEX_DISP, (c)); \
michael@0 234 } \
michael@0 235 }
michael@0 236
michael@0 237 /** Internal previous: get the previous code point (c, c2) and its data */
michael@0 238 #define _UTRIE_PREVIOUS(trie, data, start, src, c, c2, result, resultType) { \
michael@0 239 (c)=*--(src); \
michael@0 240 if(!U16_IS_SURROGATE(c)) { \
michael@0 241 (c2)=0; \
michael@0 242 (result)=_UTRIE_GET_RAW((trie), data, 0, (c)); \
michael@0 243 } else if(!U16_IS_SURROGATE_LEAD(c)) { \
michael@0 244 /* trail surrogate */ \
michael@0 245 if((start)!=(src) && U16_IS_LEAD((c2)=*((src)-1))) { \
michael@0 246 --(src); \
michael@0 247 (result)=(c); (c)=(c2); (c2)=(UChar)(result); /* swap c, c2 */ \
michael@0 248 _UTRIE_GET_FROM_PAIR((trie), data, (c), (c2), (result), resultType); \
michael@0 249 } else { \
michael@0 250 /* unpaired trail surrogate code point */ \
michael@0 251 (c2)=0; \
michael@0 252 (result)=_UTRIE_GET_RAW((trie), data, 0, (c)); \
michael@0 253 } \
michael@0 254 } else { \
michael@0 255 /* unpaired lead surrogate code point */ \
michael@0 256 (c2)=0; \
michael@0 257 (result)=_UTRIE_GET_RAW((trie), data, UTRIE_LEAD_INDEX_DISP, (c)); \
michael@0 258 } \
michael@0 259 }
michael@0 260
michael@0 261 /* Public UTrie API ---------------------------------------------------------*/
michael@0 262
michael@0 263 /**
michael@0 264 * Get a pointer to the contiguous part of the data array
michael@0 265 * for the Latin-1 range (U+0000..U+00ff).
michael@0 266 * Must be used only if the Latin-1 range is in fact linear
michael@0 267 * (trie->isLatin1Linear).
michael@0 268 *
michael@0 269 * @param trie (const UTrie *, in) a pointer to the runtime trie structure
michael@0 270 * @return (const uint16_t *) pointer to values for Latin-1 code points
michael@0 271 */
michael@0 272 #define UTRIE_GET16_LATIN1(trie) ((trie)->index+(trie)->indexLength+UTRIE_DATA_BLOCK_LENGTH)
michael@0 273
michael@0 274 /**
michael@0 275 * Get a pointer to the contiguous part of the data array
michael@0 276 * for the Latin-1 range (U+0000..U+00ff).
michael@0 277 * Must be used only if the Latin-1 range is in fact linear
michael@0 278 * (trie->isLatin1Linear).
michael@0 279 *
michael@0 280 * @param trie (const UTrie *, in) a pointer to the runtime trie structure
michael@0 281 * @return (const uint32_t *) pointer to values for Latin-1 code points
michael@0 282 */
michael@0 283 #define UTRIE_GET32_LATIN1(trie) ((trie)->data32+UTRIE_DATA_BLOCK_LENGTH)
michael@0 284
michael@0 285 /**
michael@0 286 * Get a 16-bit trie value from a BMP code point (UChar, <=U+ffff).
michael@0 287 * c16 may be a lead surrogate, which may have a value including a folding offset.
michael@0 288 *
michael@0 289 * @param trie (const UTrie *, in) a pointer to the runtime trie structure
michael@0 290 * @param c16 (UChar, in) the input BMP code point
michael@0 291 * @return (uint16_t) trie lookup result
michael@0 292 */
michael@0 293 #define UTRIE_GET16_FROM_LEAD(trie, c16) _UTRIE_GET_RAW(trie, index, 0, c16)
michael@0 294
michael@0 295 /**
michael@0 296 * Get a 32-bit trie value from a BMP code point (UChar, <=U+ffff).
michael@0 297 * c16 may be a lead surrogate, which may have a value including a folding offset.
michael@0 298 *
michael@0 299 * @param trie (const UTrie *, in) a pointer to the runtime trie structure
michael@0 300 * @param c16 (UChar, in) the input BMP code point
michael@0 301 * @return (uint32_t) trie lookup result
michael@0 302 */
michael@0 303 #define UTRIE_GET32_FROM_LEAD(trie, c16) _UTRIE_GET_RAW(trie, data32, 0, c16)
michael@0 304
michael@0 305 /**
michael@0 306 * Get a 16-bit trie value from a BMP code point (UChar, <=U+ffff).
michael@0 307 * Even lead surrogate code points are treated as normal code points,
michael@0 308 * with unfolded values that may differ from _FROM_LEAD() macro results for them.
michael@0 309 *
michael@0 310 * @param trie (const UTrie *, in) a pointer to the runtime trie structure
michael@0 311 * @param c16 (UChar, in) the input BMP code point
michael@0 312 * @return (uint16_t) trie lookup result
michael@0 313 */
michael@0 314 #define UTRIE_GET16_FROM_BMP(trie, c16) _UTRIE_GET_FROM_BMP(trie, index, c16)
michael@0 315
michael@0 316 /**
michael@0 317 * Get a 32-bit trie value from a BMP code point (UChar, <=U+ffff).
michael@0 318 * Even lead surrogate code points are treated as normal code points,
michael@0 319 * with unfolded values that may differ from _FROM_LEAD() macro results for them.
michael@0 320 *
michael@0 321 * @param trie (const UTrie *, in) a pointer to the runtime trie structure
michael@0 322 * @param c16 (UChar, in) the input BMP code point
michael@0 323 * @return (uint32_t) trie lookup result
michael@0 324 */
michael@0 325 #define UTRIE_GET32_FROM_BMP(trie, c16) _UTRIE_GET_FROM_BMP(trie, data32, c16)
michael@0 326
michael@0 327 /**
michael@0 328 * Get a 16-bit trie value from a code point.
michael@0 329 * Even lead surrogate code points are treated as normal code points,
michael@0 330 * with unfolded values that may differ from _FROM_LEAD() macro results for them.
michael@0 331 *
michael@0 332 * @param trie (const UTrie *, in) a pointer to the runtime trie structure
michael@0 333 * @param c32 (UChar32, in) the input code point
michael@0 334 * @param result (uint16_t, out) uint16_t variable for the trie lookup result
michael@0 335 */
michael@0 336 #define UTRIE_GET16(trie, c32, result) _UTRIE_GET(trie, index, c32, result, uint16_t)
michael@0 337
michael@0 338 /**
michael@0 339 * Get a 32-bit trie value from a code point.
michael@0 340 * Even lead surrogate code points are treated as normal code points,
michael@0 341 * with unfolded values that may differ from _FROM_LEAD() macro results for them.
michael@0 342 *
michael@0 343 * @param trie (const UTrie *, in) a pointer to the runtime trie structure
michael@0 344 * @param c32 (UChar32, in) the input code point
michael@0 345 * @param result (uint32_t, out) uint32_t variable for the trie lookup result
michael@0 346 */
michael@0 347 #define UTRIE_GET32(trie, c32, result) _UTRIE_GET(trie, data32, c32, result, uint32_t)
michael@0 348
michael@0 349 /**
michael@0 350 * Get the next code point (c, c2), post-increment src,
michael@0 351 * and get a 16-bit value from the trie.
michael@0 352 *
michael@0 353 * @param trie (const UTrie *, in) a pointer to the runtime trie structure
michael@0 354 * @param src (const UChar *, in/out) the source text pointer
michael@0 355 * @param limit (const UChar *, in) the limit pointer for the text, or NULL
michael@0 356 * @param c (UChar, out) variable for the BMP or lead code unit
michael@0 357 * @param c2 (UChar, out) variable for 0 or the trail code unit
michael@0 358 * @param result (uint16_t, out) uint16_t variable for the trie lookup result
michael@0 359 */
michael@0 360 #define UTRIE_NEXT16(trie, src, limit, c, c2, result) _UTRIE_NEXT(trie, index, src, limit, c, c2, result, uint16_t)
michael@0 361
michael@0 362 /**
michael@0 363 * Get the next code point (c, c2), post-increment src,
michael@0 364 * and get a 32-bit value from the trie.
michael@0 365 *
michael@0 366 * @param trie (const UTrie *, in) a pointer to the runtime trie structure
michael@0 367 * @param src (const UChar *, in/out) the source text pointer
michael@0 368 * @param limit (const UChar *, in) the limit pointer for the text, or NULL
michael@0 369 * @param c (UChar, out) variable for the BMP or lead code unit
michael@0 370 * @param c2 (UChar, out) variable for 0 or the trail code unit
michael@0 371 * @param result (uint32_t, out) uint32_t variable for the trie lookup result
michael@0 372 */
michael@0 373 #define UTRIE_NEXT32(trie, src, limit, c, c2, result) _UTRIE_NEXT(trie, data32, src, limit, c, c2, result, uint32_t)
michael@0 374
michael@0 375 /**
michael@0 376 * Get the previous code point (c, c2), pre-decrement src,
michael@0 377 * and get a 16-bit value from the trie.
michael@0 378 *
michael@0 379 * @param trie (const UTrie *, in) a pointer to the runtime trie structure
michael@0 380 * @param start (const UChar *, in) the start pointer for the text, or NULL
michael@0 381 * @param src (const UChar *, in/out) the source text pointer
michael@0 382 * @param c (UChar, out) variable for the BMP or lead code unit
michael@0 383 * @param c2 (UChar, out) variable for 0 or the trail code unit
michael@0 384 * @param result (uint16_t, out) uint16_t variable for the trie lookup result
michael@0 385 */
michael@0 386 #define UTRIE_PREVIOUS16(trie, start, src, c, c2, result) _UTRIE_PREVIOUS(trie, index, start, src, c, c2, result, uint16_t)
michael@0 387
michael@0 388 /**
michael@0 389 * Get the previous code point (c, c2), pre-decrement src,
michael@0 390 * and get a 32-bit value from the trie.
michael@0 391 *
michael@0 392 * @param trie (const UTrie *, in) a pointer to the runtime trie structure
michael@0 393 * @param start (const UChar *, in) the start pointer for the text, or NULL
michael@0 394 * @param src (const UChar *, in/out) the source text pointer
michael@0 395 * @param c (UChar, out) variable for the BMP or lead code unit
michael@0 396 * @param c2 (UChar, out) variable for 0 or the trail code unit
michael@0 397 * @param result (uint32_t, out) uint32_t variable for the trie lookup result
michael@0 398 */
michael@0 399 #define UTRIE_PREVIOUS32(trie, start, src, c, c2, result) _UTRIE_PREVIOUS(trie, data32, start, src, c, c2, result, uint32_t)
michael@0 400
michael@0 401 /**
michael@0 402 * Get a 16-bit trie value from a pair of surrogates.
michael@0 403 *
michael@0 404 * @param trie (const UTrie *, in) a pointer to the runtime trie structure
michael@0 405 * @param c (UChar, in) a lead surrogate
michael@0 406 * @param c2 (UChar, in) a trail surrogate
michael@0 407 * @param result (uint16_t, out) uint16_t variable for the trie lookup result
michael@0 408 */
michael@0 409 #define UTRIE_GET16_FROM_PAIR(trie, c, c2, result) _UTRIE_GET_FROM_PAIR(trie, index, c, c2, result, uint16_t)
michael@0 410
michael@0 411 /**
michael@0 412 * Get a 32-bit trie value from a pair of surrogates.
michael@0 413 *
michael@0 414 * @param trie (const UTrie *, in) a pointer to the runtime trie structure
michael@0 415 * @param c (UChar, in) a lead surrogate
michael@0 416 * @param c2 (UChar, in) a trail surrogate
michael@0 417 * @param result (uint32_t, out) uint32_t variable for the trie lookup result
michael@0 418 */
michael@0 419 #define UTRIE_GET32_FROM_PAIR(trie, c, c2, result) _UTRIE_GET_FROM_PAIR(trie, data32, c, c2, result, uint32_t)
michael@0 420
michael@0 421 /**
michael@0 422 * Get a 16-bit trie value from a folding offset (from the value of a lead surrogate)
michael@0 423 * and a trail surrogate.
michael@0 424 *
michael@0 425 * @param trie (const UTrie *, in) a pointer to the runtime trie structure
michael@0 426 * @param offset (int32_t, in) the folding offset from the value of a lead surrogate
michael@0 427 * @param c2 (UChar, in) a trail surrogate (only the 10 low bits are significant)
michael@0 428 * @return (uint16_t) trie lookup result
michael@0 429 */
michael@0 430 #define UTRIE_GET16_FROM_OFFSET_TRAIL(trie, offset, c2) _UTRIE_GET_RAW(trie, index, offset, (c2)&0x3ff)
michael@0 431
michael@0 432 /**
michael@0 433 * Get a 32-bit trie value from a folding offset (from the value of a lead surrogate)
michael@0 434 * and a trail surrogate.
michael@0 435 *
michael@0 436 * @param trie (const UTrie *, in) a pointer to the runtime trie structure
michael@0 437 * @param offset (int32_t, in) the folding offset from the value of a lead surrogate
michael@0 438 * @param c2 (UChar, in) a trail surrogate (only the 10 low bits are significant)
michael@0 439 * @return (uint32_t) trie lookup result
michael@0 440 */
michael@0 441 #define UTRIE_GET32_FROM_OFFSET_TRAIL(trie, offset, c2) _UTRIE_GET_RAW(trie, data32, offset, (c2)&0x3ff)
michael@0 442
michael@0 443 /* enumeration callback types */
michael@0 444
michael@0 445 /**
michael@0 446 * Callback from utrie_enum(), extracts a uint32_t value from a
michael@0 447 * trie value. This value will be passed on to the UTrieEnumRange function.
michael@0 448 *
michael@0 449 * @param context an opaque pointer, as passed into utrie_enum()
michael@0 450 * @param value a value from the trie
michael@0 451 * @return the value that is to be passed on to the UTrieEnumRange function
michael@0 452 */
michael@0 453 typedef uint32_t U_CALLCONV
michael@0 454 UTrieEnumValue(const void *context, uint32_t value);
michael@0 455
michael@0 456 /**
michael@0 457 * Callback from utrie_enum(), is called for each contiguous range
michael@0 458 * of code points with the same value as retrieved from the trie and
michael@0 459 * transformed by the UTrieEnumValue function.
michael@0 460 *
michael@0 461 * The callback function can stop the enumeration by returning FALSE.
michael@0 462 *
michael@0 463 * @param context an opaque pointer, as passed into utrie_enum()
michael@0 464 * @param start the first code point in a contiguous range with value
michael@0 465 * @param limit one past the last code point in a contiguous range with value
michael@0 466 * @param value the value that is set for all code points in [start..limit[
michael@0 467 * @return FALSE to stop the enumeration
michael@0 468 */
michael@0 469 typedef UBool U_CALLCONV
michael@0 470 UTrieEnumRange(const void *context, UChar32 start, UChar32 limit, uint32_t value);
michael@0 471
michael@0 472 /**
michael@0 473 * Enumerate efficiently all values in a trie.
michael@0 474 * For each entry in the trie, the value to be delivered is passed through
michael@0 475 * the UTrieEnumValue function.
michael@0 476 * The value is unchanged if that function pointer is NULL.
michael@0 477 *
michael@0 478 * For each contiguous range of code points with a given value,
michael@0 479 * the UTrieEnumRange function is called.
michael@0 480 *
michael@0 481 * @param trie a pointer to the runtime trie structure
michael@0 482 * @param enumValue a pointer to a function that may transform the trie entry value,
michael@0 483 * or NULL if the values from the trie are to be used directly
michael@0 484 * @param enumRange a pointer to a function that is called for each contiguous range
michael@0 485 * of code points with the same value
michael@0 486 * @param context an opaque pointer that is passed on to the callback functions
michael@0 487 */
michael@0 488 U_CAPI void U_EXPORT2
michael@0 489 utrie_enum(const UTrie *trie,
michael@0 490 UTrieEnumValue *enumValue, UTrieEnumRange *enumRange, const void *context);
michael@0 491
michael@0 492 /**
michael@0 493 * Unserialize a trie from 32-bit-aligned memory.
michael@0 494 * Inverse of utrie_serialize().
michael@0 495 * Fills the UTrie runtime trie structure with the settings for the trie data.
michael@0 496 *
michael@0 497 * @param trie a pointer to the runtime trie structure
michael@0 498 * @param data a pointer to 32-bit-aligned memory containing trie data
michael@0 499 * @param length the number of bytes available at data
michael@0 500 * @param pErrorCode an in/out ICU UErrorCode
michael@0 501 * @return the number of bytes at data taken up by the trie data
michael@0 502 */
michael@0 503 U_CAPI int32_t U_EXPORT2
michael@0 504 utrie_unserialize(UTrie *trie, const void *data, int32_t length, UErrorCode *pErrorCode);
michael@0 505
michael@0 506 /**
michael@0 507 * "Unserialize" a dummy trie.
michael@0 508 * A dummy trie is an empty runtime trie, used when a real data trie cannot
michael@0 509 * be loaded.
michael@0 510 *
michael@0 511 * The input memory is filled so that the trie always returns the initialValue,
michael@0 512 * or the leadUnitValue for lead surrogate code points.
michael@0 513 * The Latin-1 part is always set up to be linear.
michael@0 514 *
michael@0 515 * @param trie a pointer to the runtime trie structure
michael@0 516 * @param data a pointer to 32-bit-aligned memory to be filled with the dummy trie data
michael@0 517 * @param length the number of bytes available at data (recommended to use UTRIE_DUMMY_SIZE)
michael@0 518 * @param initialValue the initial value that is set for all code points
michael@0 519 * @param leadUnitValue the value for lead surrogate code _units_ that do not
michael@0 520 * have associated supplementary data
michael@0 521 * @param pErrorCode an in/out ICU UErrorCode
michael@0 522 *
michael@0 523 * @see UTRIE_DUMMY_SIZE
michael@0 524 * @see utrie_open
michael@0 525 */
michael@0 526 U_CAPI int32_t U_EXPORT2
michael@0 527 utrie_unserializeDummy(UTrie *trie,
michael@0 528 void *data, int32_t length,
michael@0 529 uint32_t initialValue, uint32_t leadUnitValue,
michael@0 530 UBool make16BitTrie,
michael@0 531 UErrorCode *pErrorCode);
michael@0 532
michael@0 533 /**
michael@0 534 * Default implementation for UTrie.getFoldingOffset, set automatically by
michael@0 535 * utrie_unserialize().
michael@0 536 * Simply returns the lead surrogate's value itself - which is the inverse
michael@0 537 * of the default folding function used by utrie_serialize().
michael@0 538 * Exported for static const UTrie structures.
michael@0 539 *
michael@0 540 * @see UTrieGetFoldingOffset
michael@0 541 */
michael@0 542 U_CAPI int32_t U_EXPORT2
michael@0 543 utrie_defaultGetFoldingOffset(uint32_t data);
michael@0 544
michael@0 545 /* Building a trie ----------------------------------------------------------*/
michael@0 546
michael@0 547 /**
michael@0 548 * Build-time trie structure.
michael@0 549 * Opaque definition, here only to make fillIn parameters possible
michael@0 550 * for utrie_open() and utrie_clone().
michael@0 551 */
michael@0 552 struct UNewTrie {
michael@0 553 /**
michael@0 554 * Index values at build-time are 32 bits wide for easier processing.
michael@0 555 * Bit 31 is set if the data block is used by multiple index values (from utrie_setRange()).
michael@0 556 */
michael@0 557 int32_t index[UTRIE_MAX_INDEX_LENGTH];
michael@0 558 uint32_t *data;
michael@0 559
michael@0 560 uint32_t leadUnitValue;
michael@0 561 int32_t indexLength, dataCapacity, dataLength;
michael@0 562 UBool isAllocated, isDataAllocated;
michael@0 563 UBool isLatin1Linear, isCompacted;
michael@0 564
michael@0 565 /**
michael@0 566 * Map of adjusted indexes, used in utrie_compact().
michael@0 567 * Maps from original indexes to new ones.
michael@0 568 */
michael@0 569 int32_t map[UTRIE_MAX_BUILD_TIME_DATA_LENGTH>>UTRIE_SHIFT];
michael@0 570 };
michael@0 571
michael@0 572 typedef struct UNewTrie UNewTrie;
michael@0 573
michael@0 574 /**
michael@0 575 * Build-time trie callback function, used with utrie_serialize().
michael@0 576 * This function calculates a lead surrogate's value including a folding offset
michael@0 577 * from the 1024 supplementary code points [start..start+1024[ .
michael@0 578 * It is U+10000 <= start <= U+10fc00 and (start&0x3ff)==0.
michael@0 579 *
michael@0 580 * The folding offset is provided by the caller.
michael@0 581 * It is offset=UTRIE_BMP_INDEX_LENGTH+n*UTRIE_SURROGATE_BLOCK_COUNT with n=0..1023.
michael@0 582 * Instead of the offset itself, n can be stored in 10 bits -
michael@0 583 * or fewer if it can be assumed that few lead surrogates have associated data.
michael@0 584 *
michael@0 585 * The returned value must be
michael@0 586 * - not zero if and only if there is relevant data
michael@0 587 * for the corresponding 1024 supplementary code points
michael@0 588 * - such that UTrie.getFoldingOffset(UNewTrieGetFoldedValue(..., offset))==offset
michael@0 589 *
michael@0 590 * @return a folded value, or 0 if there is no relevant data for the lead surrogate.
michael@0 591 */
michael@0 592 typedef uint32_t U_CALLCONV
michael@0 593 UNewTrieGetFoldedValue(UNewTrie *trie, UChar32 start, int32_t offset);
michael@0 594
michael@0 595 /**
michael@0 596 * Open a build-time trie structure.
michael@0 597 * The size of the build-time data array is specified to avoid allocating a large
michael@0 598 * array in all cases. The array itself can also be passed in.
michael@0 599 *
michael@0 600 * Although the trie is never fully expanded to a linear array, especially when
michael@0 601 * utrie_setRange32() is used, the data array could be large during build time.
michael@0 602 * The maximum length is
michael@0 603 * UTRIE_MAX_BUILD_TIME_DATA_LENGTH=0x110000+UTRIE_DATA_BLOCK_LENGTH+0x400.
michael@0 604 * (Number of Unicode code points + one all-initial-value block +
michael@0 605 * possible duplicate entries for 1024 lead surrogates.)
michael@0 606 * (UTRIE_DATA_BLOCK_LENGTH<=0x200 in all cases.)
michael@0 607 *
michael@0 608 * @param fillIn a pointer to a UNewTrie structure to be initialized (will not be released), or
michael@0 609 * NULL if one is to be allocated
michael@0 610 * @param aliasData a pointer to a data array to be used (will not be released), or
michael@0 611 * NULL if one is to be allocated
michael@0 612 * @param maxDataLength the capacity of aliasData (if not NULL) or
michael@0 613 * the length of the data array to be allocated
michael@0 614 * @param initialValue the initial value that is set for all code points
michael@0 615 * @param leadUnitValue the value for lead surrogate code _units_ that do not
michael@0 616 * have associated supplementary data
michael@0 617 * @param latin1Linear a flag indicating whether the Latin-1 range is to be allocated and
michael@0 618 * kept in a linear, contiguous part of the data array
michael@0 619 * @return a pointer to the initialized fillIn or the allocated and initialized new UNewTrie
michael@0 620 */
michael@0 621 U_CAPI UNewTrie * U_EXPORT2
michael@0 622 utrie_open(UNewTrie *fillIn,
michael@0 623 uint32_t *aliasData, int32_t maxDataLength,
michael@0 624 uint32_t initialValue, uint32_t leadUnitValue,
michael@0 625 UBool latin1Linear);
michael@0 626
michael@0 627 /**
michael@0 628 * Clone a build-time trie structure with all entries.
michael@0 629 *
michael@0 630 * @param fillIn like in utrie_open()
michael@0 631 * @param other the build-time trie structure to clone
michael@0 632 * @param aliasData like in utrie_open(),
michael@0 633 * used if aliasDataLength>=(capacity of other's data array)
michael@0 634 * @param aliasDataLength the length of aliasData
michael@0 635 * @return a pointer to the initialized fillIn or the allocated and initialized new UNewTrie
michael@0 636 */
michael@0 637 U_CAPI UNewTrie * U_EXPORT2
michael@0 638 utrie_clone(UNewTrie *fillIn, const UNewTrie *other, uint32_t *aliasData, int32_t aliasDataLength);
michael@0 639
michael@0 640 /**
michael@0 641 * Close a build-time trie structure, and release memory
michael@0 642 * that was allocated by utrie_open() or utrie_clone().
michael@0 643 *
michael@0 644 * @param trie the build-time trie
michael@0 645 */
michael@0 646 U_CAPI void U_EXPORT2
michael@0 647 utrie_close(UNewTrie *trie);
michael@0 648
michael@0 649 /**
michael@0 650 * Get the data array of a build-time trie.
michael@0 651 * The data may be modified, but entries that are equal before
michael@0 652 * must still be equal after modification.
michael@0 653 *
michael@0 654 * @param trie the build-time trie
michael@0 655 * @param pLength (out) a pointer to a variable that receives the number
michael@0 656 * of entries in the data array
michael@0 657 * @return the data array
michael@0 658 */
michael@0 659 U_CAPI uint32_t * U_EXPORT2
michael@0 660 utrie_getData(UNewTrie *trie, int32_t *pLength);
michael@0 661
michael@0 662 /**
michael@0 663 * Set a value for a code point.
michael@0 664 *
michael@0 665 * @param trie the build-time trie
michael@0 666 * @param c the code point
michael@0 667 * @param value the value
michael@0 668 * @return FALSE if a failure occurred (illegal argument or data array overrun)
michael@0 669 */
michael@0 670 U_CAPI UBool U_EXPORT2
michael@0 671 utrie_set32(UNewTrie *trie, UChar32 c, uint32_t value);
michael@0 672
michael@0 673 /**
michael@0 674 * Get a value from a code point as stored in the build-time trie.
michael@0 675 *
michael@0 676 * @param trie the build-time trie
michael@0 677 * @param c the code point
michael@0 678 * @param pInBlockZero if not NULL, then *pInBlockZero is set to TRUE
michael@0 679 * iff the value is retrieved from block 0;
michael@0 680 * block 0 is the all-initial-value initial block
michael@0 681 * @return the value
michael@0 682 */
michael@0 683 U_CAPI uint32_t U_EXPORT2
michael@0 684 utrie_get32(UNewTrie *trie, UChar32 c, UBool *pInBlockZero);
michael@0 685
michael@0 686 /**
michael@0 687 * Set a value in a range of code points [start..limit[.
michael@0 688 * All code points c with start<=c<limit will get the value if
michael@0 689 * overwrite is TRUE or if the old value is 0.
michael@0 690 *
michael@0 691 * @param trie the build-time trie
michael@0 692 * @param start the first code point to get the value
michael@0 693 * @param limit one past the last code point to get the value
michael@0 694 * @param value the value
michael@0 695 * @param overwrite flag for whether old non-initial values are to be overwritten
michael@0 696 * @return FALSE if a failure occurred (illegal argument or data array overrun)
michael@0 697 */
michael@0 698 U_CAPI UBool U_EXPORT2
michael@0 699 utrie_setRange32(UNewTrie *trie, UChar32 start, UChar32 limit, uint32_t value, UBool overwrite);
michael@0 700
michael@0 701 /**
michael@0 702 * Compact the build-time trie after all values are set, and then
michael@0 703 * serialize it into 32-bit aligned memory.
michael@0 704 *
michael@0 705 * After this, the trie can only be serizalized again and/or closed;
michael@0 706 * no further values can be added.
michael@0 707 *
michael@0 708 * @see utrie_unserialize()
michael@0 709 *
michael@0 710 * @param trie the build-time trie
michael@0 711 * @param data a pointer to 32-bit-aligned memory for the trie data
michael@0 712 * @param capacity the number of bytes available at data
michael@0 713 * @param getFoldedValue a callback function that calculates the value for
michael@0 714 * a lead surrogate from all of its supplementary code points
michael@0 715 * and the folding offset;
michael@0 716 * if NULL, then a default function is used which returns just
michael@0 717 * the input offset when there are any non-initial-value entries
michael@0 718 * @param reduceTo16Bits flag for whether the values are to be reduced to a
michael@0 719 * width of 16 bits for serialization and runtime
michael@0 720 * @param pErrorCode a UErrorCode argument; among other possible error codes:
michael@0 721 * - U_BUFFER_OVERFLOW_ERROR if the data storage block is too small for serialization
michael@0 722 * - U_MEMORY_ALLOCATION_ERROR if the trie data array is too small
michael@0 723 * - U_INDEX_OUTOFBOUNDS_ERROR if the index or data arrays are too long after compaction for serialization
michael@0 724 *
michael@0 725 * @return the number of bytes written for the trie
michael@0 726 */
michael@0 727 U_CAPI int32_t U_EXPORT2
michael@0 728 utrie_serialize(UNewTrie *trie, void *data, int32_t capacity,
michael@0 729 UNewTrieGetFoldedValue *getFoldedValue,
michael@0 730 UBool reduceTo16Bits,
michael@0 731 UErrorCode *pErrorCode);
michael@0 732
michael@0 733 /**
michael@0 734 * Swap a serialized UTrie.
michael@0 735 * @internal
michael@0 736 */
michael@0 737 U_CAPI int32_t U_EXPORT2
michael@0 738 utrie_swap(const UDataSwapper *ds,
michael@0 739 const void *inData, int32_t length, void *outData,
michael@0 740 UErrorCode *pErrorCode);
michael@0 741
michael@0 742 /* serialization ------------------------------------------------------------ */
michael@0 743
michael@0 744 /**
michael@0 745 * Trie data structure in serialized form:
michael@0 746 *
michael@0 747 * UTrieHeader header;
michael@0 748 * uint16_t index[header.indexLength];
michael@0 749 * uint16_t data[header.dataLength];
michael@0 750 * @internal
michael@0 751 */
michael@0 752 typedef struct UTrieHeader {
michael@0 753 /** "Trie" in big-endian US-ASCII (0x54726965) */
michael@0 754 uint32_t signature;
michael@0 755
michael@0 756 /**
michael@0 757 * options bit field:
michael@0 758 * 9 1=Latin-1 data is stored linearly at data+UTRIE_DATA_BLOCK_LENGTH
michael@0 759 * 8 0=16-bit data, 1=32-bit data
michael@0 760 * 7..4 UTRIE_INDEX_SHIFT // 0..UTRIE_SHIFT
michael@0 761 * 3..0 UTRIE_SHIFT // 1..9
michael@0 762 */
michael@0 763 uint32_t options;
michael@0 764
michael@0 765 /** indexLength is a multiple of UTRIE_SURROGATE_BLOCK_COUNT */
michael@0 766 int32_t indexLength;
michael@0 767
michael@0 768 /** dataLength>=UTRIE_DATA_BLOCK_LENGTH */
michael@0 769 int32_t dataLength;
michael@0 770 } UTrieHeader;
michael@0 771
michael@0 772 /**
michael@0 773 * Constants for use with UTrieHeader.options.
michael@0 774 * @internal
michael@0 775 */
michael@0 776 enum {
michael@0 777 /** Mask to get the UTRIE_SHIFT value from options. */
michael@0 778 UTRIE_OPTIONS_SHIFT_MASK=0xf,
michael@0 779
michael@0 780 /** Shift options right this much to get the UTRIE_INDEX_SHIFT value. */
michael@0 781 UTRIE_OPTIONS_INDEX_SHIFT=4,
michael@0 782
michael@0 783 /** If set, then the data (stage 2) array is 32 bits wide. */
michael@0 784 UTRIE_OPTIONS_DATA_IS_32_BIT=0x100,
michael@0 785
michael@0 786 /**
michael@0 787 * If set, then Latin-1 data (for U+0000..U+00ff) is stored in the data (stage 2) array
michael@0 788 * as a simple, linear array at data+UTRIE_DATA_BLOCK_LENGTH.
michael@0 789 */
michael@0 790 UTRIE_OPTIONS_LATIN1_IS_LINEAR=0x200
michael@0 791 };
michael@0 792
michael@0 793 U_CDECL_END
michael@0 794
michael@0 795 #endif

mercurial