intl/icu/source/common/utrie.h

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/common/utrie.h	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,795 @@
     1.4 +/*
     1.5 +******************************************************************************
     1.6 +*
     1.7 +*   Copyright (C) 2001-2011, International Business Machines
     1.8 +*   Corporation and others.  All Rights Reserved.
     1.9 +*
    1.10 +******************************************************************************
    1.11 +*   file name:  utrie.h
    1.12 +*   encoding:   US-ASCII
    1.13 +*   tab size:   8 (not used)
    1.14 +*   indentation:4
    1.15 +*
    1.16 +*   created on: 2001nov08
    1.17 +*   created by: Markus W. Scherer
    1.18 +*/
    1.19 +
    1.20 +#ifndef __UTRIE_H__
    1.21 +#define __UTRIE_H__
    1.22 +
    1.23 +#include "unicode/utypes.h"
    1.24 +#include "unicode/utf16.h"
    1.25 +#include "udataswp.h"
    1.26 +
    1.27 +U_CDECL_BEGIN
    1.28 +
    1.29 +/**
    1.30 + * \file
    1.31 + *
    1.32 + * This is a common implementation of a "folded" trie.
    1.33 + * It is a kind of compressed, serializable table of 16- or 32-bit values associated with
    1.34 + * Unicode code points (0..0x10ffff).
    1.35 + *
    1.36 + * This implementation is optimized for getting values while walking forward
    1.37 + * through a UTF-16 string.
    1.38 + * Therefore, the simplest and fastest access macros are the
    1.39 + * _FROM_LEAD() and _FROM_OFFSET_TRAIL() macros.
    1.40 + *
    1.41 + * The _FROM_BMP() macros are a little more complicated; they get values
    1.42 + * even for lead surrogate code _points_, while the _FROM_LEAD() macros
    1.43 + * get special "folded" values for lead surrogate code _units_ if
    1.44 + * there is relevant data associated with them.
    1.45 + * From such a folded value, an offset needs to be extracted to supply
    1.46 + * to the _FROM_OFFSET_TRAIL() macros.
    1.47 + *
    1.48 + * Most of the more complex (and more convenient) functions/macros call a callback function
    1.49 + * to get that offset from the folded value for a lead surrogate unit.
    1.50 + */
    1.51 +
    1.52 +/**
    1.53 + * Trie constants, defining shift widths, index array lengths, etc.
    1.54 + */
    1.55 +enum {
    1.56 +    /** Shift size for shifting right the input index. 1..9 */
    1.57 +    UTRIE_SHIFT=5,
    1.58 +
    1.59 +    /** Number of data values in a stage 2 (data array) block. 2, 4, 8, .., 0x200 */
    1.60 +    UTRIE_DATA_BLOCK_LENGTH=1<<UTRIE_SHIFT,
    1.61 +
    1.62 +    /** Mask for getting the lower bits from the input index. */
    1.63 +    UTRIE_MASK=UTRIE_DATA_BLOCK_LENGTH-1,
    1.64 +
    1.65 +    /**
    1.66 +     * Lead surrogate code points' index displacement in the index array.
    1.67 +     * 0x10000-0xd800=0x2800
    1.68 +     */
    1.69 +    UTRIE_LEAD_INDEX_DISP=0x2800>>UTRIE_SHIFT,
    1.70 +
    1.71 +    /**
    1.72 +     * Shift size for shifting left the index array values.
    1.73 +     * Increases possible data size with 16-bit index values at the cost
    1.74 +     * of compactability.
    1.75 +     * This requires blocks of stage 2 data to be aligned by UTRIE_DATA_GRANULARITY.
    1.76 +     * 0..UTRIE_SHIFT
    1.77 +     */
    1.78 +    UTRIE_INDEX_SHIFT=2,
    1.79 +
    1.80 +    /** The alignment size of a stage 2 data block. Also the granularity for compaction. */
    1.81 +    UTRIE_DATA_GRANULARITY=1<<UTRIE_INDEX_SHIFT,
    1.82 +
    1.83 +    /** Number of bits of a trail surrogate that are used in index table lookups. */
    1.84 +    UTRIE_SURROGATE_BLOCK_BITS=10-UTRIE_SHIFT,
    1.85 +
    1.86 +    /**
    1.87 +     * Number of index (stage 1) entries per lead surrogate.
    1.88 +     * Same as number of index entries for 1024 trail surrogates,
    1.89 +     * ==0x400>>UTRIE_SHIFT
    1.90 +     */
    1.91 +    UTRIE_SURROGATE_BLOCK_COUNT=(1<<UTRIE_SURROGATE_BLOCK_BITS),
    1.92 +
    1.93 +    /** Length of the BMP portion of the index (stage 1) array. */
    1.94 +    UTRIE_BMP_INDEX_LENGTH=0x10000>>UTRIE_SHIFT
    1.95 +};
    1.96 +
    1.97 +/**
    1.98 + * Length of the index (stage 1) array before folding.
    1.99 + * Maximum number of Unicode code points (0x110000) shifted right by UTRIE_SHIFT.
   1.100 + */
   1.101 +#define UTRIE_MAX_INDEX_LENGTH (0x110000>>UTRIE_SHIFT)
   1.102 +
   1.103 +/**
   1.104 + * Maximum length of the runtime data (stage 2) array.
   1.105 + * Limited by 16-bit index values that are left-shifted by UTRIE_INDEX_SHIFT.
   1.106 + */
   1.107 +#define UTRIE_MAX_DATA_LENGTH (0x10000<<UTRIE_INDEX_SHIFT)
   1.108 +
   1.109 +/**
   1.110 + * Maximum length of the build-time data (stage 2) array.
   1.111 + * The maximum length is 0x110000+UTRIE_DATA_BLOCK_LENGTH+0x400.
   1.112 + * (Number of Unicode code points + one all-initial-value block +
   1.113 + *  possible duplicate entries for 1024 lead surrogates.)
   1.114 + */
   1.115 +#define UTRIE_MAX_BUILD_TIME_DATA_LENGTH (0x110000+UTRIE_DATA_BLOCK_LENGTH+0x400)
   1.116 +
   1.117 +/**
   1.118 + * Number of bytes for a dummy trie.
   1.119 + * A dummy trie is an empty runtime trie, used when a real data trie cannot
   1.120 + * be loaded.
   1.121 + * The number of bytes works for Latin-1-linear tries with 32-bit data
   1.122 + * (worst case).
   1.123 + *
   1.124 + * Calculation:
   1.125 + *   BMP index + 1 index block for lead surrogate code points +
   1.126 + *   Latin-1-linear array + 1 data block for lead surrogate code points
   1.127 + *
   1.128 + * Latin-1: if(UTRIE_SHIFT<=8) { 256 } else { included in first data block }
   1.129 + *
   1.130 + * @see utrie_unserializeDummy
   1.131 + */
   1.132 +#define UTRIE_DUMMY_SIZE ((UTRIE_BMP_INDEX_LENGTH+UTRIE_SURROGATE_BLOCK_COUNT)*2+(UTRIE_SHIFT<=8?256:UTRIE_DATA_BLOCK_LENGTH)*4+UTRIE_DATA_BLOCK_LENGTH*4)
   1.133 +
   1.134 +/**
   1.135 + * Runtime UTrie callback function.
   1.136 + * Extract from a lead surrogate's data the
   1.137 + * index array offset of the indexes for that lead surrogate.
   1.138 + *
   1.139 + * @param data data value for a surrogate from the trie, including the folding offset
   1.140 + * @return offset>=UTRIE_BMP_INDEX_LENGTH, or 0 if there is no data for the lead surrogate
   1.141 + */
   1.142 +typedef int32_t U_CALLCONV
   1.143 +UTrieGetFoldingOffset(uint32_t data);
   1.144 +
   1.145 +/**
   1.146 + * Run-time Trie structure.
   1.147 + *
   1.148 + * Either the data table is 16 bits wide and accessed via the index
   1.149 + * pointer, with each index item increased by indexLength;
   1.150 + * in this case, data32==NULL.
   1.151 + *
   1.152 + * Or the data table is 32 bits wide and accessed via the data32 pointer.
   1.153 + */
   1.154 +struct UTrie {
   1.155 +    const uint16_t *index;
   1.156 +    const uint32_t *data32; /* NULL if 16b data is used via index */
   1.157 +
   1.158 +    /**
   1.159 +     * This function is not used in _FROM_LEAD, _FROM_BMP, and _FROM_OFFSET_TRAIL macros.
   1.160 +     * If convenience macros like _GET16 or _NEXT32 are used, this function must be set.
   1.161 +     *
   1.162 +     * utrie_unserialize() sets a default function which simply returns
   1.163 +     * the lead surrogate's value itself - which is the inverse of the default
   1.164 +     * folding function used by utrie_serialize().
   1.165 +     *
   1.166 +     * @see UTrieGetFoldingOffset
   1.167 +     */
   1.168 +    UTrieGetFoldingOffset *getFoldingOffset;
   1.169 +
   1.170 +    int32_t indexLength, dataLength;
   1.171 +    uint32_t initialValue;
   1.172 +    UBool isLatin1Linear;
   1.173 +};
   1.174 +
   1.175 +#ifndef __UTRIE2_H__
   1.176 +typedef struct UTrie UTrie;
   1.177 +#endif
   1.178 +
   1.179 +/** Internal trie getter from an offset (0 if c16 is a BMP/lead units) and a 16-bit unit */
   1.180 +#define _UTRIE_GET_RAW(trie, data, offset, c16) \
   1.181 +    (trie)->data[ \
   1.182 +        ((int32_t)((trie)->index[(offset)+((c16)>>UTRIE_SHIFT)])<<UTRIE_INDEX_SHIFT)+ \
   1.183 +        ((c16)&UTRIE_MASK) \
   1.184 +    ]
   1.185 +
   1.186 +/** Internal trie getter from a pair of surrogates */
   1.187 +#define _UTRIE_GET_FROM_PAIR(trie, data, c, c2, result, resultType) { \
   1.188 +    int32_t __offset; \
   1.189 +\
   1.190 +    /* get data for lead surrogate */ \
   1.191 +    (result)=_UTRIE_GET_RAW((trie), data, 0, (c)); \
   1.192 +    __offset=(trie)->getFoldingOffset(result); \
   1.193 +\
   1.194 +    /* get the real data from the folded lead/trail units */ \
   1.195 +    if(__offset>0) { \
   1.196 +        (result)=_UTRIE_GET_RAW((trie), data, __offset, (c2)&0x3ff); \
   1.197 +    } else { \
   1.198 +        (result)=(resultType)((trie)->initialValue); \
   1.199 +    } \
   1.200 +}
   1.201 +
   1.202 +/** Internal trie getter from a BMP code point, treating a lead surrogate as a normal code point */
   1.203 +#define _UTRIE_GET_FROM_BMP(trie, data, c16) \
   1.204 +    _UTRIE_GET_RAW(trie, data, 0xd800<=(c16) && (c16)<=0xdbff ? UTRIE_LEAD_INDEX_DISP : 0, c16);
   1.205 +
   1.206 +/**
   1.207 + * Internal trie getter from a code point.
   1.208 + * Could be faster(?) but longer with
   1.209 + *   if((c32)<=0xd7ff) { (result)=_UTRIE_GET_RAW(trie, data, 0, c32); }
   1.210 + */
   1.211 +#define _UTRIE_GET(trie, data, c32, result, resultType) \
   1.212 +    if((uint32_t)(c32)<=0xffff) { \
   1.213 +        /* BMP code points */ \
   1.214 +        (result)=_UTRIE_GET_FROM_BMP(trie, data, c32); \
   1.215 +    } else if((uint32_t)(c32)<=0x10ffff) { \
   1.216 +        /* supplementary code point */ \
   1.217 +        UChar __lead16=U16_LEAD(c32); \
   1.218 +        _UTRIE_GET_FROM_PAIR(trie, data, __lead16, c32, result, resultType); \
   1.219 +    } else { \
   1.220 +        /* out of range */ \
   1.221 +        (result)=(resultType)((trie)->initialValue); \
   1.222 +    }
   1.223 +
   1.224 +/** Internal next-post-increment: get the next code point (c, c2) and its data */
   1.225 +#define _UTRIE_NEXT(trie, data, src, limit, c, c2, result, resultType) { \
   1.226 +    (c)=*(src)++; \
   1.227 +    if(!U16_IS_LEAD(c)) { \
   1.228 +        (c2)=0; \
   1.229 +        (result)=_UTRIE_GET_RAW((trie), data, 0, (c)); \
   1.230 +    } else if((src)!=(limit) && U16_IS_TRAIL((c2)=*(src))) { \
   1.231 +        ++(src); \
   1.232 +        _UTRIE_GET_FROM_PAIR((trie), data, (c), (c2), (result), resultType); \
   1.233 +    } else { \
   1.234 +        /* unpaired lead surrogate code point */ \
   1.235 +        (c2)=0; \
   1.236 +        (result)=_UTRIE_GET_RAW((trie), data, UTRIE_LEAD_INDEX_DISP, (c)); \
   1.237 +    } \
   1.238 +}
   1.239 +
   1.240 +/** Internal previous: get the previous code point (c, c2) and its data */
   1.241 +#define _UTRIE_PREVIOUS(trie, data, start, src, c, c2, result, resultType) { \
   1.242 +    (c)=*--(src); \
   1.243 +    if(!U16_IS_SURROGATE(c)) { \
   1.244 +        (c2)=0; \
   1.245 +        (result)=_UTRIE_GET_RAW((trie), data, 0, (c)); \
   1.246 +    } else if(!U16_IS_SURROGATE_LEAD(c)) { \
   1.247 +        /* trail surrogate */ \
   1.248 +        if((start)!=(src) && U16_IS_LEAD((c2)=*((src)-1))) { \
   1.249 +            --(src); \
   1.250 +            (result)=(c); (c)=(c2); (c2)=(UChar)(result); /* swap c, c2 */ \
   1.251 +            _UTRIE_GET_FROM_PAIR((trie), data, (c), (c2), (result), resultType); \
   1.252 +        } else { \
   1.253 +            /* unpaired trail surrogate code point */ \
   1.254 +            (c2)=0; \
   1.255 +            (result)=_UTRIE_GET_RAW((trie), data, 0, (c)); \
   1.256 +        } \
   1.257 +    } else { \
   1.258 +        /* unpaired lead surrogate code point */ \
   1.259 +        (c2)=0; \
   1.260 +        (result)=_UTRIE_GET_RAW((trie), data, UTRIE_LEAD_INDEX_DISP, (c)); \
   1.261 +    } \
   1.262 +}
   1.263 +
   1.264 +/* Public UTrie API ---------------------------------------------------------*/
   1.265 +
   1.266 +/**
   1.267 + * Get a pointer to the contiguous part of the data array
   1.268 + * for the Latin-1 range (U+0000..U+00ff).
   1.269 + * Must be used only if the Latin-1 range is in fact linear
   1.270 + * (trie->isLatin1Linear).
   1.271 + *
   1.272 + * @param trie (const UTrie *, in) a pointer to the runtime trie structure
   1.273 + * @return (const uint16_t *) pointer to values for Latin-1 code points
   1.274 + */
   1.275 +#define UTRIE_GET16_LATIN1(trie) ((trie)->index+(trie)->indexLength+UTRIE_DATA_BLOCK_LENGTH)
   1.276 +
   1.277 +/**
   1.278 + * Get a pointer to the contiguous part of the data array
   1.279 + * for the Latin-1 range (U+0000..U+00ff).
   1.280 + * Must be used only if the Latin-1 range is in fact linear
   1.281 + * (trie->isLatin1Linear).
   1.282 + *
   1.283 + * @param trie (const UTrie *, in) a pointer to the runtime trie structure
   1.284 + * @return (const uint32_t *) pointer to values for Latin-1 code points
   1.285 + */
   1.286 +#define UTRIE_GET32_LATIN1(trie) ((trie)->data32+UTRIE_DATA_BLOCK_LENGTH)
   1.287 +
   1.288 +/**
   1.289 + * Get a 16-bit trie value from a BMP code point (UChar, <=U+ffff).
   1.290 + * c16 may be a lead surrogate, which may have a value including a folding offset.
   1.291 + *
   1.292 + * @param trie (const UTrie *, in) a pointer to the runtime trie structure
   1.293 + * @param c16 (UChar, in) the input BMP code point
   1.294 + * @return (uint16_t) trie lookup result
   1.295 + */
   1.296 +#define UTRIE_GET16_FROM_LEAD(trie, c16) _UTRIE_GET_RAW(trie, index, 0, c16)
   1.297 +
   1.298 +/**
   1.299 + * Get a 32-bit trie value from a BMP code point (UChar, <=U+ffff).
   1.300 + * c16 may be a lead surrogate, which may have a value including a folding offset.
   1.301 + *
   1.302 + * @param trie (const UTrie *, in) a pointer to the runtime trie structure
   1.303 + * @param c16 (UChar, in) the input BMP code point
   1.304 + * @return (uint32_t) trie lookup result
   1.305 + */
   1.306 +#define UTRIE_GET32_FROM_LEAD(trie, c16) _UTRIE_GET_RAW(trie, data32, 0, c16)
   1.307 +
   1.308 +/**
   1.309 + * Get a 16-bit trie value from a BMP code point (UChar, <=U+ffff).
   1.310 + * Even lead surrogate code points are treated as normal code points,
   1.311 + * with unfolded values that may differ from _FROM_LEAD() macro results for them.
   1.312 + *
   1.313 + * @param trie (const UTrie *, in) a pointer to the runtime trie structure
   1.314 + * @param c16 (UChar, in) the input BMP code point
   1.315 + * @return (uint16_t) trie lookup result
   1.316 + */
   1.317 +#define UTRIE_GET16_FROM_BMP(trie, c16) _UTRIE_GET_FROM_BMP(trie, index, c16)
   1.318 +
   1.319 +/**
   1.320 + * Get a 32-bit trie value from a BMP code point (UChar, <=U+ffff).
   1.321 + * Even lead surrogate code points are treated as normal code points,
   1.322 + * with unfolded values that may differ from _FROM_LEAD() macro results for them.
   1.323 + *
   1.324 + * @param trie (const UTrie *, in) a pointer to the runtime trie structure
   1.325 + * @param c16 (UChar, in) the input BMP code point
   1.326 + * @return (uint32_t) trie lookup result
   1.327 + */
   1.328 +#define UTRIE_GET32_FROM_BMP(trie, c16) _UTRIE_GET_FROM_BMP(trie, data32, c16)
   1.329 +
   1.330 +/**
   1.331 + * Get a 16-bit trie value from a code point.
   1.332 + * Even lead surrogate code points are treated as normal code points,
   1.333 + * with unfolded values that may differ from _FROM_LEAD() macro results for them.
   1.334 + *
   1.335 + * @param trie (const UTrie *, in) a pointer to the runtime trie structure
   1.336 + * @param c32 (UChar32, in) the input code point
   1.337 + * @param result (uint16_t, out) uint16_t variable for the trie lookup result
   1.338 + */
   1.339 +#define UTRIE_GET16(trie, c32, result) _UTRIE_GET(trie, index, c32, result, uint16_t)
   1.340 +
   1.341 +/**
   1.342 + * Get a 32-bit trie value from a code point.
   1.343 + * Even lead surrogate code points are treated as normal code points,
   1.344 + * with unfolded values that may differ from _FROM_LEAD() macro results for them.
   1.345 + *
   1.346 + * @param trie (const UTrie *, in) a pointer to the runtime trie structure
   1.347 + * @param c32 (UChar32, in) the input code point
   1.348 + * @param result (uint32_t, out) uint32_t variable for the trie lookup result
   1.349 + */
   1.350 +#define UTRIE_GET32(trie, c32, result) _UTRIE_GET(trie, data32, c32, result, uint32_t)
   1.351 +
   1.352 +/**
   1.353 + * Get the next code point (c, c2), post-increment src,
   1.354 + * and get a 16-bit value from the trie.
   1.355 + *
   1.356 + * @param trie (const UTrie *, in) a pointer to the runtime trie structure
   1.357 + * @param src (const UChar *, in/out) the source text pointer
   1.358 + * @param limit (const UChar *, in) the limit pointer for the text, or NULL
   1.359 + * @param c (UChar, out) variable for the BMP or lead code unit
   1.360 + * @param c2 (UChar, out) variable for 0 or the trail code unit
   1.361 + * @param result (uint16_t, out) uint16_t variable for the trie lookup result
   1.362 + */
   1.363 +#define UTRIE_NEXT16(trie, src, limit, c, c2, result) _UTRIE_NEXT(trie, index, src, limit, c, c2, result, uint16_t)
   1.364 +
   1.365 +/**
   1.366 + * Get the next code point (c, c2), post-increment src,
   1.367 + * and get a 32-bit value from the trie.
   1.368 + *
   1.369 + * @param trie (const UTrie *, in) a pointer to the runtime trie structure
   1.370 + * @param src (const UChar *, in/out) the source text pointer
   1.371 + * @param limit (const UChar *, in) the limit pointer for the text, or NULL
   1.372 + * @param c (UChar, out) variable for the BMP or lead code unit
   1.373 + * @param c2 (UChar, out) variable for 0 or the trail code unit
   1.374 + * @param result (uint32_t, out) uint32_t variable for the trie lookup result
   1.375 + */
   1.376 +#define UTRIE_NEXT32(trie, src, limit, c, c2, result) _UTRIE_NEXT(trie, data32, src, limit, c, c2, result, uint32_t)
   1.377 +
   1.378 +/**
   1.379 + * Get the previous code point (c, c2), pre-decrement src,
   1.380 + * and get a 16-bit value from the trie.
   1.381 + *
   1.382 + * @param trie (const UTrie *, in) a pointer to the runtime trie structure
   1.383 + * @param start (const UChar *, in) the start pointer for the text, or NULL
   1.384 + * @param src (const UChar *, in/out) the source text pointer
   1.385 + * @param c (UChar, out) variable for the BMP or lead code unit
   1.386 + * @param c2 (UChar, out) variable for 0 or the trail code unit
   1.387 + * @param result (uint16_t, out) uint16_t variable for the trie lookup result
   1.388 + */
   1.389 +#define UTRIE_PREVIOUS16(trie, start, src, c, c2, result) _UTRIE_PREVIOUS(trie, index, start, src, c, c2, result, uint16_t)
   1.390 +
   1.391 +/**
   1.392 + * Get the previous code point (c, c2), pre-decrement src,
   1.393 + * and get a 32-bit value from the trie.
   1.394 + *
   1.395 + * @param trie (const UTrie *, in) a pointer to the runtime trie structure
   1.396 + * @param start (const UChar *, in) the start pointer for the text, or NULL
   1.397 + * @param src (const UChar *, in/out) the source text pointer
   1.398 + * @param c (UChar, out) variable for the BMP or lead code unit
   1.399 + * @param c2 (UChar, out) variable for 0 or the trail code unit
   1.400 + * @param result (uint32_t, out) uint32_t variable for the trie lookup result
   1.401 + */
   1.402 +#define UTRIE_PREVIOUS32(trie, start, src, c, c2, result) _UTRIE_PREVIOUS(trie, data32, start, src, c, c2, result, uint32_t)
   1.403 +
   1.404 +/**
   1.405 + * Get a 16-bit trie value from a pair of surrogates.
   1.406 + *
   1.407 + * @param trie (const UTrie *, in) a pointer to the runtime trie structure
   1.408 + * @param c (UChar, in) a lead surrogate
   1.409 + * @param c2 (UChar, in) a trail surrogate
   1.410 + * @param result (uint16_t, out) uint16_t variable for the trie lookup result
   1.411 + */
   1.412 +#define UTRIE_GET16_FROM_PAIR(trie, c, c2, result) _UTRIE_GET_FROM_PAIR(trie, index, c, c2, result, uint16_t)
   1.413 +
   1.414 +/**
   1.415 + * Get a 32-bit trie value from a pair of surrogates.
   1.416 + *
   1.417 + * @param trie (const UTrie *, in) a pointer to the runtime trie structure
   1.418 + * @param c (UChar, in) a lead surrogate
   1.419 + * @param c2 (UChar, in) a trail surrogate
   1.420 + * @param result (uint32_t, out) uint32_t variable for the trie lookup result
   1.421 + */
   1.422 +#define UTRIE_GET32_FROM_PAIR(trie, c, c2, result) _UTRIE_GET_FROM_PAIR(trie, data32, c, c2, result, uint32_t)
   1.423 +
   1.424 +/**
   1.425 + * Get a 16-bit trie value from a folding offset (from the value of a lead surrogate)
   1.426 + * and a trail surrogate.
   1.427 + *
   1.428 + * @param trie (const UTrie *, in) a pointer to the runtime trie structure
   1.429 + * @param offset (int32_t, in) the folding offset from the value of a lead surrogate
   1.430 + * @param c2 (UChar, in) a trail surrogate (only the 10 low bits are significant)
   1.431 + * @return (uint16_t) trie lookup result
   1.432 + */
   1.433 +#define UTRIE_GET16_FROM_OFFSET_TRAIL(trie, offset, c2) _UTRIE_GET_RAW(trie, index, offset, (c2)&0x3ff)
   1.434 +
   1.435 +/**
   1.436 + * Get a 32-bit trie value from a folding offset (from the value of a lead surrogate)
   1.437 + * and a trail surrogate.
   1.438 + *
   1.439 + * @param trie (const UTrie *, in) a pointer to the runtime trie structure
   1.440 + * @param offset (int32_t, in) the folding offset from the value of a lead surrogate
   1.441 + * @param c2 (UChar, in) a trail surrogate (only the 10 low bits are significant)
   1.442 + * @return (uint32_t) trie lookup result
   1.443 + */
   1.444 +#define UTRIE_GET32_FROM_OFFSET_TRAIL(trie, offset, c2) _UTRIE_GET_RAW(trie, data32, offset, (c2)&0x3ff)
   1.445 +
   1.446 +/* enumeration callback types */
   1.447 +
   1.448 +/**
   1.449 + * Callback from utrie_enum(), extracts a uint32_t value from a
   1.450 + * trie value. This value will be passed on to the UTrieEnumRange function.
   1.451 + *
   1.452 + * @param context an opaque pointer, as passed into utrie_enum()
   1.453 + * @param value a value from the trie
   1.454 + * @return the value that is to be passed on to the UTrieEnumRange function
   1.455 + */
   1.456 +typedef uint32_t U_CALLCONV
   1.457 +UTrieEnumValue(const void *context, uint32_t value);
   1.458 +
   1.459 +/**
   1.460 + * Callback from utrie_enum(), is called for each contiguous range
   1.461 + * of code points with the same value as retrieved from the trie and
   1.462 + * transformed by the UTrieEnumValue function.
   1.463 + *
   1.464 + * The callback function can stop the enumeration by returning FALSE.
   1.465 + *
   1.466 + * @param context an opaque pointer, as passed into utrie_enum()
   1.467 + * @param start the first code point in a contiguous range with value
   1.468 + * @param limit one past the last code point in a contiguous range with value
   1.469 + * @param value the value that is set for all code points in [start..limit[
   1.470 + * @return FALSE to stop the enumeration
   1.471 + */
   1.472 +typedef UBool U_CALLCONV
   1.473 +UTrieEnumRange(const void *context, UChar32 start, UChar32 limit, uint32_t value);
   1.474 +
   1.475 +/**
   1.476 + * Enumerate efficiently all values in a trie.
   1.477 + * For each entry in the trie, the value to be delivered is passed through
   1.478 + * the UTrieEnumValue function.
   1.479 + * The value is unchanged if that function pointer is NULL.
   1.480 + *
   1.481 + * For each contiguous range of code points with a given value,
   1.482 + * the UTrieEnumRange function is called.
   1.483 + *
   1.484 + * @param trie a pointer to the runtime trie structure
   1.485 + * @param enumValue a pointer to a function that may transform the trie entry value,
   1.486 + *                  or NULL if the values from the trie are to be used directly
   1.487 + * @param enumRange a pointer to a function that is called for each contiguous range
   1.488 + *                  of code points with the same value
   1.489 + * @param context an opaque pointer that is passed on to the callback functions
   1.490 + */
   1.491 +U_CAPI void U_EXPORT2
   1.492 +utrie_enum(const UTrie *trie,
   1.493 +           UTrieEnumValue *enumValue, UTrieEnumRange *enumRange, const void *context);
   1.494 +
   1.495 +/**
   1.496 + * Unserialize a trie from 32-bit-aligned memory.
   1.497 + * Inverse of utrie_serialize().
   1.498 + * Fills the UTrie runtime trie structure with the settings for the trie data.
   1.499 + *
   1.500 + * @param trie a pointer to the runtime trie structure
   1.501 + * @param data a pointer to 32-bit-aligned memory containing trie data
   1.502 + * @param length the number of bytes available at data
   1.503 + * @param pErrorCode an in/out ICU UErrorCode
   1.504 + * @return the number of bytes at data taken up by the trie data
   1.505 + */
   1.506 +U_CAPI int32_t U_EXPORT2
   1.507 +utrie_unserialize(UTrie *trie, const void *data, int32_t length, UErrorCode *pErrorCode);
   1.508 +
   1.509 +/**
   1.510 + * "Unserialize" a dummy trie.
   1.511 + * A dummy trie is an empty runtime trie, used when a real data trie cannot
   1.512 + * be loaded.
   1.513 + *
   1.514 + * The input memory is filled so that the trie always returns the initialValue,
   1.515 + * or the leadUnitValue for lead surrogate code points.
   1.516 + * The Latin-1 part is always set up to be linear.
   1.517 + *
   1.518 + * @param trie a pointer to the runtime trie structure
   1.519 + * @param data a pointer to 32-bit-aligned memory to be filled with the dummy trie data
   1.520 + * @param length the number of bytes available at data (recommended to use UTRIE_DUMMY_SIZE)
   1.521 + * @param initialValue the initial value that is set for all code points
   1.522 + * @param leadUnitValue the value for lead surrogate code _units_ that do not
   1.523 + *                      have associated supplementary data
   1.524 + * @param pErrorCode an in/out ICU UErrorCode
   1.525 + *
   1.526 + * @see UTRIE_DUMMY_SIZE
   1.527 + * @see utrie_open
   1.528 + */
   1.529 +U_CAPI int32_t U_EXPORT2
   1.530 +utrie_unserializeDummy(UTrie *trie,
   1.531 +                       void *data, int32_t length,
   1.532 +                       uint32_t initialValue, uint32_t leadUnitValue,
   1.533 +                       UBool make16BitTrie,
   1.534 +                       UErrorCode *pErrorCode);
   1.535 +
   1.536 +/**
   1.537 + * Default implementation for UTrie.getFoldingOffset, set automatically by
   1.538 + * utrie_unserialize().
   1.539 + * Simply returns the lead surrogate's value itself - which is the inverse
   1.540 + * of the default folding function used by utrie_serialize().
   1.541 + * Exported for static const UTrie structures.
   1.542 + *
   1.543 + * @see UTrieGetFoldingOffset
   1.544 + */
   1.545 +U_CAPI int32_t U_EXPORT2
   1.546 +utrie_defaultGetFoldingOffset(uint32_t data);
   1.547 +
   1.548 +/* Building a trie ----------------------------------------------------------*/
   1.549 +
   1.550 +/**
   1.551 + * Build-time trie structure.
   1.552 + * Opaque definition, here only to make fillIn parameters possible
   1.553 + * for utrie_open() and utrie_clone().
   1.554 + */
   1.555 +struct UNewTrie {
   1.556 +    /**
   1.557 +     * Index values at build-time are 32 bits wide for easier processing.
   1.558 +     * Bit 31 is set if the data block is used by multiple index values (from utrie_setRange()).
   1.559 +     */
   1.560 +    int32_t index[UTRIE_MAX_INDEX_LENGTH];
   1.561 +    uint32_t *data;
   1.562 +
   1.563 +    uint32_t leadUnitValue;
   1.564 +    int32_t indexLength, dataCapacity, dataLength;
   1.565 +    UBool isAllocated, isDataAllocated;
   1.566 +    UBool isLatin1Linear, isCompacted;
   1.567 +
   1.568 +    /**
   1.569 +     * Map of adjusted indexes, used in utrie_compact().
   1.570 +     * Maps from original indexes to new ones.
   1.571 +     */
   1.572 +    int32_t map[UTRIE_MAX_BUILD_TIME_DATA_LENGTH>>UTRIE_SHIFT];
   1.573 +};
   1.574 +
   1.575 +typedef struct UNewTrie UNewTrie;
   1.576 +
   1.577 +/**
   1.578 + * Build-time trie callback function, used with utrie_serialize().
   1.579 + * This function calculates a lead surrogate's value including a folding offset
   1.580 + * from the 1024 supplementary code points [start..start+1024[ .
   1.581 + * It is U+10000 <= start <= U+10fc00 and (start&0x3ff)==0.
   1.582 + *
   1.583 + * The folding offset is provided by the caller.
   1.584 + * It is offset=UTRIE_BMP_INDEX_LENGTH+n*UTRIE_SURROGATE_BLOCK_COUNT with n=0..1023.
   1.585 + * Instead of the offset itself, n can be stored in 10 bits -
   1.586 + * or fewer if it can be assumed that few lead surrogates have associated data.
   1.587 + *
   1.588 + * The returned value must be
   1.589 + * - not zero if and only if there is relevant data
   1.590 + *   for the corresponding 1024 supplementary code points
   1.591 + * - such that UTrie.getFoldingOffset(UNewTrieGetFoldedValue(..., offset))==offset
   1.592 + *
   1.593 + * @return a folded value, or 0 if there is no relevant data for the lead surrogate.
   1.594 + */
   1.595 +typedef uint32_t U_CALLCONV
   1.596 +UNewTrieGetFoldedValue(UNewTrie *trie, UChar32 start, int32_t offset);
   1.597 +
   1.598 +/**
   1.599 + * Open a build-time trie structure.
   1.600 + * The size of the build-time data array is specified to avoid allocating a large
   1.601 + * array in all cases. The array itself can also be passed in.
   1.602 + *
   1.603 + * Although the trie is never fully expanded to a linear array, especially when
   1.604 + * utrie_setRange32() is used, the data array could be large during build time.
   1.605 + * The maximum length is
   1.606 + * UTRIE_MAX_BUILD_TIME_DATA_LENGTH=0x110000+UTRIE_DATA_BLOCK_LENGTH+0x400.
   1.607 + * (Number of Unicode code points + one all-initial-value block +
   1.608 + *  possible duplicate entries for 1024 lead surrogates.)
   1.609 + * (UTRIE_DATA_BLOCK_LENGTH<=0x200 in all cases.)
   1.610 + *
   1.611 + * @param fillIn a pointer to a UNewTrie structure to be initialized (will not be released), or
   1.612 + *               NULL if one is to be allocated
   1.613 + * @param aliasData a pointer to a data array to be used (will not be released), or
   1.614 + *                  NULL if one is to be allocated
   1.615 + * @param maxDataLength the capacity of aliasData (if not NULL) or
   1.616 + *                      the length of the data array to be allocated
   1.617 + * @param initialValue the initial value that is set for all code points
   1.618 + * @param leadUnitValue the value for lead surrogate code _units_ that do not
   1.619 + *                      have associated supplementary data
   1.620 + * @param latin1Linear a flag indicating whether the Latin-1 range is to be allocated and
   1.621 + *                     kept in a linear, contiguous part of the data array
   1.622 + * @return a pointer to the initialized fillIn or the allocated and initialized new UNewTrie
   1.623 + */
   1.624 +U_CAPI UNewTrie * U_EXPORT2
   1.625 +utrie_open(UNewTrie *fillIn,
   1.626 +           uint32_t *aliasData, int32_t maxDataLength,
   1.627 +           uint32_t initialValue, uint32_t leadUnitValue,
   1.628 +           UBool latin1Linear);
   1.629 +
   1.630 +/**
   1.631 + * Clone a build-time trie structure with all entries.
   1.632 + *
   1.633 + * @param fillIn like in utrie_open()
   1.634 + * @param other the build-time trie structure to clone
   1.635 + * @param aliasData like in utrie_open(),
   1.636 + *                  used if aliasDataLength>=(capacity of other's data array)
   1.637 + * @param aliasDataLength the length of aliasData
   1.638 + * @return a pointer to the initialized fillIn or the allocated and initialized new UNewTrie
   1.639 + */
   1.640 +U_CAPI UNewTrie * U_EXPORT2
   1.641 +utrie_clone(UNewTrie *fillIn, const UNewTrie *other, uint32_t *aliasData, int32_t aliasDataLength);
   1.642 +
   1.643 +/**
   1.644 + * Close a build-time trie structure, and release memory
   1.645 + * that was allocated by utrie_open() or utrie_clone().
   1.646 + *
   1.647 + * @param trie the build-time trie
   1.648 + */
   1.649 +U_CAPI void U_EXPORT2
   1.650 +utrie_close(UNewTrie *trie);
   1.651 +
   1.652 +/**
   1.653 + * Get the data array of a build-time trie.
   1.654 + * The data may be modified, but entries that are equal before
   1.655 + * must still be equal after modification.
   1.656 + *
   1.657 + * @param trie the build-time trie
   1.658 + * @param pLength (out) a pointer to a variable that receives the number
   1.659 + *                of entries in the data array
   1.660 + * @return the data array
   1.661 + */
   1.662 +U_CAPI uint32_t * U_EXPORT2
   1.663 +utrie_getData(UNewTrie *trie, int32_t *pLength);
   1.664 +
   1.665 +/**
   1.666 + * Set a value for a code point.
   1.667 + *
   1.668 + * @param trie the build-time trie
   1.669 + * @param c the code point
   1.670 + * @param value the value
   1.671 + * @return FALSE if a failure occurred (illegal argument or data array overrun)
   1.672 + */
   1.673 +U_CAPI UBool U_EXPORT2
   1.674 +utrie_set32(UNewTrie *trie, UChar32 c, uint32_t value);
   1.675 +
   1.676 +/**
   1.677 + * Get a value from a code point as stored in the build-time trie.
   1.678 + *
   1.679 + * @param trie the build-time trie
   1.680 + * @param c the code point
   1.681 + * @param pInBlockZero if not NULL, then *pInBlockZero is set to TRUE
   1.682 + *                     iff the value is retrieved from block 0;
   1.683 + *                     block 0 is the all-initial-value initial block
   1.684 + * @return the value
   1.685 + */
   1.686 +U_CAPI uint32_t U_EXPORT2
   1.687 +utrie_get32(UNewTrie *trie, UChar32 c, UBool *pInBlockZero);
   1.688 +
   1.689 +/**
   1.690 + * Set a value in a range of code points [start..limit[.
   1.691 + * All code points c with start<=c<limit will get the value if
   1.692 + * overwrite is TRUE or if the old value is 0.
   1.693 + *
   1.694 + * @param trie the build-time trie
   1.695 + * @param start the first code point to get the value
   1.696 + * @param limit one past the last code point to get the value
   1.697 + * @param value the value
   1.698 + * @param overwrite flag for whether old non-initial values are to be overwritten
   1.699 + * @return FALSE if a failure occurred (illegal argument or data array overrun)
   1.700 + */
   1.701 +U_CAPI UBool U_EXPORT2
   1.702 +utrie_setRange32(UNewTrie *trie, UChar32 start, UChar32 limit, uint32_t value, UBool overwrite);
   1.703 +
   1.704 +/**
   1.705 + * Compact the build-time trie after all values are set, and then
   1.706 + * serialize it into 32-bit aligned memory.
   1.707 + *
   1.708 + * After this, the trie can only be serizalized again and/or closed;
   1.709 + * no further values can be added.
   1.710 + *
   1.711 + * @see utrie_unserialize()
   1.712 + *
   1.713 + * @param trie the build-time trie
   1.714 + * @param data a pointer to 32-bit-aligned memory for the trie data
   1.715 + * @param capacity the number of bytes available at data
   1.716 + * @param getFoldedValue a callback function that calculates the value for
   1.717 + *                       a lead surrogate from all of its supplementary code points
   1.718 + *                       and the folding offset;
   1.719 + *                       if NULL, then a default function is used which returns just
   1.720 + *                       the input offset when there are any non-initial-value entries
   1.721 + * @param reduceTo16Bits flag for whether the values are to be reduced to a
   1.722 + *                       width of 16 bits for serialization and runtime
   1.723 + * @param pErrorCode a UErrorCode argument; among other possible error codes:
   1.724 + * - U_BUFFER_OVERFLOW_ERROR if the data storage block is too small for serialization
   1.725 + * - U_MEMORY_ALLOCATION_ERROR if the trie data array is too small
   1.726 + * - U_INDEX_OUTOFBOUNDS_ERROR if the index or data arrays are too long after compaction for serialization
   1.727 + *
   1.728 + * @return the number of bytes written for the trie
   1.729 + */
   1.730 +U_CAPI int32_t U_EXPORT2
   1.731 +utrie_serialize(UNewTrie *trie, void *data, int32_t capacity,
   1.732 +                UNewTrieGetFoldedValue *getFoldedValue,
   1.733 +                UBool reduceTo16Bits,
   1.734 +                UErrorCode *pErrorCode);
   1.735 +
   1.736 +/**
   1.737 + * Swap a serialized UTrie.
   1.738 + * @internal
   1.739 + */
   1.740 +U_CAPI int32_t U_EXPORT2
   1.741 +utrie_swap(const UDataSwapper *ds,
   1.742 +           const void *inData, int32_t length, void *outData,
   1.743 +           UErrorCode *pErrorCode);
   1.744 +
   1.745 +/* serialization ------------------------------------------------------------ */
   1.746 +
   1.747 +/**
   1.748 + * Trie data structure in serialized form:
   1.749 + *
   1.750 + * UTrieHeader header;
   1.751 + * uint16_t index[header.indexLength];
   1.752 + * uint16_t data[header.dataLength];
   1.753 + * @internal
   1.754 + */
   1.755 +typedef struct UTrieHeader {
   1.756 +    /** "Trie" in big-endian US-ASCII (0x54726965) */
   1.757 +    uint32_t signature;
   1.758 +
   1.759 +    /**
   1.760 +     * options bit field:
   1.761 +     *     9    1=Latin-1 data is stored linearly at data+UTRIE_DATA_BLOCK_LENGTH
   1.762 +     *     8    0=16-bit data, 1=32-bit data
   1.763 +     *  7..4    UTRIE_INDEX_SHIFT   // 0..UTRIE_SHIFT
   1.764 +     *  3..0    UTRIE_SHIFT         // 1..9
   1.765 +     */
   1.766 +    uint32_t options;
   1.767 +
   1.768 +    /** indexLength is a multiple of UTRIE_SURROGATE_BLOCK_COUNT */
   1.769 +    int32_t indexLength;
   1.770 +
   1.771 +    /** dataLength>=UTRIE_DATA_BLOCK_LENGTH */
   1.772 +    int32_t dataLength;
   1.773 +} UTrieHeader;
   1.774 +
   1.775 +/**
   1.776 + * Constants for use with UTrieHeader.options.
   1.777 + * @internal
   1.778 + */
   1.779 +enum {
   1.780 +    /** Mask to get the UTRIE_SHIFT value from options. */
   1.781 +    UTRIE_OPTIONS_SHIFT_MASK=0xf,
   1.782 +
   1.783 +    /** Shift options right this much to get the UTRIE_INDEX_SHIFT value. */
   1.784 +    UTRIE_OPTIONS_INDEX_SHIFT=4,
   1.785 +
   1.786 +    /** If set, then the data (stage 2) array is 32 bits wide. */
   1.787 +    UTRIE_OPTIONS_DATA_IS_32_BIT=0x100,
   1.788 +
   1.789 +    /**
   1.790 +     * If set, then Latin-1 data (for U+0000..U+00ff) is stored in the data (stage 2) array
   1.791 +     * as a simple, linear array at data+UTRIE_DATA_BLOCK_LENGTH.
   1.792 +     */
   1.793 +    UTRIE_OPTIONS_LATIN1_IS_LINEAR=0x200
   1.794 +};
   1.795 +
   1.796 +U_CDECL_END
   1.797 +
   1.798 +#endif

mercurial