1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/i18n/ucol_imp.h Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,1093 @@ 1.4 +/* 1.5 +******************************************************************************* 1.6 +* 1.7 +* Copyright (C) 1998-2013, International Business Machines 1.8 +* Corporation and others. All Rights Reserved. 1.9 +* 1.10 +******************************************************************************* 1.11 +* 1.12 +* Private implementation header for C collation 1.13 +* file name: ucol_imp.h 1.14 +* encoding: US-ASCII 1.15 +* tab size: 8 (not used) 1.16 +* indentation:4 1.17 +* 1.18 +* created on: 2000dec11 1.19 +* created by: Vladimir Weinstein 1.20 +* 1.21 +* Modification history 1.22 +* Date Name Comments 1.23 +* 02/16/2001 synwee Added UCOL_GETPREVCE for the use in ucoleitr 1.24 +* 02/27/2001 synwee Added getMaxExpansion data structure in UCollator 1.25 +* 03/02/2001 synwee Added UCOL_IMPLICIT_CE 1.26 +* 03/12/2001 synwee Added pointer start to collIterate. 1.27 +*/ 1.28 + 1.29 +#ifndef UCOL_IMP_H 1.30 +#define UCOL_IMP_H 1.31 + 1.32 +#include "unicode/utypes.h" 1.33 +#ifdef __cplusplus 1.34 +# include "unicode/utf16.h" 1.35 +#endif 1.36 + 1.37 +#define UCA_DATA_TYPE "icu" 1.38 +#define UCA_DATA_NAME "ucadata" 1.39 +#define INVC_DATA_TYPE "icu" 1.40 +#define INVC_DATA_NAME "invuca" 1.41 + 1.42 +/** 1.43 + * Convenience string denoting the Collation data tree 1.44 + * @internal ICU 3.0 1.45 + */ 1.46 +#define U_ICUDATA_COLL U_ICUDATA_NAME U_TREE_SEPARATOR_STRING "coll" 1.47 + 1.48 +#if !UCONFIG_NO_COLLATION 1.49 + 1.50 +#ifdef __cplusplus 1.51 +#include "unicode/normalizer2.h" 1.52 +#include "unicode/unistr.h" 1.53 +#endif 1.54 +#include "unicode/ucol.h" 1.55 +#include "ucol_data.h" 1.56 +#include "utrie.h" 1.57 +#include "cmemory.h" 1.58 + 1.59 +/* This is the internal header file which contains important declarations for 1.60 + * the collation framework. 
1.61 + * Ready-to-use collators are stored as binary images. Both UCA and tailorings 1.62 + * share the same binary format. Individual files (currently only UCA) have a 1.63 + * udata header in front of the image and should be opened using udata_open. 1.64 + * Tailoring images are currently stored inside resource bundles and are initialized 1.65 + * through the ucol_open API. 1.66 + * 1.67 + * The following describes the formats for collation binaries 1.68 + * (UCA & tailorings) and for the inverse UCA table. 1.69 + * Substructures are described in the collation design document at 1.70 + * http://source.icu-project.org/repos/icu/icuhtml/trunk/design/collation/ICU_collation_design.htm 1.71 + * 1.72 + * ------------------------------------------------------------- 1.73 + * 1.74 + * Here is the format of binary collation image. 1.75 + * 1.76 + * Physical order of structures: 1.77 + * - header (UCATableHeader) 1.78 + * - options (UColOptionSet) 1.79 + * - expansions (CE[]) 1.80 + * - contractions (UChar[contractionSize] + CE[contractionSize]) 1.81 + * - serialized UTrie with mappings of code points to CEs 1.82 + * - max expansion tables (CE[endExpansionCECount] + uint8_t[endExpansionCECount]) 1.83 + * - two bit sets for backward processing in strcoll (identical prefixes) 1.84 + * and for backward CE iteration (each set is uint8_t[UCOL_UNSAFECP_TABLE_SIZE]) 1.85 + * - UCA constants (UCAConstants) 1.86 + * - UCA contractions (UChar[contractionUCACombosSize][contractionUCACombosWidth]) 1.87 + * 1.88 + * UCATableHeader fields: 1.89 + * 1.90 + * int32_t size; - image size in bytes 1.91 + * 1.92 + * Offsets to interesting data. All offsets are in bytes. 1.93 + * To get the address, add the offset to the header address and cast properly. 1.94 + * Some offsets are zero if the corresponding structures are empty. 1.95 + * 1.96 + * Tailoring binaries that only set options and contain no mappings etc.
1.97 + * will have all offsets 0 except for the options and expansion offsets, 1.98 + * which give the position and length of the options array. 1.99 + * 1.100 + * uint32_t options; - offset to default collator options (UColOptionSet *), 1.101 + * a set of 32-bit values. See declaration of UColOptionSet for more details 1.102 + * 1.103 + * uint32_t UCAConsts; - only used (!=0) in UCA image - structure which holds values for indirect positioning and implicit ranges 1.104 + * See declaration of UCAConstants structure. This is a set of unsigned 32-bit values used to store 1.105 + * important constant values that are defined in the UCA and used for building and runtime. 1.106 + * 1.107 + * uint32_t contractionUCACombos; - only used (!=0) in UCA image - list of UCA contractions. This is a zero-terminated array of UChar[contractionUCACombosWidth], 1.108 + * containing contractions from the UCA. These are needed in the build process to copy UCA contractions 1.109 + * in case the base contraction symbol is tailored. 1.110 + * 1.111 + * uint32_t magic; - must contain UCOL_HEADER_MAGIC (formatVersion 2.3) 1.112 + * 1.113 + * uint32_t mappingPosition; - offset to UTrie (const uint8_t *mappingPosition). This is a serialized UTrie and should be treated as such. 1.114 + * Used as a primary lookup table for collation elements. 1.115 + * 1.116 + * uint32_t expansion; - offset to expansion table (uint32_t *expansion). This is an array of expansion CEs. Never 0. 1.117 + * 1.118 + * uint32_t contractionIndex; - offset to contraction table (UChar *contractionIndex). Used to look up contraction sequences. Contents 1.119 + * are aligned with the contents of contractionCEs table. 0 if no contractions. 1.120 + * 1.121 + * uint32_t contractionCEs; - offset to resulting contraction CEs (uint32_t *contractionCEs). When a contraction is resolved in the 1.122 + * contractionIndex table, the resulting index is used to look up the corresponding CE in this table.
1.123 + * 0 if no contractions. 1.124 + * uint32_t contractionSize; - size of contraction table in elements (both Index and CEs). 1.125 + * 1.126 + * Tables described below are used for Boyer-Moore searching algorithm - they define the size of longest expansion 1.127 + * and last CEs in expansions. 1.128 + * uint32_t endExpansionCE; - offset to array of last collation element in expansion (uint32_t *). 1.129 + * Never 0. 1.130 + * uint32_t expansionCESize; - array of maximum expansion sizes (uint8_t *) 1.131 + * int32_t endExpansionCECount; - size of endExpansionCE. See UCOL_GETMAXEXPANSION 1.132 + * for the usage model 1.133 + * 1.134 + * These two offsets point to byte tables that are used in the backup heuristics. 1.135 + * uint32_t unsafeCP; - hash table of unsafe code points (uint8_t *). See ucol_unsafeCP function. 1.136 + * uint32_t contrEndCP; - hash table of final code points in contractions (uint8_t *). See ucol_contractionEndCP. 1.137 + * 1.138 + * int32_t contractionUCACombosSize; - number of UChar[contractionUCACombosWidth] in contractionUCACombos 1.139 + * (formatVersion 2.3) 1.140 + * UBool jamoSpecial; - Jamo special indicator (uint8_t). If TRUE, Jamos are special, so we cannot use simple Hangul decomposition. 
1.141 + * UBool isBigEndian; - endianness of this collation binary (formatVersion 2.3) 1.142 + * uint8_t charSetFamily; - charset family of this collation binary (formatVersion 2.3) 1.143 + * uint8_t contractionUCACombosWidth; - number of UChars per UCA contraction in contractionUCACombos (formatVersion 2.3) 1.144 + * 1.145 + * Various version fields 1.146 + * UVersionInfo version; - version 4 uint8_t 1.147 + * UVersionInfo UCAVersion; - version 4 uint8_t 1.148 + * UVersionInfo UCDVersion; - version 4 uint8_t 1.149 + * UVersionInfo formatVersion; - version of the format of the collation binary 1.150 + * same formatVersion as in ucadata.icu's UDataInfo header 1.151 + * (formatVersion 2.3) 1.152 + * 1.153 + * uint32_t offset to the reordering code to lead CE byte remapping table 1.154 + * uint32_t offset to the lead CE byte to reordering code mapping table 1.155 + * 1.156 + * uint8_t reserved[76]; - currently unused 1.157 + * 1.158 + * ------------------------------------------------------------- 1.159 + * 1.160 + * Inverse UCA is used for constructing collators from rules. It is always an individual file 1.161 + * and always has a UDataInfo header. 1.162 + * Here is the structure: 1.163 + * 1.164 + * uint32_t byteSize; - size of inverse UCA image in bytes 1.165 + * uint32_t tableSize; - length of inverse table (number of uint32_t[3] rows) 1.166 + * uint32_t contsSize; - size of continuation table (number of UChars in table) 1.167 + * 1.168 + * uint32_t table; - offset to inverse table (uint32_t *) 1.169 + * Inverse table consists of rows of 3 uint32_t values. The first two values are a CE and a possible continuation; 1.170 + * the third value is either a code unit (if there is only one code unit for the element) or an index into the continuation 1.171 + * table (number of code units combined with an index). 1.172 + *
If more than one codepoint have the same CE, continuation table contains code units separated by FFFF and final 1.173 + * code unit sequence for a CE is terminated by FFFE. 1.174 + * uint32_t conts; - offset to continuation table (uint16_t *). Contains code units that transform to a same CE. 1.175 + * 1.176 + * UVersionInfo UCAVersion; - version of the UCA, read from file 4 uint8_t 1.177 + * uint8_t padding[8]; - padding 8 uint8_t 1.178 + * Header is followed by the table and continuation table. 1.179 +*/ 1.180 + 1.181 +/* definition of UCOL_HEADER_MAGIC moved to common/ucol_data.h */ 1.182 + 1.183 +/* UDataInfo for UCA mapping table */ 1.184 +/* dataFormat="UCol" */ 1.185 +#define UCA_DATA_FORMAT_0 ((uint8_t)0x55) 1.186 +#define UCA_DATA_FORMAT_1 ((uint8_t)0x43) 1.187 +#define UCA_DATA_FORMAT_2 ((uint8_t)0x6f) 1.188 +#define UCA_DATA_FORMAT_3 ((uint8_t)0x6c) 1.189 + 1.190 +#define UCA_FORMAT_VERSION_0 ((uint8_t)3) 1.191 +#define UCA_FORMAT_VERSION_1 0 1.192 +#define UCA_FORMAT_VERSION_2 ((uint8_t)0) 1.193 +#define UCA_FORMAT_VERSION_3 ((uint8_t)0) 1.194 + 1.195 +/* UDataInfo for inverse UCA table */ 1.196 +/* dataFormat="InvC" */ 1.197 +#define INVUCA_DATA_FORMAT_0 ((uint8_t)0x49) 1.198 +#define INVUCA_DATA_FORMAT_1 ((uint8_t)0x6E) 1.199 +#define INVUCA_DATA_FORMAT_2 ((uint8_t)0x76) 1.200 +#define INVUCA_DATA_FORMAT_3 ((uint8_t)0x43) 1.201 + 1.202 +#define INVUCA_FORMAT_VERSION_0 ((uint8_t)2) 1.203 +#define INVUCA_FORMAT_VERSION_1 ((uint8_t)1) 1.204 +#define INVUCA_FORMAT_VERSION_2 ((uint8_t)0) 1.205 +#define INVUCA_FORMAT_VERSION_3 ((uint8_t)0) 1.206 + 1.207 +/* This is the size of the stack allocated buffer for sortkey generation and similar operations */ 1.208 +/* if it is too small, heap allocation will occur.*/ 1.209 +/* you can change this value if you need memory - it will affect the performance, though, since we're going to malloc */ 1.210 +#define UCOL_MAX_BUFFER 128 1.211 + 1.212 +#define UCOL_NORMALIZATION_GROWTH 2 1.213 +#define 
UCOL_NORMALIZATION_MAX_BUFFER UCOL_MAX_BUFFER*UCOL_NORMALIZATION_GROWTH 1.214 + 1.215 +/* This writable buffer is used if we encounter Thai and need to reorder the string on the fly */ 1.216 +/* Sometimes we already have a writable buffer (like in case of normalized strings). */ 1.217 +/* 1.218 +you can change this value to any value >= 4 if you need memory - 1.219 +it will affect the performance, though, since we're going to malloc. 1.220 +Note 3 is the minimum value for Thai collation and 4 is the 1.221 +minimum number for special Jamo 1.222 +*/ 1.223 +#define UCOL_WRITABLE_BUFFER_SIZE 256 1.224 + 1.225 +/* This is the size of the buffer for expansion CE's */ 1.226 +/* In reality we should not have to deal with expm sequences longer then 16 */ 1.227 +/* you can change this value if you need memory */ 1.228 +/* WARNING THIS BUFFER DOES HAVE MALLOC FALLBACK. If you make it too small, you'll get into performance trouble */ 1.229 +/* Reasonable small value is around 10, if you don't do Arabic or other funky collations that have long expansion sequence */ 1.230 +/* This is the longest expansion sequence we can handle without bombing out */ 1.231 +#define UCOL_EXPAND_CE_BUFFER_SIZE 64 1.232 + 1.233 +/* This is the size to increase the buffer for expansion CE's */ 1.234 +#define UCOL_EXPAND_CE_BUFFER_EXTEND_SIZE 64 1.235 + 1.236 + 1.237 +/* Unsafe UChar hash table table size. */ 1.238 +/* size is 32 bytes for 1 bit for each latin 1 char + some power of two for */ 1.239 +/* hashing the rest of the chars. Size in bytes */ 1.240 +#define UCOL_UNSAFECP_TABLE_SIZE 1056 1.241 + /* mask value down to "some power of two"-1 */ 1.242 + /* number of bits, not num of bytes. 
*/ 1.243 +#define UCOL_UNSAFECP_TABLE_MASK 0x1fff 1.244 + 1.245 + 1.246 +/* flags bits for collIterate.flags */ 1.247 +/* */ 1.248 +/* NORM - set for incremental normalize of source string */ 1.249 +#define UCOL_ITER_NORM 1 1.250 + 1.251 +#define UCOL_ITER_HASLEN 2 1.252 + 1.253 + /* UCOL_ITER_INNORMBUF - set if the "pos" is in */ 1.254 + /* the writable side buffer, handling */ 1.255 + /* incrementally normalized characters. */ 1.256 +#define UCOL_ITER_INNORMBUF 4 1.257 + 1.258 + /* UCOL_ITER_ALLOCATED - set if this iterator has */ 1.259 + /* malloced storage to expand a buffer. */ 1.260 +#define UCOL_ITER_ALLOCATED 8 1.261 + /* UCOL_HIRAGANA_Q - note if the codepoint was hiragana */ 1.262 +#define UCOL_HIRAGANA_Q 16 1.263 + /* UCOL_WAS_HIRAGANA - set to TRUE if there was a Hiragana */ 1.264 + /* otherwise set to false */ 1.265 +#define UCOL_WAS_HIRAGANA 32 1.266 + /* UCOL_USE_ITERATOR - set this if collIterate uses a */ 1.267 + /* character iterator instead of simply accessing string */ 1.268 + /* by index */ 1.269 +#define UCOL_USE_ITERATOR 64 1.270 + 1.271 +#define UCOL_FORCE_HAN_IMPLICIT 128 1.272 + 1.273 +#define NFC_ZERO_CC_BLOCK_LIMIT_ 0x300 1.274 + 1.275 +#ifdef __cplusplus 1.276 + 1.277 +U_NAMESPACE_BEGIN 1.278 + 1.279 +typedef struct collIterate : public UMemory { 1.280 + const UChar *string; /* Original string */ 1.281 + /* UChar *start; Pointer to the start of the source string. Either points to string 1.282 + or to writableBuffer */ 1.283 + const UChar *endp; /* string end ptr. Is undefined for null terminated strings */ 1.284 + const UChar *pos; /* This is position in the string. 
Can be to original or writable buf */ 1.285 + 1.286 + uint32_t *toReturn; /* This is the CE from CEs buffer that should be returned */ 1.287 + uint32_t *CEpos; /* This is the position to which we have stored processed CEs */ 1.288 + 1.289 + int32_t *offsetReturn; /* This is the offset to return, if non-NULL */ 1.290 + int32_t *offsetStore; /* This is the pointer for storing offsets */ 1.291 + int32_t offsetRepeatCount; /* Repeat stored offset if non-zero */ 1.292 + int32_t offsetRepeatValue; /* offset value to repeat */ 1.293 + 1.294 + UnicodeString writableBuffer; 1.295 + const UChar *fcdPosition; /* Position in the original string to continue FCD check from. */ 1.296 + const UCollator *coll; 1.297 + const Normalizer2 *nfd; 1.298 + uint8_t flags; 1.299 + uint8_t origFlags; 1.300 + uint32_t *extendCEs; /* This is use if CEs is not big enough */ 1.301 + int32_t extendCEsSize; /* Holds the size of the dynamic CEs buffer */ 1.302 + uint32_t CEs[UCOL_EXPAND_CE_BUFFER_SIZE]; /* This is where we store CEs */ 1.303 + 1.304 + int32_t *offsetBuffer; /* A dynamic buffer to hold offsets */ 1.305 + int32_t offsetBufferSize; /* The size of the offset buffer */ 1.306 + 1.307 + UCharIterator *iterator; 1.308 + /*int32_t iteratorIndex;*/ 1.309 + 1.310 + // The offsetBuffer should probably be a UVector32, but helper functions 1.311 + // are an improvement over duplicated code. 1.312 + void appendOffset(int32_t offset, UErrorCode &errorCode); 1.313 +} collIterate; 1.314 + 1.315 +U_NAMESPACE_END 1.316 + 1.317 +#else 1.318 + 1.319 +typedef struct collIterate collIterate; 1.320 + 1.321 +#endif 1.322 + 1.323 +#define paddedsize(something) ((something)+((((something)%4)!=0)?(4-(something)%4):0)) 1.324 +#define headersize (paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet))) 1.325 + 1.326 +/* 1.327 +struct used internally in getSpecial*CE. 1.328 +data similar to collIterate. 
1.329 +*/ 1.330 +struct collIterateState { 1.331 + const UChar *pos; /* This is position in the string. Can be to original or writable buf */ 1.332 + const UChar *returnPos; 1.333 + const UChar *fcdPosition; /* Position in the original string to continue FCD check from. */ 1.334 + const UChar *bufferaddress; /* address of the normalization buffer */ 1.335 + int32_t buffersize; 1.336 + uint8_t flags; 1.337 + uint8_t origFlags; 1.338 + uint32_t iteratorIndex; 1.339 + int32_t iteratorMove; 1.340 +}; 1.341 + 1.342 +U_CAPI void U_EXPORT2 1.343 +uprv_init_collIterate(const UCollator *collator, 1.344 + const UChar *sourceString, int32_t sourceLen, 1.345 + U_NAMESPACE_QUALIFIER collIterate *s, UErrorCode *status); 1.346 + 1.347 +/* Internal functions for C test code. */ 1.348 +U_CAPI U_NAMESPACE_QUALIFIER collIterate * U_EXPORT2 1.349 +uprv_new_collIterate(UErrorCode *status); 1.350 + 1.351 +U_CAPI void U_EXPORT2 1.352 +uprv_delete_collIterate(U_NAMESPACE_QUALIFIER collIterate *s); 1.353 + 1.354 +/* @return s->pos == s->endp */ 1.355 +U_CAPI UBool U_EXPORT2 1.356 +uprv_collIterateAtEnd(U_NAMESPACE_QUALIFIER collIterate *s); 1.357 + 1.358 +#ifdef __cplusplus 1.359 + 1.360 +U_NAMESPACE_BEGIN 1.361 + 1.362 +struct UCollationPCE; 1.363 +typedef struct UCollationPCE UCollationPCE; 1.364 + 1.365 +U_NAMESPACE_END 1.366 + 1.367 +struct UCollationElements : public icu::UMemory 1.368 +{ 1.369 + /** 1.370 + * Struct wrapper for source data 1.371 + */ 1.372 + icu::collIterate iteratordata_; 1.373 + /** 1.374 + * Indicates if this data has been reset. 1.375 + */ 1.376 + UBool reset_; 1.377 + /** 1.378 + * Indicates if the data should be deleted. 1.379 + */ 1.380 + UBool isWritable; 1.381 + 1.382 +/** 1.383 + * Data for getNextProcessed, getPreviousProcessed. 
1.384 + */ 1.385 + icu::UCollationPCE *pce; 1.386 +}; 1.387 + 1.388 +#else 1.389 +/*opaque type*/ 1.390 +struct UCollationElements; 1.391 +#endif 1.392 + 1.393 +U_CAPI void U_EXPORT2 1.394 +uprv_init_pce(const struct UCollationElements *elems); 1.395 + 1.396 +#define UCOL_LEVELTERMINATOR 1 1.397 + 1.398 +/* mask off anything but primary order */ 1.399 +#define UCOL_PRIMARYORDERMASK 0xffff0000 1.400 +/* mask off anything but secondary order */ 1.401 +#define UCOL_SECONDARYORDERMASK 0x0000ff00 1.402 +/* mask off anything but tertiary order */ 1.403 +#define UCOL_TERTIARYORDERMASK 0x000000ff 1.404 +/* primary order shift */ 1.405 +#define UCOL_PRIMARYORDERSHIFT 16 1.406 +/* secondary order shift */ 1.407 +#define UCOL_SECONDARYORDERSHIFT 8 1.408 + 1.409 +#define UCOL_BYTE_SIZE_MASK 0xFF 1.410 + 1.411 +#define UCOL_CASE_BYTE_START 0x80 1.412 +#define UCOL_CASE_SHIFT_START 7 1.413 + 1.414 +#define UCOL_IGNORABLE 0 1.415 + 1.416 +/* get weights from a CE */ 1.417 +#define UCOL_PRIMARYORDER(order) (((order) & UCOL_PRIMARYORDERMASK)>> UCOL_PRIMARYORDERSHIFT) 1.418 +#define UCOL_SECONDARYORDER(order) (((order) & UCOL_SECONDARYORDERMASK)>> UCOL_SECONDARYORDERSHIFT) 1.419 +#define UCOL_TERTIARYORDER(order) ((order) & UCOL_TERTIARYORDERMASK) 1.420 + 1.421 +/** 1.422 + * Determine if a character is a Thai vowel (which sorts after 1.423 + * its base consonant). 
1.424 + */ 1.425 +#define UCOL_ISTHAIPREVOWEL(ch) ((((uint32_t)(ch) - 0xe40) <= (0xe44 - 0xe40)) || \ 1.426 + (((uint32_t)(ch) - 0xec0) <= (0xec4 - 0xec0))) 1.427 + 1.428 +/** 1.429 + * Determine if a character is a Thai base consonant (0xe01..0xe2e). 1.430 + * The whole expansion is parenthesized so it can be negated or combined with && and ||. 1.430 + */ 1.431 +#define UCOL_ISTHAIBASECONSONANT(ch) (((uint32_t)(ch) - 0xe01) <= (0xe2e - 0xe01)) 1.432 + 1.433 +#define UCOL_ISJAMO(ch) ((((uint32_t)(ch) - 0x1100) <= (0x1112 - 0x1100)) || \ 1.434 + (((uint32_t)(ch) - 0x1161) <= (0x1175 - 0x1161)) || \ 1.435 + (((uint32_t)(ch) - 0x11A8) <= (0x11C2 - 0x11A8))) 1.436 + 1.437 +/* Han character ranges */ 1.438 +#define UCOL_FIRST_HAN 0x4E00 1.439 +#define UCOL_LAST_HAN 0x9FFF 1.440 +#define UCOL_FIRST_HAN_A 0x3400 1.441 +#define UCOL_LAST_HAN_A 0x4DBF 1.442 +#define UCOL_FIRST_HAN_COMPAT 0xFA0E /* NOTE(review): was 0xFAE0, which lies above UCOL_LAST_HAN_COMPAT and made the range empty; 0xFA0E presumed intended - confirm */ 1.443 +#define UCOL_LAST_HAN_COMPAT 0xFA2F 1.444 + 1.445 +/* Han extension B is in plane 2 */ 1.446 +#define UCOL_FIRST_HAN_B 0x20000 1.447 +#define UCOL_LAST_HAN_B 0x2A6DF 1.448 + 1.449 +/* Hangul range */ 1.450 +#define UCOL_FIRST_HANGUL 0xAC00 1.451 +#define UCOL_LAST_HANGUL 0xD7AF 1.452 + 1.453 +/* Jamo ranges */ 1.454 +#define UCOL_FIRST_L_JAMO 0x1100 1.455 +#define UCOL_FIRST_V_JAMO 0x1161 1.456 +#define UCOL_FIRST_T_JAMO 0x11A8 1.457 +#define UCOL_LAST_T_JAMO 0x11F9 1.458 + 1.459 + 1.460 +#if 0 1.461 +/* initializes collIterate structure */ 1.462 +/* made as macro to speed up things */ 1.463 +#define init_collIterate(collator, sourceString, sourceLen, s) { \ 1.464 + (s)->start = (s)->string = (s)->pos = (UChar *)(sourceString); \ 1.465 + (s)->endp = (sourceLen) == -1 ?
NULL :(UChar *)(sourceString)+(sourceLen); \ 1.466 + (s)->CEpos = (s)->toReturn = (s)->CEs; \ 1.467 + (s)->isThai = TRUE; \ 1.468 + (s)->writableBuffer = (s)->stackWritableBuffer; \ 1.469 + (s)->writableBufSize = UCOL_WRITABLE_BUFFER_SIZE; \ 1.470 + (s)->coll = (collator); \ 1.471 + (s)->fcdPosition = 0; \ 1.472 + (s)->flags = 0; \ 1.473 + if(((collator)->normalizationMode == UCOL_ON)) (s)->flags |= UCOL_ITER_NORM; \ 1.474 +} 1.475 +#endif 1.476 + 1.477 + 1.478 + 1.479 +/* 1.480 +* Macro to get the maximum size of an expansion ending with the argument ce. 1.481 +* Used in the Boyer Moore algorithm. 1.482 +* Note for tailoring, the UCA maxexpansion table has been merged. 1.483 +* Hence only the tailored collator has to be searched. 1.484 +* Expands to a brace block, not an expression; coll and order are evaluated more than once. 1.484 +* @param coll const UCollator pointer 1.485 +* @param order last collation element of the expansion sequence 1.486 +* @param result lvalue receiving the size of the longest expansion with argument collation element 1.487 +* as the last element 1.488 +*/ 1.489 +#define UCOL_GETMAXEXPANSION(coll, order, result) { \ 1.490 + const uint32_t *start; \ 1.491 + const uint32_t *limit; \ 1.492 + const uint32_t *mid; \ 1.493 + start = (coll)->endExpansionCE; \ 1.494 + limit = (coll)->lastEndExpansionCE; \ 1.495 + while (start < limit - 1) { \ 1.496 + mid = start + ((limit - start) >> 1); \ 1.497 + if ((order) <= *mid) { \ 1.498 + limit = mid; \ 1.499 + } \ 1.500 + else { \ 1.501 + start = mid; \ 1.502 + } \ 1.503 + } \ 1.504 + if (*start == (order)) { \ 1.505 + (result) = *((coll)->expansionCESize + (start - (coll)->endExpansionCE)); \ 1.506 + } \ 1.507 + else if (*limit == (order)) { \ 1.508 + (result) = *((coll)->expansionCESize + (limit - (coll)->endExpansionCE)); \ 1.509 + } \ 1.510 + else if (((order) & 0xFFFF) == 0x00C0) { \ 1.511 + (result) = 2; \ 1.512 + } \ 1.513 + else { \ 1.514 + (result) = 1; \ 1.515 + } \ 1.516 +} 1.517 + 1.518 +U_CFUNC 1.519 +uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, 1.520 + U_NAMESPACE_QUALIFIER collIterate
*source, UErrorCode *status); 1.521 + 1.522 +U_CFUNC 1.523 +uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE, 1.524 + U_NAMESPACE_QUALIFIER collIterate *source, UErrorCode *status); 1.525 +U_CAPI uint32_t U_EXPORT2 ucol_getNextCE(const UCollator *coll, 1.526 + U_NAMESPACE_QUALIFIER collIterate *collationSource, UErrorCode *status); 1.527 +U_CFUNC uint32_t U_EXPORT2 ucol_getPrevCE(const UCollator *coll, 1.528 + U_NAMESPACE_QUALIFIER collIterate *collationSource, 1.529 + UErrorCode *status); 1.530 +/* get some memory */ 1.531 +void *ucol_getABuffer(const UCollator *coll, uint32_t size); 1.532 + 1.533 +#ifdef __cplusplus 1.534 + 1.535 +U_NAMESPACE_BEGIN 1.536 + 1.537 +class CollationKey; 1.538 +class SortKeyByteSink; 1.539 + 1.540 +U_NAMESPACE_END 1.541 + 1.542 +/* function used by C++ getCollationKey to prevent restarting the calculation */ 1.543 +U_CFUNC int32_t 1.544 +ucol_getCollationKey(const UCollator *coll, 1.545 + const UChar *source, int32_t sourceLength, 1.546 + icu::CollationKey &key, 1.547 + UErrorCode &errorCode); 1.548 + 1.549 +typedef void U_CALLCONV 1.550 +SortKeyGenerator(const UCollator *coll, 1.551 + const UChar *source, 1.552 + int32_t sourceLength, 1.553 + icu::SortKeyByteSink &result, 1.554 + UErrorCode *status); 1.555 + 1.556 +/* worker function for generating sortkeys */ 1.557 +U_CFUNC 1.558 +void U_CALLCONV 1.559 +ucol_calcSortKey(const UCollator *coll, 1.560 + const UChar *source, 1.561 + int32_t sourceLength, 1.562 + icu::SortKeyByteSink &result, 1.563 + UErrorCode *status); 1.564 + 1.565 +U_CFUNC 1.566 +void U_CALLCONV 1.567 +ucol_calcSortKeySimpleTertiary(const UCollator *coll, 1.568 + const UChar *source, 1.569 + int32_t sourceLength, 1.570 + icu::SortKeyByteSink &result, 1.571 + UErrorCode *status); 1.572 + 1.573 +#else 1.574 + 1.575 +typedef void U_CALLCONV 1.576 +SortKeyGenerator(const UCollator *coll, 1.577 + const UChar *source, 1.578 + int32_t sourceLength, 1.579 + void *result, 1.580 + UErrorCode 
*status); 1.581 + 1.582 +#endif 1.583 + 1.584 +/** 1.585 + * Used to set requested and valid locales on a collator returned by the collator 1.586 + * service. 1.587 + */ 1.588 +U_CFUNC void U_EXPORT2 1.589 +ucol_setReqValidLocales(UCollator *coll, char *requestedLocaleToAdopt, char *validLocaleToAdopt, char *actualLocaleToAdopt); 1.590 + 1.591 +#define UCOL_SPECIAL_FLAG 0xF0000000 1.592 +#define UCOL_TAG_SHIFT 24 1.593 +#define UCOL_TAG_MASK 0x0F000000 1.594 +#define INIT_EXP_TABLE_SIZE 1024 1.595 +#define UCOL_NOT_FOUND 0xF0000000 1.596 +#define UCOL_EXPANSION 0xF1000000 1.597 +#define UCOL_CONTRACTION 0xF2000000 1.598 +#define UCOL_THAI 0xF3000000 1.599 +#define UCOL_UNMARKED 0x03 1.600 +#define UCOL_NEW_TERTIARYORDERMASK 0x0000003f 1.601 + 1.602 +/* Bit mask for primary collation strength. */ 1.603 +#define UCOL_PRIMARYMASK 0xFFFF0000 1.604 + 1.605 +/* Bit mask for secondary collation strength. */ 1.606 +#define UCOL_SECONDARYMASK 0x0000FF00 1.607 + 1.608 +/* Bit mask for tertiary collation strength. */ 1.609 +#define UCOL_TERTIARYMASK 0x000000FF 1.610 + 1.611 +/** 1.612 + * Internal. 1.613 + * This indicates the last element in a UCollationElements has been consumed. 1.614 + * Compare with the UCOL_NULLORDER, UCOL_NULLORDER is returned if error occurs. 
1.615 + */ 1.616 +#define UCOL_NO_MORE_CES 0x00010101 1.617 +#define UCOL_NO_MORE_CES_PRIMARY 0x00010000 1.618 +#define UCOL_NO_MORE_CES_SECONDARY 0x00000100 1.619 +#define UCOL_NO_MORE_CES_TERTIARY 0x00000001 1.620 + 1.621 +#define isSpecial(CE) ((((CE)&UCOL_SPECIAL_FLAG)>>28)==0xF) 1.622 + 1.623 +#define UCOL_UPPER_CASE 0x80 1.624 +#define UCOL_MIXED_CASE 0x40 1.625 +#define UCOL_LOWER_CASE 0x00 1.626 + 1.627 +#define UCOL_CONTINUATION_MARKER 0xC0 1.628 +#define UCOL_REMOVE_CONTINUATION 0xFFFFFF3F 1.629 + 1.630 +#define isContinuation(CE) (((CE) & UCOL_CONTINUATION_MARKER) == UCOL_CONTINUATION_MARKER) 1.631 +#define isFlagged(CE) (((CE) & 0x80) == 0x80) 1.632 +#define isLongPrimary(CE) (((CE) & 0xC0) == 0xC0) 1.633 + 1.634 +#define getCETag(CE) (((CE)&UCOL_TAG_MASK)>>UCOL_TAG_SHIFT) 1.635 +#define isContraction(CE) (isSpecial((CE)) && (getCETag((CE)) == CONTRACTION_TAG)) 1.636 +#define isPrefix(CE) (isSpecial((CE)) && (getCETag((CE)) == SPEC_PROC_TAG)) 1.637 +#define constructContractCE(tag, CE) (UCOL_SPECIAL_FLAG | ((tag)<<UCOL_TAG_SHIFT) | ((CE)&0xFFFFFF)) 1.638 +#define constructSpecProcCE(CE) (UCOL_SPECIAL_FLAG | (SPEC_PROC_TAG<<UCOL_TAG_SHIFT) | ((CE)&0xFFFFFF)) 1.639 +#define getContractOffset(CE) ((CE)&0xFFFFFF) 1.640 +#define getExpansionOffset(CE) (((CE)&0x00FFFFF0)>>4) 1.641 +#define getExpansionCount(CE) ((CE)&0xF) 1.642 +#define isCEIgnorable(CE) (((CE) & 0xFFFFFFBF) == 0) 1.643 + 1.644 +/* StringSearch internal use */ 1.645 +#define inNormBuf(coleiter) ((coleiter)->iteratordata_.flags & UCOL_ITER_INNORMBUF) 1.646 +#define isFCDPointerNull(coleiter) ((coleiter)->iteratordata_.fcdPosition == NULL) 1.647 +#define hasExpansion(coleiter) ((coleiter)->iteratordata_.CEpos != (coleiter)->iteratordata_.CEs) 1.648 +#define getExpansionPrefix(coleiter) ((coleiter)->iteratordata_.toReturn - (coleiter)->iteratordata_.CEs) 1.649 +#define setExpansionPrefix(coleiter, offset) ((coleiter)->iteratordata_.CEs + offset) 1.650 +#define getExpansionSuffix(coleiter) 
((coleiter)->iteratordata_.CEpos - (coleiter)->iteratordata_.toReturn) 1.651 +#define setExpansionSuffix(coleiter, offset) ((coleiter)->iteratordata_.toReturn = (coleiter)->iteratordata_.CEpos - (offset)) /* places toReturn `offset` CEs before CEpos */ 1.652 + 1.653 +/* This is an enum that lists magic special byte values from the fractional UCA. 1.654 + * See also http://site.icu-project.org/design/collation/bytes */ 1.655 +/* TODO: all the #defines that refer to special byte values from the UCA should be changed to point here */ 1.656 + 1.657 +enum { 1.658 + UCOL_BYTE_ZERO = 0x00, 1.659 + UCOL_BYTE_LEVEL_SEPARATOR = 0x01, 1.660 + UCOL_BYTE_SORTKEY_GLUE = 0x02, 1.661 + UCOL_BYTE_SHIFT_PREFIX = 0x03, 1.662 + UCOL_BYTE_UNSHIFTED_MIN = UCOL_BYTE_SHIFT_PREFIX, 1.663 + UCOL_BYTE_FIRST_TAILORED = 0x04, 1.664 + UCOL_BYTE_COMMON = 0x05, 1.665 + UCOL_BYTE_FIRST_UCA = UCOL_BYTE_COMMON, 1.666 + /* TODO: Make the following values dynamic since they change with almost every UCA version. */ 1.667 + UCOL_CODAN_PLACEHOLDER = 0x12, 1.668 + UCOL_BYTE_FIRST_NON_LATIN_PRIMARY = 0x5B, 1.669 + UCOL_BYTE_UNSHIFTED_MAX = 0xFF 1.670 +}; 1.671 + 1.672 +#if 0 1.673 +#define UCOL_RESET_TOP_VALUE 0x9F000303 1.674 +#define UCOL_FIRST_PRIMARY_IGNORABLE 0x00008705 1.675 +#define UCOL_LAST_PRIMARY_IGNORABLE 0x0000DD05 1.676 +#define UCOL_LAST_PRIMARY_IGNORABLE_CONT 0x000051C0 1.677 +#define UCOL_FIRST_SECONDARY_IGNORABLE 0x00000000 1.678 +#define UCOL_LAST_SECONDARY_IGNORABLE 0x00000500 1.679 +#define UCOL_FIRST_TERTIARY_IGNORABLE 0x00000000 1.680 +#define UCOL_LAST_TERTIARY_IGNORABLE 0x00000000 1.681 +#define UCOL_FIRST_VARIABLE 0x05070505 1.682 +#define UCOL_LAST_VARIABLE 0x179B0505 1.683 +#define UCOL_FIRST_NON_VARIABLE 0x1A200505 1.684 +#define UCOL_LAST_NON_VARIABLE 0x7B41058F 1.685 + 1.686 +#define UCOL_NEXT_TOP_VALUE 0xE8960303 1.687 +#define UCOL_NEXT_FIRST_PRIMARY_IGNORABLE 0x00008905 1.688 +#define UCOL_NEXT_LAST_PRIMARY_IGNORABLE 0x03000303 1.689 +#define UCOL_NEXT_FIRST_SECONDARY_IGNORABLE 0x00008705 1.690 +#define
UCOL_NEXT_LAST_SECONDARY_IGNORABLE 0x00000500 1.691 +#define UCOL_NEXT_FIRST_TERTIARY_IGNORABLE 0x00000000 1.692 +#define UCOL_NEXT_LAST_TERTIARY_IGNORABLE 0x00000000 1.693 +#define UCOL_NEXT_FIRST_VARIABLE 0x05090505 1.694 +#define UCOL_NEXT_LAST_VARIABLE 0x1A200505 1.695 + 1.696 +#define PRIMARY_IMPLICIT_MIN 0xE8000000 1.697 +#define PRIMARY_IMPLICIT_MAX 0xF0000000 1.698 +#endif 1.699 + 1.700 +/* These constants can be changed - sortkey size is affected by them */ 1.701 +#define UCOL_PROPORTION2 0.5 1.702 +#define UCOL_PROPORTION3 0.667 1.703 + 1.704 +/* These values come from the UCA */ 1.705 +#define UCOL_COMMON_BOT2 UCOL_BYTE_COMMON 1.706 +#define UCOL_COMMON_TOP2 0x86u 1.707 +#define UCOL_TOTAL2 (UCOL_COMMON_TOP2-UCOL_COMMON_BOT2-1) 1.708 + 1.709 +#define UCOL_FLAG_BIT_MASK_CASE_SW_OFF 0x80 1.710 +#define UCOL_FLAG_BIT_MASK_CASE_SW_ON 0x40 1.711 +#define UCOL_COMMON_TOP3_CASE_SW_OFF 0x85 1.712 +#define UCOL_COMMON_TOP3_CASE_SW_LOWER 0x45 1.713 +#define UCOL_COMMON_TOP3_CASE_SW_UPPER 0xC5 1.714 + 1.715 +/* These values come from the UCA */ 1.716 +#define UCOL_COMMON_BOT3 0x05 1.717 + 1.717 +/* No trailing ';' on these two: a semicolon inside the replacement list breaks any use within an expression. */ 1.718 +#define UCOL_COMMON_BOTTOM3_CASE_SW_UPPER 0x86 1.719 +#define UCOL_COMMON_BOTTOM3_CASE_SW_LOWER UCOL_COMMON_BOT3 1.720 + 1.721 +#define UCOL_TOP_COUNT2 (UCOL_PROPORTION2*UCOL_TOTAL2) 1.722 +#define UCOL_BOT_COUNT2 (UCOL_TOTAL2-UCOL_TOP_COUNT2) 1.723 + 1.724 + 1.725 +#define UCOL_COMMON2 UCOL_COMMON_BOT2 1.726 +#define UCOL_COMMON3_UPPERFIRST 0xC5 1.727 +#define UCOL_COMMON3_NORMAL UCOL_COMMON_BOT3 1.728 + 1.729 +#define UCOL_COMMON4 0xFF 1.730 + 1.731 +/* constants for case level/case first handling */ 1.732 +/* used to instantiate UCollators fields in ucol_updateInternalState */ 1.733 +#define UCOL_CASE_SWITCH 0xC0 1.734 +#define UCOL_NO_CASE_SWITCH 0x00 1.735 + 1.736 +#define UCOL_REMOVE_CASE 0x3F 1.737 +#define UCOL_KEEP_CASE 0xFF 1.738 + 1.739 +#define UCOL_CASE_BIT_MASK 0xC0 1.740 + 1.741 +#define UCOL_TERT_CASE_MASK 0xFF 1.742 + 1.743 +#define
UCOL_ENDOFLATINONERANGE 0xFF 1.744 +#define UCOL_LATINONETABLELEN (UCOL_ENDOFLATINONERANGE+50) 1.745 +#define UCOL_BAIL_OUT_CE 0xFF000000 1.746 + 1.747 + 1.748 +typedef enum { 1.749 + NOT_FOUND_TAG = 0, 1.750 + EXPANSION_TAG = 1, /* This code point results in an expansion */ 1.751 + CONTRACTION_TAG = 2, /* Start of a contraction */ 1.752 + THAI_TAG = 3, /* Thai character - do the reordering */ 1.753 + CHARSET_TAG = 4, /* Charset processing, not yet implemented */ 1.754 + SURROGATE_TAG = 5, /* Lead surrogate that is tailored and doesn't start a contraction */ 1.755 + HANGUL_SYLLABLE_TAG = 6, /* AC00-D7AF*/ 1.756 + LEAD_SURROGATE_TAG = 7, /* D800-DBFF*/ 1.757 + TRAIL_SURROGATE_TAG = 8, /* DC00-DFFF*/ 1.758 + CJK_IMPLICIT_TAG = 9, /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/ 1.759 + IMPLICIT_TAG = 10, 1.760 + SPEC_PROC_TAG = 11, 1.761 + /* ICU 2.1 */ 1.762 + LONG_PRIMARY_TAG = 12, /* This is a three byte primary with starting secondaries and tertiaries */ 1.763 + /* It fits in a single 32 bit CE and is used instead of expansion to save */ 1.764 + /* space without affecting the performance (hopefully) */ 1.765 + 1.766 + DIGIT_TAG = 13, /* COllate Digits As Numbers (CODAN) implementation */ 1.767 + 1.768 + CE_TAGS_COUNT 1.769 +} UColCETags; 1.770 + 1.771 +/* 1.772 + ***************************************************************************************** 1.773 + * set to zero 1.774 + * NON_CHARACTER FDD0 - FDEF, FFFE, FFFF, 1FFFE, 1FFFF, 2FFFE, 2FFFF,...e.g. 
**FFFE, **FFFF 1.775 + ****************************************************************************************** 1.776 + */ 1.777 + 1.778 +typedef struct { 1.779 + uint32_t variableTopValue; 1.780 + /*UColAttributeValue*/ int32_t frenchCollation; 1.781 + /*UColAttributeValue*/ int32_t alternateHandling; /* attribute for handling variable elements*/ 1.782 + /*UColAttributeValue*/ int32_t caseFirst; /* who goes first, lower case or uppercase */ 1.783 + /*UColAttributeValue*/ int32_t caseLevel; /* do we have an extra case level */ 1.784 + /*UColAttributeValue*/ int32_t normalizationMode; /* attribute for normalization */ 1.785 + /*UColAttributeValue*/ int32_t strength; /* attribute for strength */ 1.786 + /*UColAttributeValue*/ int32_t hiraganaQ; /* attribute for special Hiragana */ 1.787 + /*UColAttributeValue*/ int32_t numericCollation; /* attribute for numeric collation */ 1.788 + uint32_t reserved[15]; /* for future use */ 1.789 +} UColOptionSet; 1.790 + 1.791 +typedef struct { 1.792 + uint32_t UCA_FIRST_TERTIARY_IGNORABLE[2]; /*0x00000000*/ 1.793 + uint32_t UCA_LAST_TERTIARY_IGNORABLE[2]; /*0x00000000*/ 1.794 + uint32_t UCA_FIRST_PRIMARY_IGNORABLE[2]; /*0x00008705*/ 1.795 + uint32_t UCA_FIRST_SECONDARY_IGNORABLE[2]; /*0x00000000*/ 1.796 + uint32_t UCA_LAST_SECONDARY_IGNORABLE[2]; /*0x00000500*/ 1.797 + uint32_t UCA_LAST_PRIMARY_IGNORABLE[2]; /*0x0000DD05*/ 1.798 + uint32_t UCA_FIRST_VARIABLE[2]; /*0x05070505*/ 1.799 + uint32_t UCA_LAST_VARIABLE[2]; /*0x13CF0505*/ 1.800 + uint32_t UCA_FIRST_NON_VARIABLE[2]; /*0x16200505*/ 1.801 + uint32_t UCA_LAST_NON_VARIABLE[2]; /*0x767C0505*/ 1.802 + uint32_t UCA_RESET_TOP_VALUE[2]; /*0x9F000303*/ 1.803 + uint32_t UCA_FIRST_IMPLICIT[2]; 1.804 + uint32_t UCA_LAST_IMPLICIT[2]; 1.805 + uint32_t UCA_FIRST_TRAILING[2]; 1.806 + uint32_t UCA_LAST_TRAILING[2]; 1.807 + 1.808 +#if 0 1.809 + uint32_t UCA_NEXT_TOP_VALUE[2]; /*0xE8960303*/ 1.810 + uint32_t UCA_NEXT_FIRST_PRIMARY_IGNORABLE; /*0x00008905*/ 1.811 + uint32_t 
UCA_NEXT_LAST_PRIMARY_IGNORABLE; /*0x03000303*/ 1.812 + uint32_t UCA_NEXT_FIRST_SECONDARY_IGNORABLE; /*0x00008705*/ 1.813 + uint32_t UCA_NEXT_LAST_SECONDARY_IGNORABLE; /*0x00000500*/ 1.814 + uint32_t UCA_NEXT_FIRST_TERTIARY_IGNORABLE; /*0x00000000*/ 1.815 + uint32_t UCA_NEXT_LAST_TERTIARY_IGNORABLE; /*0x00000000*/ 1.816 + uint32_t UCA_NEXT_FIRST_VARIABLE; /*0x05090505*/ 1.817 + uint32_t UCA_NEXT_LAST_VARIABLE; /*0x16200505*/ 1.818 +#endif 1.819 + 1.820 + uint32_t UCA_PRIMARY_TOP_MIN; 1.821 + uint32_t UCA_PRIMARY_IMPLICIT_MIN; /*0xE8000000*/ 1.822 + uint32_t UCA_PRIMARY_IMPLICIT_MAX; /*0xF0000000*/ 1.823 + uint32_t UCA_PRIMARY_TRAILING_MIN; /*0xE8000000*/ 1.824 + uint32_t UCA_PRIMARY_TRAILING_MAX; /*0xF0000000*/ 1.825 + uint32_t UCA_PRIMARY_SPECIAL_MIN; /*0xE8000000*/ 1.826 + uint32_t UCA_PRIMARY_SPECIAL_MAX; /*0xF0000000*/ 1.827 +} UCAConstants; 1.828 + 1.829 +/* definition of UCATableHeader moved to common/ucol_data.h */ 1.830 + 1.831 +#define U_UNKNOWN_STATE 0 1.832 +#define U_COLLATOR_STATE 0x01 1.833 +#define U_STATE_LIMIT 0x02 1.834 + 1.835 +/* This is the first structure in a state */ 1.836 +/* it should be machine independent */ 1.837 +typedef struct { 1.838 + /* this structure is supposed to be readable on all the platforms.*/ 1.839 + /* first 2 fields hold the size of the structure in a platform independent way */ 1.840 + uint8_t sizeLo; 1.841 + uint8_t sizeHi; 1.842 + /* identifying the writing platform */ 1.843 + uint8_t isBigEndian; 1.844 + /* see U_CHARSET_FAMILY values in utypes.h */ 1.845 + uint8_t charsetFamily; 1.846 + /* version of ICU this state structure comes from */ 1.847 + uint8_t icuVersion[4]; 1.848 + /* What is the data following this state */ 1.849 + uint8_t type; 1.850 + /* more stuff to come, keep it on 16 byte boundary */ 1.851 + uint8_t reserved[7]; 1.852 +} UStateStruct; 1.853 + 1.854 +/* This structure follows UStatusStruct */ 1.855 +/* and contains data specific for the collators */ 1.856 +/* Endianess needs to be decided before 
accessing this structure */ 1.857 +/* However, it's size IS endianess independent */ 1.858 +typedef struct { 1.859 + /* size of this structure */ 1.860 + uint8_t sizeLo; 1.861 + uint8_t sizeHi; 1.862 + /* This state is followed by the frozen tailoring */ 1.863 + uint8_t containsTailoring; 1.864 + /* This state is followed by the frozen UCA */ 1.865 + uint8_t containsUCA; 1.866 + /* Version info - the same one */ 1.867 + uint8_t versionInfo[4]; 1.868 + 1.869 + /* for charset CEs */ 1.870 + uint8_t charsetName[32]; 1.871 + /* this is the resolved locale name*/ 1.872 + uint8_t locale[32]; 1.873 + 1.874 + /* Attributes. Open ended */ 1.875 + /* all the following will be moved to uint32_t because of portability */ 1.876 + /* variable top value */ 1.877 + uint32_t variableTopValue; 1.878 + /* attribute for handling variable elements*/ 1.879 + uint32_t /*UColAttributeValue*/ alternateHandling; 1.880 + /* how to handle secondary weights */ 1.881 + uint32_t /*UColAttributeValue*/ frenchCollation; 1.882 + /* who goes first, lower case or uppercase */ 1.883 + uint32_t /*UColAttributeValue*/ caseFirst; 1.884 + /* do we have an extra case level */ 1.885 + uint32_t /*UColAttributeValue*/ caseLevel; 1.886 + /* attribute for normalization */ 1.887 + uint32_t /*UColAttributeValue*/ normalizationMode; 1.888 + /* attribute for strength */ 1.889 + uint32_t /*UColAttributeValue*/ strength; 1.890 + /* to be immediately 16 byte aligned */ 1.891 + uint8_t reserved[12]; 1.892 +} UColStateStruct; 1.893 + 1.894 +#define UCOL_INV_SIZEMASK 0xFFF00000 1.895 +#define UCOL_INV_OFFSETMASK 0x000FFFFF 1.896 +#define UCOL_INV_SHIFTVALUE 20 1.897 + 1.898 +U_CDECL_BEGIN 1.899 + 1.900 +/* definition of InverseUCATableHeader moved to common/ucol_data.h */ 1.901 + 1.902 +typedef void U_CALLCONV 1.903 +ResourceCleaner(UCollator *coll); 1.904 + 1.905 + 1.906 +struct UCollator { 1.907 + UColOptionSet *options; 1.908 + SortKeyGenerator *sortKeyGen; 1.909 + uint32_t *latinOneCEs; 1.910 + char* actualLocale; 
1.911 + char* validLocale; 1.912 + char* requestedLocale; 1.913 + const UChar *rules; 1.914 + const UChar *ucaRules; 1.915 + const UCollator *UCA; 1.916 + const UCATableHeader *image; 1.917 + UTrie mapping; 1.918 + const uint32_t *latinOneMapping; 1.919 + const uint32_t *expansion; 1.920 + const UChar *contractionIndex; 1.921 + const uint32_t *contractionCEs; 1.922 + 1.923 + const uint32_t *endExpansionCE; /* array of last ces in an expansion ce. 1.924 + corresponds to expansionCESize */ 1.925 + const uint32_t *lastEndExpansionCE;/* pointer to the last element in endExpansionCE */ 1.926 + const uint8_t *expansionCESize; /* array of the maximum size of a 1.927 + expansion ce with the last ce 1.928 + corresponding to endExpansionCE, 1.929 + terminated with a null */ 1.930 + const uint8_t *unsafeCP; /* unsafe code points hashtable */ 1.931 + const uint8_t *contrEndCP; /* Contraction ending chars hash table */ 1.932 + UChar minUnsafeCP; /* Smallest unsafe Code Point. */ 1.933 + UChar minContrEndCP; /* Smallest code point at end of a contraction */ 1.934 + 1.935 + int32_t rulesLength; 1.936 + int32_t latinOneTableLen; 1.937 + 1.938 + uint32_t variableTopValue; 1.939 + UColAttributeValue frenchCollation; 1.940 + UColAttributeValue alternateHandling; /* attribute for handling variable elements*/ 1.941 + UColAttributeValue caseFirst; /* who goes first, lower case or uppercase */ 1.942 + UColAttributeValue caseLevel; /* do we have an extra case level */ 1.943 + UColAttributeValue normalizationMode; /* attribute for normalization */ 1.944 + UColAttributeValue strength; /* attribute for strength */ 1.945 + UColAttributeValue hiraganaQ; /* attribute for Hiragana */ 1.946 + UColAttributeValue numericCollation; 1.947 + UBool variableTopValueisDefault; 1.948 + UBool frenchCollationisDefault; 1.949 + UBool alternateHandlingisDefault; /* attribute for handling variable elements*/ 1.950 + UBool caseFirstisDefault; /* who goes first, lower case or uppercase */ 1.951 + UBool 
caseLevelisDefault; /* do we have an extra case level */ 1.952 + UBool normalizationModeisDefault; /* attribute for normalization */ 1.953 + UBool strengthisDefault; /* attribute for strength */ 1.954 + UBool hiraganaQisDefault; /* attribute for Hiragana */ 1.955 + UBool numericCollationisDefault; 1.956 + UBool hasRealData; /* some collators have only options, like French, no rules */ 1.957 + /* to speed up things, we use the UCA image, but we don't want it */ 1.958 + /* to run around */ 1.959 + 1.960 + UBool freeOnClose; 1.961 + UBool freeOptionsOnClose; 1.962 + UBool freeRulesOnClose; 1.963 + UBool freeImageOnClose; 1.964 + UBool freeDefaultReorderCodesOnClose; 1.965 + UBool freeReorderCodesOnClose; 1.966 + UBool freeLeadBytePermutationTableOnClose; 1.967 + 1.968 + UBool latinOneUse; 1.969 + UBool latinOneRegenTable; 1.970 + UBool latinOneFailed; 1.971 + 1.972 + int8_t tertiaryAddition; /* when switching case, we need to add or subtract different values */ 1.973 + uint8_t caseSwitch; 1.974 + uint8_t tertiaryCommon; 1.975 + uint8_t tertiaryMask; 1.976 + uint8_t tertiaryTop; /* Upper range when compressing */ 1.977 + uint8_t tertiaryBottom; /* Upper range when compressing */ 1.978 + uint8_t tertiaryTopCount; 1.979 + uint8_t tertiaryBottomCount; 1.980 + 1.981 + UVersionInfo dataVersion; /* Data info of UCA table */ 1.982 + int32_t* defaultReorderCodes; 1.983 + int32_t defaultReorderCodesLength; 1.984 + int32_t* reorderCodes; 1.985 + int32_t reorderCodesLength; 1.986 + uint8_t* leadBytePermutationTable; 1.987 + void *delegate; /* if non-null: C++ object to delegate all API calls to. */ 1.988 +}; 1.989 + 1.990 +U_CDECL_END 1.991 + 1.992 +/* various internal functions */ 1.993 + 1.994 +/* do not close UCA returned by ucol_initUCA! 
*/ 1.995 +U_CFUNC 1.996 +UCollator* ucol_initUCA(UErrorCode *status); 1.997 + 1.998 +U_CFUNC 1.999 +UCollator* ucol_initCollator(const UCATableHeader *image, UCollator *fillIn, const UCollator *UCA, UErrorCode *status); 1.1000 + 1.1001 +U_CFUNC 1.1002 +void ucol_setOptionsFromHeader(UCollator* result, UColOptionSet * opts, UErrorCode *status); 1.1003 + 1.1004 +U_CFUNC 1.1005 +UCollator* ucol_open_internal(const char* loc, UErrorCode* status); 1.1006 + 1.1007 +#if 0 1.1008 +U_CFUNC 1.1009 +void ucol_putOptionsToHeader(UCollator* result, UColOptionSet * opts, UErrorCode *status); 1.1010 +#endif 1.1011 + 1.1012 +U_CFUNC 1.1013 +void ucol_updateInternalState(UCollator *coll, UErrorCode *status); 1.1014 + 1.1015 +U_CFUNC uint32_t U_EXPORT2 ucol_getFirstCE(const UCollator *coll, UChar u, UErrorCode *status); 1.1016 +U_CAPI UBool U_EXPORT2 ucol_isTailored(const UCollator *coll, const UChar u, UErrorCode *status); 1.1017 + 1.1018 +U_CAPI const InverseUCATableHeader* U_EXPORT2 ucol_initInverseUCA(UErrorCode *status); 1.1019 + 1.1020 +U_CAPI void U_EXPORT2 1.1021 +uprv_uca_initImplicitConstants(UErrorCode *status); 1.1022 + 1.1023 +U_CAPI uint32_t U_EXPORT2 1.1024 +uprv_uca_getImplicitFromRaw(UChar32 cp); 1.1025 + 1.1026 +/*U_CFUNC uint32_t U_EXPORT2 1.1027 +uprv_uca_getImplicitPrimary(UChar32 cp);*/ 1.1028 + 1.1029 +U_CAPI UChar32 U_EXPORT2 1.1030 +uprv_uca_getRawFromImplicit(uint32_t implicit); 1.1031 + 1.1032 +U_CAPI UChar32 U_EXPORT2 1.1033 +uprv_uca_getRawFromCodePoint(UChar32 i); 1.1034 + 1.1035 +U_CAPI UChar32 U_EXPORT2 1.1036 +uprv_uca_getCodePointFromRaw(UChar32 i); 1.1037 + 1.1038 +typedef const UChar* GetCollationRulesFunction(void* context, const char* locale, const char* type, int32_t* pLength, UErrorCode* status); 1.1039 + 1.1040 +U_CAPI UCollator* U_EXPORT2 1.1041 +ucol_openRulesForImport( const UChar *rules, 1.1042 + int32_t rulesLength, 1.1043 + UColAttributeValue normalizationMode, 1.1044 + UCollationStrength strength, 1.1045 + UParseError *parseError, 
1.1046 + GetCollationRulesFunction importFunc, 1.1047 + void* context, 1.1048 + UErrorCode *status); 1.1049 + 1.1050 + 1.1051 +U_CFUNC void U_EXPORT2 1.1052 +ucol_buildPermutationTable(UCollator *coll, UErrorCode *status); 1.1053 + 1.1054 +U_CFUNC int U_EXPORT2 1.1055 +ucol_getLeadBytesForReorderCode(const UCollator *uca, int reorderCode, uint16_t* returnLeadBytes, int returnCapacity); 1.1056 + 1.1057 +U_CFUNC int U_EXPORT2 1.1058 +ucol_getReorderCodesForLeadByte(const UCollator *uca, int leadByte, int16_t* returnReorderCodes, int returnCapacity); 1.1059 + 1.1060 +#ifdef __cplusplus 1.1061 +/* 1.1062 + * Test whether a character is potentially "unsafe" for use as a collation 1.1063 + * starting point. Unsafe chars are those with combining class != 0 plus 1.1064 + * those that are the 2nd thru nth character in a contraction sequence. 1.1065 + * 1.1066 + * Function is in header file because it's used in both collation and string search, 1.1067 + * and needs to be inline for performance. 1.1068 + */ 1.1069 +static inline UBool ucol_unsafeCP(UChar c, const UCollator *coll) { 1.1070 + int32_t hash; 1.1071 + uint8_t htbyte; 1.1072 + 1.1073 + if (c < coll->minUnsafeCP) { 1.1074 + return FALSE; 1.1075 + } 1.1076 + 1.1077 + hash = c; 1.1078 + if (hash >= UCOL_UNSAFECP_TABLE_SIZE*8) { 1.1079 + if(U16_IS_SURROGATE(c)) { 1.1080 + /* Lead or trail surrogate */ 1.1081 + /* These are always considered unsafe. */ 1.1082 + return TRUE; 1.1083 + } 1.1084 + hash = (hash & UCOL_UNSAFECP_TABLE_MASK) + 256; 1.1085 + } 1.1086 + htbyte = coll->unsafeCP[hash>>3]; 1.1087 + return ((htbyte >> (hash & 7)) & 1); 1.1088 +} 1.1089 +#endif /* __cplusplus */ 1.1090 + 1.1091 +/* The offsetBuffer in collIterate might need to be freed to avoid memory leaks. */ 1.1092 +void ucol_freeOffsetBuffer(U_NAMESPACE_QUALIFIER collIterate *s); 1.1093 + 1.1094 +#endif /* #if !UCONFIG_NO_COLLATION */ 1.1095 + 1.1096 +#endif