|
1 /* |
|
2 ******************************************************************************* |
|
3 * |
|
4 * Copyright (C) 1998-2013, International Business Machines |
|
5 * Corporation and others. All Rights Reserved. |
|
6 * |
|
7 ******************************************************************************* |
|
8 * |
|
9 * Private implementation header for C collation |
|
10 * file name: ucol_imp.h |
|
11 * encoding: US-ASCII |
|
12 * tab size: 8 (not used) |
|
13 * indentation:4 |
|
14 * |
|
15 * created on: 2000dec11 |
|
16 * created by: Vladimir Weinstein |
|
17 * |
|
18 * Modification history |
|
19 * Date Name Comments |
|
20 * 02/16/2001 synwee Added UCOL_GETPREVCE for the use in ucoleitr |
|
21 * 02/27/2001 synwee Added getMaxExpansion data structure in UCollator |
|
22 * 03/02/2001 synwee Added UCOL_IMPLICIT_CE |
|
23 * 03/12/2001 synwee Added pointer start to collIterate. |
|
24 */ |
|
25 |
|
26 #ifndef UCOL_IMP_H |
|
27 #define UCOL_IMP_H |
|
28 |
|
29 #include "unicode/utypes.h" |
|
30 #ifdef __cplusplus |
|
31 # include "unicode/utf16.h" |
|
32 #endif |
|
33 |
|
/* Data type and data name strings for the UCA collation data ("ucadata"). */
#define UCA_DATA_TYPE "icu"
#define UCA_DATA_NAME "ucadata"
/* Data type and data name strings for the inverse UCA table ("invuca"). */
#define INVC_DATA_TYPE "icu"
#define INVC_DATA_NAME "invuca"

/**
 * Convenience string denoting the Collation data tree.
 * Built by literal concatenation of U_ICUDATA_NAME, U_TREE_SEPARATOR_STRING
 * and "coll"; both of those macros must be visible where this one is expanded.
 * @internal ICU 3.0
 */
#define U_ICUDATA_COLL U_ICUDATA_NAME U_TREE_SEPARATOR_STRING "coll"
|
44 |
|
45 #if !UCONFIG_NO_COLLATION |
|
46 |
|
47 #ifdef __cplusplus |
|
48 #include "unicode/normalizer2.h" |
|
49 #include "unicode/unistr.h" |
|
50 #endif |
|
51 #include "unicode/ucol.h" |
|
52 #include "ucol_data.h" |
|
53 #include "utrie.h" |
|
54 #include "cmemory.h" |
|
55 |
|
56 /* This is the internal header file which contains important declarations for |
|
57 * the collation framework. |
|
58 * Ready to use collators are stored as binary images. Both UCA and tailorings |
|
59 * share the same binary format. Individual files (currently only UCA) have a |
|
60 * udata header in front of the image and should be opened using udata_open. |
|
61 * Tailoring images are currently stored inside resource bundles and are initialized |
|
62 * through ucol_open API. |
|
63 * |
|
64 * The following describes the formats for collation binaries |
|
65 * (UCA & tailorings) and for the inverse UCA table. |
|
66 * Substructures are described in the collation design document at |
|
67 * http://source.icu-project.org/repos/icu/icuhtml/trunk/design/collation/ICU_collation_design.htm |
|
68 * |
|
69 * ------------------------------------------------------------- |
|
70 * |
|
71 * Here is the format of binary collation image. |
|
72 * |
|
73 * Physical order of structures: |
|
74 * - header (UCATableHeader) |
|
75 * - options (UColOptionSet) |
|
76 * - expansions (CE[]) |
|
77 * - contractions (UChar[contractionSize] + CE[contractionSize]) |
|
78 * - serialized UTrie with mappings of code points to CEs |
|
79 * - max expansion tables (CE[endExpansionCECount] + uint8_t[endExpansionCECount]) |
|
80 * - two bit sets for backward processing in strcoll (identical prefixes) |
|
81 * and for backward CE iteration (each set is uint8_t[UCOL_UNSAFECP_TABLE_SIZE]) |
|
82 * - UCA constants (UCAConstants) |
|
83 * - UCA contractions (UChar[contractionUCACombosSize][contractionUCACombosWidth]) |
|
84 * |
|
85 * UCATableHeader fields: |
|
86 * |
|
87 * int32_t size; - image size in bytes |
|
88 * |
|
89 * Offsets to interesting data. All offsets are in bytes. |
|
90 * to get the address add to the header address and cast properly. |
|
91 * Some offsets are zero if the corresponding structures are empty. |
|
92 * |
|
93 * Tailoring binaries that only set options and contain no mappings etc. |
|
94 * will have all offsets 0 except for the options and expansion offsets, |
|
95 * which give the position and length of the options array. |
|
96 * |
|
97 * uint32_t options; - offset to default collator options (UColOptionSet *), |
|
98 * a set of 32-bit values. See declaration of UColOptionSet for more details |
|
99 * |
|
100 * uint32_t UCAConsts; - only used (!=0) in UCA image - structure which holds values for indirect positioning and implicit ranges |
|
101 * See declaration of UCAConstants structure. This is a set of unsigned 32-bit values used to store |
|
102 * important constant values that are defined in the UCA and used for building and runtime. |
|
103 * |
|
104 * uint32_t contractionUCACombos; - only used (!=0) in UCA image - list of UCA contractions. This is a zero terminated array of UChar[contractionUCACombosWidth], |
|
105 * containing contractions from the UCA. These are needed in the build process to copy UCA contractions |
|
106 * in case the base contraction symbol is tailored. |
|
107 * |
|
108 * uint32_t magic; - must contain UCOL_HEADER_MAGIC (formatVersion 2.3) |
|
109 * |
|
110 * uint32_t mappingPosition; - offset to UTrie (const uint8_t *mappingPosition). This is a serialized UTrie and should be treated as such. |
|
111 * Used as a primary lookup table for collation elements. |
|
112 * |
|
113 * uint32_t expansion; - offset to expansion table (uint32_t *expansion). This is an array of expansion CEs. Never 0. |
|
114 * |
|
115 * uint32_t contractionIndex; - offset to contraction table (UChar *contractionIndex). Used to look up contraction sequences. Contents |
|
116 * are aligned with the contents of contractionCEs table. 0 if no contractions. |
|
117 * |
|
118 * uint32_t contractionCEs; - offset to resulting contraction CEs (uint32_t *contractionCEs). When a contraction is resolved in the |
|
119 * contractionIndex table, the resulting index is used to look up corresponding CE in this table. |
|
120 * 0 if no contractions. |
|
121 * uint32_t contractionSize; - size of contraction table in elements (both Index and CEs). |
|
122 * |
|
123 * Tables described below are used for Boyer-Moore searching algorithm - they define the size of longest expansion |
|
124 * and last CEs in expansions. |
|
125 * uint32_t endExpansionCE; - offset to array of last collation element in expansion (uint32_t *). |
|
126 * Never 0. |
|
127 * uint32_t expansionCESize; - array of maximum expansion sizes (uint8_t *) |
|
128 * int32_t endExpansionCECount; - size of endExpansionCE. See UCOL_GETMAXEXPANSION |
|
129 * for the usage model |
|
130 * |
|
131 * These two offsets point to byte tables that are used in the backup heuristics. |
|
132 * uint32_t unsafeCP; - hash table of unsafe code points (uint8_t *). See ucol_unsafeCP function. |
|
133 * uint32_t contrEndCP; - hash table of final code points in contractions (uint8_t *). See ucol_contractionEndCP. |
|
134 * |
|
135 * int32_t contractionUCACombosSize; - number of UChar[contractionUCACombosWidth] in contractionUCACombos |
|
136 * (formatVersion 2.3) |
|
137 * UBool jamoSpecial; - Jamo special indicator (uint8_t). If TRUE, Jamos are special, so we cannot use simple Hangul decomposition. |
|
138 * UBool isBigEndian; - endianness of this collation binary (formatVersion 2.3) |
|
139 * uint8_t charSetFamily; - charset family of this collation binary (formatVersion 2.3) |
|
140 * uint8_t contractionUCACombosWidth; - number of UChars per UCA contraction in contractionUCACombos (formatVersion 2.3) |
|
141 * |
|
142 * Various version fields |
|
143 * UVersionInfo version; - version 4 uint8_t |
|
144 * UVersionInfo UCAVersion; - version 4 uint8_t |
|
145 * UVersionInfo UCDVersion; - version 4 uint8_t |
|
146 * UVersionInfo formatVersion; - version of the format of the collation binary |
|
147 * same formatVersion as in ucadata.icu's UDataInfo header |
|
148 * (formatVersion 2.3) |
|
149 * |
|
150 * uint32_t offset to the reordering code to lead CE byte remapping table |
|
151 * uint32_t offset to the lead CE byte to reordering code mapping table |
|
152 * |
|
153 * uint8_t reserved[76]; - currently unused |
|
154 * |
|
155 * ------------------------------------------------------------- |
|
156 * |
|
157 * Inverse UCA is used for constructing collators from rules. It is always an individual file |
|
158 * and always has a UDataInfo header. |
|
159 * here is the structure: |
|
160 * |
|
161 * uint32_t byteSize; - size of inverse UCA image in bytes |
|
162 * uint32_t tableSize; - length of inverse table (number of uint32_t[3] rows) |
|
163 * uint32_t contsSize; - size of continuation table (number of UChars in table) |
|
164 * |
|
165 * uint32_t table; - offset to inverse table (uint32_t *) |
|
166 * Inverse table consists of rows of 3 uint32_t values. First two values are CE and a possible continuation |
|
167 * the third value is either a code unit (if there is only one code unit for element) or an index to continuation |
|
168 * (number of code units combined with an index). |
|
169 * table. If more than one codepoint have the same CE, continuation table contains code units separated by FFFF and final |
|
170 * code unit sequence for a CE is terminated by FFFE. |
|
171 * uint32_t conts; - offset to continuation table (uint16_t *). Contains code units that transform to a same CE. |
|
172 * |
|
173 * UVersionInfo UCAVersion; - version of the UCA, read from file 4 uint8_t |
|
174 * uint8_t padding[8]; - padding 8 uint8_t |
|
175 * Header is followed by the table and continuation table. |
|
176 */ |
|
177 |
|
178 /* definition of UCOL_HEADER_MAGIC moved to common/ucol_data.h */ |
|
179 |
|
/* UDataInfo for UCA mapping table */
/* dataFormat="UCol" (the four bytes below are the ASCII codes of 'U','C','o','l') */
#define UCA_DATA_FORMAT_0 ((uint8_t)0x55)
#define UCA_DATA_FORMAT_1 ((uint8_t)0x43)
#define UCA_DATA_FORMAT_2 ((uint8_t)0x6f)
#define UCA_DATA_FORMAT_3 ((uint8_t)0x6c)

/* formatVersion bytes of the UCA mapping table: 3.0.0.0 */
#define UCA_FORMAT_VERSION_0 ((uint8_t)3)
/* Cast added so that all four version bytes have the same type. */
#define UCA_FORMAT_VERSION_1 ((uint8_t)0)
#define UCA_FORMAT_VERSION_2 ((uint8_t)0)
#define UCA_FORMAT_VERSION_3 ((uint8_t)0)

/* UDataInfo for inverse UCA table */
/* dataFormat="InvC" (the four bytes below are the ASCII codes of 'I','n','v','C') */
#define INVUCA_DATA_FORMAT_0 ((uint8_t)0x49)
#define INVUCA_DATA_FORMAT_1 ((uint8_t)0x6E)
#define INVUCA_DATA_FORMAT_2 ((uint8_t)0x76)
#define INVUCA_DATA_FORMAT_3 ((uint8_t)0x43)

/* formatVersion bytes of the inverse UCA table: 2.1.0.0 */
#define INVUCA_FORMAT_VERSION_0 ((uint8_t)2)
#define INVUCA_FORMAT_VERSION_1 ((uint8_t)1)
#define INVUCA_FORMAT_VERSION_2 ((uint8_t)0)
#define INVUCA_FORMAT_VERSION_3 ((uint8_t)0)
|
203 |
|
/* This is the size of the stack allocated buffer for sortkey generation and similar operations */
/* if it is too small, heap allocation will occur.*/
/* you can change this value if you need memory - it will affect the performance, though, since we're going to malloc */
#define UCOL_MAX_BUFFER 128

#define UCOL_NORMALIZATION_GROWTH 2
/* Parenthesized so the product survives being embedded in a larger expression
 * (e.g. x / UCOL_NORMALIZATION_MAX_BUFFER). */
#define UCOL_NORMALIZATION_MAX_BUFFER (UCOL_MAX_BUFFER*UCOL_NORMALIZATION_GROWTH)

/* This writable buffer is used if we encounter Thai and need to reorder the string on the fly */
/* Sometimes we already have a writable buffer (like in case of normalized strings). */
/*
you can change this value to any value >= 4 if you need memory -
it will affect the performance, though, since we're going to malloc.
Note 3 is the minimum value for Thai collation and 4 is the
minimum number for special Jamo
*/
#define UCOL_WRITABLE_BUFFER_SIZE 256

/* This is the size of the buffer for expansion CE's */
/* In reality we should not have to deal with expansion sequences longer than 16 */
/* you can change this value if you need memory */
/* WARNING THIS BUFFER DOES HAVE MALLOC FALLBACK. If you make it too small, you'll get into performance trouble */
/* Reasonable small value is around 10, if you don't do Arabic or other funky collations that have long expansion sequence */
/* This is the longest expansion sequence we can handle without bombing out */
#define UCOL_EXPAND_CE_BUFFER_SIZE 64

/* This is the size to increase the buffer for expansion CE's */
#define UCOL_EXPAND_CE_BUFFER_EXTEND_SIZE 64
|
232 |
|
233 |
|
/* Unsafe UChar hash table size. */
/* size is 32 bytes for 1 bit for each latin 1 char + some power of two for */
/* hashing the rest of the chars. Size in bytes */
#define UCOL_UNSAFECP_TABLE_SIZE 1056
/* mask value down to "some power of two"-1 */
/* number of bits, not num of bytes. */
/* (1056 - 32) bytes * 8 bits = 0x2000 bits, hence the 0x1fff mask. */
#define UCOL_UNSAFECP_TABLE_MASK 0x1fff
|
241 |
|
242 |
|
/* flags bits for collIterate.flags                               */
/* Each flag occupies its own bit; together they fill one uint8_t. */

/* UCOL_ITER_NORM - set for incremental normalize of source string */
#define UCOL_ITER_NORM 1

/* NOTE(review): no description in the original; presumably set when the
 * source string has an explicit length (endp is defined) - confirm. */
#define UCOL_ITER_HASLEN 2

/* UCOL_ITER_INNORMBUF - set if the "pos" is in          */
/*                       the writable side buffer, handling */
/*                       incrementally normalized characters. */
#define UCOL_ITER_INNORMBUF 4

/* UCOL_ITER_ALLOCATED - set if this iterator has    */
/*                       malloced storage to expand a buffer. */
#define UCOL_ITER_ALLOCATED 8
/* UCOL_HIRAGANA_Q - note if the codepoint was hiragana */
#define UCOL_HIRAGANA_Q 16
/* UCOL_WAS_HIRAGANA - set to TRUE if there was a Hiragana */
/*                     otherwise set to false               */
#define UCOL_WAS_HIRAGANA 32
/* UCOL_USE_ITERATOR - set this if collIterate uses a          */
/*                     character iterator instead of simply accessing string */
/*                     by index                                 */
#define UCOL_USE_ITERATOR 64

/* NOTE(review): undocumented in the original; meaning to be confirmed. */
#define UCOL_FORCE_HAN_IMPLICIT 128

/* NOTE(review): name suggests the limit of the leading code point block whose
 * characters all have zero combining class under NFC - confirm. */
#define NFC_ZERO_CC_BLOCK_LIMIT_ 0x300
|
271 |
|
#ifdef __cplusplus

U_NAMESPACE_BEGIN

/*
 * Iteration state over a source string while collation elements (CEs) are
 * produced. Only C++ code sees the layout; C code gets an opaque typedef.
 */
typedef struct collIterate : public UMemory {
  const UChar *string; /* Original string */
  /* UChar *start;  Pointer to the start of the source string. Either points to string
                    or to writableBuffer */
  const UChar *endp; /* string end ptr.  Is undefined for null terminated strings */
  const UChar *pos; /* This is position in the string.  Can be to original or writable buf */

  uint32_t *toReturn; /* This is the CE from CEs buffer that should be returned */
  uint32_t *CEpos; /* This is the position to which we have stored processed CEs */

  int32_t *offsetReturn; /* This is the offset to return, if non-NULL */
  int32_t *offsetStore;  /* This is the pointer for storing offsets */
  int32_t offsetRepeatCount;  /* Repeat stored offset if non-zero */
  int32_t offsetRepeatValue;  /* offset value to repeat */

  UnicodeString writableBuffer;
  const UChar *fcdPosition; /* Position in the original string to continue FCD check from. */
  const UCollator *coll;
  const Normalizer2 *nfd;
  uint8_t   flags;      /* combination of the UCOL_ITER_... / UCOL_..._HIRAGANA flag bits */
  uint8_t   origFlags;
  uint32_t *extendCEs; /* This is used if CEs is not big enough */
  int32_t extendCEsSize; /* Holds the size of the dynamic CEs buffer */
  uint32_t CEs[UCOL_EXPAND_CE_BUFFER_SIZE]; /* This is where we store CEs */

  int32_t *offsetBuffer;    /* A dynamic buffer to hold offsets */
  int32_t offsetBufferSize; /* The size of the offset buffer */

  UCharIterator *iterator;
  /*int32_t iteratorIndex;*/

  // The offsetBuffer should probably be a UVector32, but helper functions
  // are an improvement over duplicated code.
  void appendOffset(int32_t offset, UErrorCode &errorCode);
} collIterate;

U_NAMESPACE_END

#else

/* Opaque to C code. */
typedef struct collIterate collIterate;

#endif
|
319 |
|
/*
 * paddedsize(x): x rounded up to the next multiple of 4 (x itself if it is
 * already a multiple of 4). The argument is evaluated more than once, so do
 * not pass expressions with side effects.
 */
#define paddedsize(something) ((something)+((((something)%4)!=0)?(4-(something)%4):0))
/* Combined 4-byte-padded size of UCATableHeader followed by UColOptionSet. */
#define headersize (paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet)))
|
322 |
|
323 /* |
|
324 struct used internally in getSpecial*CE. |
|
325 data similar to collIterate. |
|
326 */ |
|
327 struct collIterateState { |
|
328 const UChar *pos; /* This is position in the string. Can be to original or writable buf */ |
|
329 const UChar *returnPos; |
|
330 const UChar *fcdPosition; /* Position in the original string to continue FCD check from. */ |
|
331 const UChar *bufferaddress; /* address of the normalization buffer */ |
|
332 int32_t buffersize; |
|
333 uint8_t flags; |
|
334 uint8_t origFlags; |
|
335 uint32_t iteratorIndex; |
|
336 int32_t iteratorMove; |
|
337 }; |
|
338 |
|
339 U_CAPI void U_EXPORT2 |
|
340 uprv_init_collIterate(const UCollator *collator, |
|
341 const UChar *sourceString, int32_t sourceLen, |
|
342 U_NAMESPACE_QUALIFIER collIterate *s, UErrorCode *status); |
|
343 |
|
344 /* Internal functions for C test code. */ |
|
345 U_CAPI U_NAMESPACE_QUALIFIER collIterate * U_EXPORT2 |
|
346 uprv_new_collIterate(UErrorCode *status); |
|
347 |
|
348 U_CAPI void U_EXPORT2 |
|
349 uprv_delete_collIterate(U_NAMESPACE_QUALIFIER collIterate *s); |
|
350 |
|
351 /* @return s->pos == s->endp */ |
|
352 U_CAPI UBool U_EXPORT2 |
|
353 uprv_collIterateAtEnd(U_NAMESPACE_QUALIFIER collIterate *s); |
|
354 |
|
355 #ifdef __cplusplus |
|
356 |
|
357 U_NAMESPACE_BEGIN |
|
358 |
|
359 struct UCollationPCE; |
|
360 typedef struct UCollationPCE UCollationPCE; |
|
361 |
|
362 U_NAMESPACE_END |
|
363 |
|
364 struct UCollationElements : public icu::UMemory |
|
365 { |
|
366 /** |
|
367 * Struct wrapper for source data |
|
368 */ |
|
369 icu::collIterate iteratordata_; |
|
370 /** |
|
371 * Indicates if this data has been reset. |
|
372 */ |
|
373 UBool reset_; |
|
374 /** |
|
375 * Indicates if the data should be deleted. |
|
376 */ |
|
377 UBool isWritable; |
|
378 |
|
379 /** |
|
380 * Data for getNextProcessed, getPreviousProcessed. |
|
381 */ |
|
382 icu::UCollationPCE *pce; |
|
383 }; |
|
384 |
|
385 #else |
|
386 /*opaque type*/ |
|
387 struct UCollationElements; |
|
388 #endif |
|
389 |
|
390 U_CAPI void U_EXPORT2 |
|
391 uprv_init_pce(const struct UCollationElements *elems); |
|
392 |
|
#define UCOL_LEVELTERMINATOR 1

/*
 * Layout of a 32-bit collation element as used by the masks below:
 * bits 31..16 primary, bits 15..8 secondary, bits 7..0 tertiary.
 */
/* mask off anything but primary order */
#define UCOL_PRIMARYORDERMASK 0xffff0000
/* mask off anything but secondary order */
#define UCOL_SECONDARYORDERMASK 0x0000ff00
/* mask off anything but tertiary order */
#define UCOL_TERTIARYORDERMASK 0x000000ff
/* primary order shift */
#define UCOL_PRIMARYORDERSHIFT 16
/* secondary order shift */
#define UCOL_SECONDARYORDERSHIFT 8

#define UCOL_BYTE_SIZE_MASK 0xFF

#define UCOL_CASE_BYTE_START 0x80
#define UCOL_CASE_SHIFT_START 7

#define UCOL_IGNORABLE 0

/* get weights from a CE: each macro masks its field and shifts it down to bit 0 */
#define UCOL_PRIMARYORDER(order) (((order) & UCOL_PRIMARYORDERMASK)>> UCOL_PRIMARYORDERSHIFT)
#define UCOL_SECONDARYORDER(order) (((order) & UCOL_SECONDARYORDERMASK)>> UCOL_SECONDARYORDERSHIFT)
#define UCOL_TERTIARYORDER(order) ((order) & UCOL_TERTIARYORDERMASK)
|
417 |
|
/**
 * Determine if a character is a Thai vowel (which sorts after
 * its base consonant).
 * True for ch in 0x0E40..0x0E44 or 0x0EC0..0x0EC4. Each test relies on
 * unsigned wrap-around: values below the range start become huge and fail
 * the <= comparison.
 */
#define UCOL_ISTHAIPREVOWEL(ch) ((((uint32_t)(ch) - 0xe40) <= (0xe44 - 0xe40)) || \
                                 (((uint32_t)(ch) - 0xec0) <= (0xec4 - 0xec0)))

/**
 * Determine if a character is a Thai base consonant.
 * True for ch in 0x0E01..0x0E2E. The whole expansion is parenthesized so
 * that the macro can be negated or combined with other operators safely.
 */
#define UCOL_ISTHAIBASECONSONANT(ch) (((uint32_t)(ch) - 0xe01) <= (0xe2e - 0xe01))

/**
 * Determine if a character is a Jamo in one of the ranges
 * 0x1100..0x1112, 0x1161..0x1175 or 0x11A8..0x11C2.
 */
#define UCOL_ISJAMO(ch) ((((uint32_t)(ch) - 0x1100) <= (0x1112 - 0x1100)) || \
                         (((uint32_t)(ch) - 0x1161) <= (0x1175 - 0x1161)) || \
                         (((uint32_t)(ch) - 0x11A8) <= (0x11C2 - 0x11A8)))
|
433 |
|
/* Han character ranges (inclusive first/last code points) */
#define UCOL_FIRST_HAN 0x4E00
#define UCOL_LAST_HAN  0x9FFF
#define UCOL_FIRST_HAN_A 0x3400
#define UCOL_LAST_HAN_A  0x4DBF
/* Was 0xFAE0, which lies above UCOL_LAST_HAN_COMPAT and made the range empty;
 * 0xFA0E is the first unified ideograph in the CJK Compatibility Ideographs block. */
#define UCOL_FIRST_HAN_COMPAT 0xFA0E
#define UCOL_LAST_HAN_COMPAT  0xFA2F

/* Han extension B is in plane 2 */
#define UCOL_FIRST_HAN_B       0x20000
#define UCOL_LAST_HAN_B        0x2A6DF

/* Hangul range */
#define UCOL_FIRST_HANGUL 0xAC00
#define UCOL_LAST_HANGUL  0xD7AF

/* Jamo ranges */
#define UCOL_FIRST_L_JAMO 0x1100
#define UCOL_FIRST_V_JAMO 0x1161
#define UCOL_FIRST_T_JAMO 0x11A8
#define UCOL_LAST_T_JAMO  0x11F9
|
455 |
|
456 |
|
#if 0
/* Disabled code, kept for reference only. It assigns members (start, isThai,
 * stackWritableBuffer, writableBufSize) that the collIterate struct declared
 * in this header does not have, so it cannot simply be re-enabled. */
/* initializes collIterate structure */
/* made as macro to speed up things */
#define init_collIterate(collator, sourceString, sourceLen, s) { \
    (s)->start = (s)->string = (s)->pos = (UChar *)(sourceString); \
    (s)->endp  = (sourceLen) == -1 ? NULL :(UChar *)(sourceString)+(sourceLen); \
    (s)->CEpos = (s)->toReturn = (s)->CEs; \
    (s)->isThai = TRUE; \
    (s)->writableBuffer = (s)->stackWritableBuffer; \
    (s)->writableBufSize = UCOL_WRITABLE_BUFFER_SIZE; \
    (s)->coll = (collator); \
    (s)->fcdPosition = 0;   \
    (s)->flags = 0; \
    if(((collator)->normalizationMode == UCOL_ON)) (s)->flags |= UCOL_ITER_NORM; \
}
#endif
|
473 |
|
474 |
|
475 |
|
/*
* Macro to get the maximum size of an expansion ending with the argument ce.
* Used in the Boyer Moore algorithm.
* Note for tailoring, the UCA maxexpansion table has been merged.
* Hence we only have to search the tailored collator only.
*
* Binary-searches the range [coll->endExpansionCE, coll->lastEndExpansionCE]
* for order. lastEndExpansionCE is dereferenced, so it must point at the last
* valid element, not one past it. When order is found, result is the entry of
* coll->expansionCESize at the same index. Otherwise result is 2 if the low
* 16 bits of order equal 0x00C0, and 1 in all remaining cases.
*
* Every argument is parenthesized in the expansion and order is evaluated
* exactly once (converted to uint32_t), so expressions may be passed safely.
* The expansion is a brace-enclosed block, as before.
*
* @param coll const UCollator pointer
* @param order last collation element of the expansion sequence
* @param result size of the longest expansion with argument collation element
*        as the last element
*/
#define UCOL_GETMAXEXPANSION(coll, order, result) {                            \
  const uint32_t ucolOrder_ = (uint32_t)(order);                               \
  const uint32_t *start = (coll)->endExpansionCE;                              \
  const uint32_t *limit = (coll)->lastEndExpansionCE;                          \
  const uint32_t *mid;                                                         \
  while (start < limit - 1) {                                                  \
    mid = start + ((limit - start) >> 1);                                      \
    if (ucolOrder_ <= *mid) {                                                  \
      limit = mid;                                                             \
    }                                                                          \
    else {                                                                     \
      start = mid;                                                             \
    }                                                                          \
  }                                                                            \
  if (*start == ucolOrder_) {                                                  \
    (result) = *((coll)->expansionCESize + (start - (coll)->endExpansionCE));  \
  }                                                                            \
  else if (*limit == ucolOrder_) {                                             \
    (result) = *((coll)->expansionCESize + (limit - (coll)->endExpansionCE));  \
  }                                                                            \
  else if ((ucolOrder_ & 0xFFFF) == 0x00C0) {                                  \
    (result) = 2;                                                              \
  }                                                                            \
  else {                                                                       \
    (result) = 1;                                                              \
  }                                                                            \
}
|
514 |
|
515 U_CFUNC |
|
516 uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, |
|
517 U_NAMESPACE_QUALIFIER collIterate *source, UErrorCode *status); |
|
518 |
|
519 U_CFUNC |
|
520 uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE, |
|
521 U_NAMESPACE_QUALIFIER collIterate *source, UErrorCode *status); |
|
522 U_CAPI uint32_t U_EXPORT2 ucol_getNextCE(const UCollator *coll, |
|
523 U_NAMESPACE_QUALIFIER collIterate *collationSource, UErrorCode *status); |
|
524 U_CFUNC uint32_t U_EXPORT2 ucol_getPrevCE(const UCollator *coll, |
|
525 U_NAMESPACE_QUALIFIER collIterate *collationSource, |
|
526 UErrorCode *status); |
|
527 /* get some memory */ |
|
528 void *ucol_getABuffer(const UCollator *coll, uint32_t size); |
|
529 |
|
530 #ifdef __cplusplus |
|
531 |
|
532 U_NAMESPACE_BEGIN |
|
533 |
|
534 class CollationKey; |
|
535 class SortKeyByteSink; |
|
536 |
|
537 U_NAMESPACE_END |
|
538 |
|
539 /* function used by C++ getCollationKey to prevent restarting the calculation */ |
|
540 U_CFUNC int32_t |
|
541 ucol_getCollationKey(const UCollator *coll, |
|
542 const UChar *source, int32_t sourceLength, |
|
543 icu::CollationKey &key, |
|
544 UErrorCode &errorCode); |
|
545 |
|
546 typedef void U_CALLCONV |
|
547 SortKeyGenerator(const UCollator *coll, |
|
548 const UChar *source, |
|
549 int32_t sourceLength, |
|
550 icu::SortKeyByteSink &result, |
|
551 UErrorCode *status); |
|
552 |
|
553 /* worker function for generating sortkeys */ |
|
554 U_CFUNC |
|
555 void U_CALLCONV |
|
556 ucol_calcSortKey(const UCollator *coll, |
|
557 const UChar *source, |
|
558 int32_t sourceLength, |
|
559 icu::SortKeyByteSink &result, |
|
560 UErrorCode *status); |
|
561 |
|
562 U_CFUNC |
|
563 void U_CALLCONV |
|
564 ucol_calcSortKeySimpleTertiary(const UCollator *coll, |
|
565 const UChar *source, |
|
566 int32_t sourceLength, |
|
567 icu::SortKeyByteSink &result, |
|
568 UErrorCode *status); |
|
569 |
|
570 #else |
|
571 |
|
572 typedef void U_CALLCONV |
|
573 SortKeyGenerator(const UCollator *coll, |
|
574 const UChar *source, |
|
575 int32_t sourceLength, |
|
576 void *result, |
|
577 UErrorCode *status); |
|
578 |
|
579 #endif |
|
580 |
|
581 /** |
|
582 * Used to set requested and valid locales on a collator returned by the collator |
|
583 * service. |
|
584 */ |
|
585 U_CFUNC void U_EXPORT2 |
|
586 ucol_setReqValidLocales(UCollator *coll, char *requestedLocaleToAdopt, char *validLocaleToAdopt, char *actualLocaleToAdopt); |
|
587 |
|
/*
 * Special CEs have the top nibble set to 0xF (UCOL_SPECIAL_FLAG); the next
 * nibble (UCOL_TAG_MASK, shifted by UCOL_TAG_SHIFT) carries the tag.
 */
#define UCOL_SPECIAL_FLAG 0xF0000000
#define UCOL_TAG_SHIFT 24
#define UCOL_TAG_MASK 0x0F000000
#define INIT_EXP_TABLE_SIZE 1024
/* Special flag combined with tag 0, 1, 2 and 3 respectively. */
#define UCOL_NOT_FOUND 0xF0000000
#define UCOL_EXPANSION 0xF1000000
#define UCOL_CONTRACTION 0xF2000000
#define UCOL_THAI 0xF3000000
#define UCOL_UNMARKED 0x03
#define UCOL_NEW_TERTIARYORDERMASK 0x0000003f

/* Bit mask for primary collation strength. */
#define UCOL_PRIMARYMASK    0xFFFF0000

/* Bit mask for secondary collation strength. */
#define UCOL_SECONDARYMASK  0x0000FF00

/* Bit mask for tertiary collation strength. */
#define UCOL_TERTIARYMASK   0x000000FF
|
607 |
|
/**
 * Internal.
 * This indicates the last element in a UCollationElements has been consumed.
 * Compare with the UCOL_NULLORDER, UCOL_NULLORDER is returned if error occurs.
 * UCOL_NO_MORE_CES is the bitwise OR of the three per-level values below.
 */
#define UCOL_NO_MORE_CES            0x00010101
#define UCOL_NO_MORE_CES_PRIMARY    0x00010000
#define UCOL_NO_MORE_CES_SECONDARY  0x00000100
#define UCOL_NO_MORE_CES_TERTIARY   0x00000001
|
617 |
|
618 #define isSpecial(CE) ((((CE)&UCOL_SPECIAL_FLAG)>>28)==0xF) |
|
619 |
|
620 #define UCOL_UPPER_CASE 0x80 |
|
621 #define UCOL_MIXED_CASE 0x40 |
|
622 #define UCOL_LOWER_CASE 0x00 |
|
623 |
|
624 #define UCOL_CONTINUATION_MARKER 0xC0 |
|
625 #define UCOL_REMOVE_CONTINUATION 0xFFFFFF3F |
|
626 |
|
627 #define isContinuation(CE) (((CE) & UCOL_CONTINUATION_MARKER) == UCOL_CONTINUATION_MARKER) |
|
628 #define isFlagged(CE) (((CE) & 0x80) == 0x80) |
|
629 #define isLongPrimary(CE) (((CE) & 0xC0) == 0xC0) |
|
630 |
|
631 #define getCETag(CE) (((CE)&UCOL_TAG_MASK)>>UCOL_TAG_SHIFT) |
|
632 #define isContraction(CE) (isSpecial((CE)) && (getCETag((CE)) == CONTRACTION_TAG)) |
|
633 #define isPrefix(CE) (isSpecial((CE)) && (getCETag((CE)) == SPEC_PROC_TAG)) |
|
634 #define constructContractCE(tag, CE) (UCOL_SPECIAL_FLAG | ((tag)<<UCOL_TAG_SHIFT) | ((CE)&0xFFFFFF)) |
|
635 #define constructSpecProcCE(CE) (UCOL_SPECIAL_FLAG | (SPEC_PROC_TAG<<UCOL_TAG_SHIFT) | ((CE)&0xFFFFFF)) |
|
636 #define getContractOffset(CE) ((CE)&0xFFFFFF) |
|
637 #define getExpansionOffset(CE) (((CE)&0x00FFFFF0)>>4) |
|
638 #define getExpansionCount(CE) ((CE)&0xF) |
|
639 #define isCEIgnorable(CE) (((CE) & 0xFFFFFFBF) == 0) |
|
640 |
|
/* StringSearch internal use */
/* All of these take a pointer to an object with an iteratordata_ member
 * (see struct UCollationElements) and inspect its CE buffer pointers. */
#define inNormBuf(coleiter) ((coleiter)->iteratordata_.flags & UCOL_ITER_INNORMBUF)
#define isFCDPointerNull(coleiter) ((coleiter)->iteratordata_.fcdPosition == NULL)
/* True if CEpos has moved away from the start of the CEs buffer. */
#define hasExpansion(coleiter) ((coleiter)->iteratordata_.CEpos != (coleiter)->iteratordata_.CEs)
/* Number of CEs between the start of the buffer and toReturn. */
#define getExpansionPrefix(coleiter) ((coleiter)->iteratordata_.toReturn - (coleiter)->iteratordata_.CEs)
/* NOTE(review): despite its name this only computes CEs + offset and assigns
 * nothing; left as is because changing it would alter behavior for callers. */
#define setExpansionPrefix(coleiter, offset) ((coleiter)->iteratordata_.CEs + (offset))
/* Number of CEs between toReturn and CEpos. */
#define getExpansionSuffix(coleiter) ((coleiter)->iteratordata_.CEpos - (coleiter)->iteratordata_.toReturn)
/* Moves toReturn to offset CEs before CEpos. Previously the expansion ignored
 * the offset parameter and referred to an undeclared name (leftoverces). */
#define setExpansionSuffix(coleiter, offset) ((coleiter)->iteratordata_.toReturn = (coleiter)->iteratordata_.CEpos - (offset))
|
649 |
|
/* This is an enum that lists magic special byte values from the fractional UCA.
 * See also http://site.icu-project.org/design/collation/bytes */
/* TODO: all the #defines that refer to special byte values from the UCA should be changed to point here */

enum {
    UCOL_BYTE_ZERO = 0x00,
    UCOL_BYTE_LEVEL_SEPARATOR = 0x01,
    UCOL_BYTE_SORTKEY_GLUE = 0x02,
    UCOL_BYTE_SHIFT_PREFIX = 0x03,
    UCOL_BYTE_UNSHIFTED_MIN = UCOL_BYTE_SHIFT_PREFIX,  /* alias of the shift prefix byte */
    UCOL_BYTE_FIRST_TAILORED = 0x04,
    UCOL_BYTE_COMMON = 0x05,
    UCOL_BYTE_FIRST_UCA = UCOL_BYTE_COMMON,            /* alias of the common byte */
    /* TODO: Make the following values dynamic since they change with almost every UCA version. */
    UCOL_CODAN_PLACEHOLDER = 0x12,
    UCOL_BYTE_FIRST_NON_LATIN_PRIMARY = 0x5B,
    UCOL_BYTE_UNSHIFTED_MAX = 0xFF
};
|
668 |
|
#if 0
/* Disabled: hard-coded UCA boundary CEs, kept for reference only.
 * None of these names is defined while the #if 0 is in place. */
#define UCOL_RESET_TOP_VALUE                0x9F000303
#define UCOL_FIRST_PRIMARY_IGNORABLE        0x00008705
#define UCOL_LAST_PRIMARY_IGNORABLE         0x0000DD05
#define UCOL_LAST_PRIMARY_IGNORABLE_CONT    0x000051C0
#define UCOL_FIRST_SECONDARY_IGNORABLE      0x00000000
#define UCOL_LAST_SECONDARY_IGNORABLE       0x00000500
#define UCOL_FIRST_TERTIARY_IGNORABLE       0x00000000
#define UCOL_LAST_TERTIARY_IGNORABLE        0x00000000
#define UCOL_FIRST_VARIABLE                 0x05070505
#define UCOL_LAST_VARIABLE                  0x179B0505
#define UCOL_FIRST_NON_VARIABLE             0x1A200505
#define UCOL_LAST_NON_VARIABLE              0x7B41058F

#define UCOL_NEXT_TOP_VALUE                 0xE8960303
#define UCOL_NEXT_FIRST_PRIMARY_IGNORABLE   0x00008905
#define UCOL_NEXT_LAST_PRIMARY_IGNORABLE    0x03000303
#define UCOL_NEXT_FIRST_SECONDARY_IGNORABLE 0x00008705
#define UCOL_NEXT_LAST_SECONDARY_IGNORABLE  0x00000500
#define UCOL_NEXT_FIRST_TERTIARY_IGNORABLE  0x00000000
#define UCOL_NEXT_LAST_TERTIARY_IGNORABLE   0x00000000
#define UCOL_NEXT_FIRST_VARIABLE            0x05090505
#define UCOL_NEXT_LAST_VARIABLE             0x1A200505

#define PRIMARY_IMPLICIT_MIN 0xE8000000
#define PRIMARY_IMPLICIT_MAX 0xF0000000
#endif
|
696 |
|
/* These constants can be changed - sortkey size is affected by them */
#define UCOL_PROPORTION2 0.5
#define UCOL_PROPORTION3 0.667

/* These values come from the UCA */
#define UCOL_COMMON_BOT2 UCOL_BYTE_COMMON
#define UCOL_COMMON_TOP2 0x86u
/* Number of byte values strictly between the secondary common bottom and top. */
#define UCOL_TOTAL2 (UCOL_COMMON_TOP2-UCOL_COMMON_BOT2-1)

#define UCOL_FLAG_BIT_MASK_CASE_SW_OFF 0x80
#define UCOL_FLAG_BIT_MASK_CASE_SW_ON 0x40
#define UCOL_COMMON_TOP3_CASE_SW_OFF 0x85
#define UCOL_COMMON_TOP3_CASE_SW_LOWER 0x45
#define UCOL_COMMON_TOP3_CASE_SW_UPPER 0xC5

/* These values come from the UCA */
#define UCOL_COMMON_BOT3 0x05

/* The trailing ';' these two definitions used to carry has been removed so
 * that they can be used inside expressions like every other constant here. */
#define UCOL_COMMON_BOTTOM3_CASE_SW_UPPER 0x86
#define UCOL_COMMON_BOTTOM3_CASE_SW_LOWER UCOL_COMMON_BOT3

/* Split of UCOL_TOTAL2 according to UCOL_PROPORTION2 (floating-point values). */
#define UCOL_TOP_COUNT2  (UCOL_PROPORTION2*UCOL_TOTAL2)
#define UCOL_BOT_COUNT2  (UCOL_TOTAL2-UCOL_TOP_COUNT2)


#define UCOL_COMMON2 UCOL_COMMON_BOT2
#define UCOL_COMMON3_UPPERFIRST 0xC5
#define UCOL_COMMON3_NORMAL UCOL_COMMON_BOT3

#define UCOL_COMMON4 0xFF
|
727 |
|
/* constants for case level/case first handling */
/* used to instantiate UCollators fields in ucol_updateInternalState */
#define UCOL_CASE_SWITCH      0xC0
#define UCOL_NO_CASE_SWITCH   0x00

/* UCOL_REMOVE_CASE clears the two case bits (0xC0); UCOL_KEEP_CASE keeps all bits. */
#define UCOL_REMOVE_CASE      0x3F
#define UCOL_KEEP_CASE        0xFF

#define UCOL_CASE_BIT_MASK    0xC0

#define UCOL_TERT_CASE_MASK   0xFF

/* Last Latin-1 code point, and a table length derived from it (0xFF + 50). */
#define UCOL_ENDOFLATINONERANGE 0xFF
#define UCOL_LATINONETABLELEN   (UCOL_ENDOFLATINONERANGE+50)
#define UCOL_BAIL_OUT_CE      0xFF000000
|
743 |
|
744 |
|
/*
 * Tags for collation elements that need special handling; see the per-tag
 * comments. Values are explicit and contiguous from 0 to 13, so the final
 * enumerator CE_TAGS_COUNT evaluates to 14.
 */
745 typedef enum { |
|
746 NOT_FOUND_TAG = 0, |
|
747 EXPANSION_TAG = 1, /* This code point results in an expansion */ |
|
748 CONTRACTION_TAG = 2, /* Start of a contraction */ |
|
749 THAI_TAG = 3, /* Thai character - do the reordering */ |
|
750 CHARSET_TAG = 4, /* Charset processing, not yet implemented */ |
|
751 SURROGATE_TAG = 5, /* Lead surrogate that is tailored and doesn't start a contraction */ |
|
752 HANGUL_SYLLABLE_TAG = 6, /* AC00-D7AF */ |
|
753 LEAD_SURROGATE_TAG = 7, /* D800-DBFF */ |
|
754 TRAIL_SURROGATE_TAG = 8, /* DC00-DFFF */ |
|
755 CJK_IMPLICIT_TAG = 9, /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D */ |
|
756 IMPLICIT_TAG = 10, |
|
757 SPEC_PROC_TAG = 11, |
|
758 /* ICU 2.1 */ |
|
759 LONG_PRIMARY_TAG = 12, /* This is a three byte primary with starting secondaries and tertiaries */ |
|
760 /* It fits in a single 32 bit CE and is used instead of expansion to save */ |
|
761 /* space without affecting the performance (hopefully) */ |
|
762 |
|
763 DIGIT_TAG = 13, /* COllate Digits As Numbers (CODAN) implementation */ |
|
764 |
|
/* Not a tag: number of tags defined above (14). */
765 CE_TAGS_COUNT |
|
766 } UColCETags; |
|
767 |
|
768 /* |
|
769 ***************************************************************************************** |
|
770 * set to zero |
|
771 * NON_CHARACTER FDD0 - FDEF, FFFE, FFFF, 1FFFE, 1FFFF, 2FFFE, 2FFFF,...e.g. **FFFE, **FFFF |
|
772 ****************************************************************************************** |
|
773 */ |
|
774 |
|
/*
 * Set of collation attribute values. The attributes are stored as int32_t
 * rather than as the UColAttributeValue enum type (kept as a comment next to
 * each field), which gives every field a fixed 32-bit width. The struct is
 * 1 + 8 + 15 = 24 32-bit words. Do not reorder or resize fields:
 * NOTE(review): presumably this layout is read directly from binary
 * collation data - confirm against ucol_setOptionsFromHeader.
 */
775 typedef struct { |
|
776 uint32_t variableTopValue; |
|
777 /*UColAttributeValue*/ int32_t frenchCollation; |
|
778 /*UColAttributeValue*/ int32_t alternateHandling; /* attribute for handling variable elements */ |
|
779 /*UColAttributeValue*/ int32_t caseFirst; /* who goes first, lower case or uppercase */ |
|
780 /*UColAttributeValue*/ int32_t caseLevel; /* do we have an extra case level */ |
|
781 /*UColAttributeValue*/ int32_t normalizationMode; /* attribute for normalization */ |
|
782 /*UColAttributeValue*/ int32_t strength; /* attribute for strength */ |
|
783 /*UColAttributeValue*/ int32_t hiraganaQ; /* attribute for special Hiragana */ |
|
784 /*UColAttributeValue*/ int32_t numericCollation; /* attribute for numeric collation */ |
|
785 uint32_t reserved[15]; /* for future use */ |
|
786 } UColOptionSet; |
|
787 |
|
/*
 * Boundary constants for the UCA table. The first group stores each boundary
 * as a pair of 32-bit words; the UCA_PRIMARY_* fields at the end are single
 * 32-bit values. The UCA_NEXT_* fields are compiled out with #if 0.
 *
 * NOTE(review): the hex values in the trailing comments are annotations, not
 * initializers. Several are repeated verbatim (e.g. the TRAILING and SPECIAL
 * MIN/MAX comments carry the same values as IMPLICIT MIN/MAX), so do not
 * rely on them - check the actual data.
 * NOTE(review): each [2] pair presumably holds a CE and its continuation -
 * confirm against the code that reads these fields.
 */
788 typedef struct { |
|
789 uint32_t UCA_FIRST_TERTIARY_IGNORABLE[2]; /*0x00000000*/ |
|
790 uint32_t UCA_LAST_TERTIARY_IGNORABLE[2]; /*0x00000000*/ |
|
791 uint32_t UCA_FIRST_PRIMARY_IGNORABLE[2]; /*0x00008705*/ |
|
792 uint32_t UCA_FIRST_SECONDARY_IGNORABLE[2]; /*0x00000000*/ |
|
793 uint32_t UCA_LAST_SECONDARY_IGNORABLE[2]; /*0x00000500*/ |
|
794 uint32_t UCA_LAST_PRIMARY_IGNORABLE[2]; /*0x0000DD05*/ |
|
795 uint32_t UCA_FIRST_VARIABLE[2]; /*0x05070505*/ |
|
796 uint32_t UCA_LAST_VARIABLE[2]; /*0x13CF0505*/ |
|
797 uint32_t UCA_FIRST_NON_VARIABLE[2]; /*0x16200505*/ |
|
798 uint32_t UCA_LAST_NON_VARIABLE[2]; /*0x767C0505*/ |
|
799 uint32_t UCA_RESET_TOP_VALUE[2]; /*0x9F000303*/ |
|
800 uint32_t UCA_FIRST_IMPLICIT[2]; |
|
801 uint32_t UCA_LAST_IMPLICIT[2]; |
|
802 uint32_t UCA_FIRST_TRAILING[2]; |
|
803 uint32_t UCA_LAST_TRAILING[2]; |
|
804 |
|
805 #if 0 |
|
806 uint32_t UCA_NEXT_TOP_VALUE[2]; /*0xE8960303*/ |
|
807 uint32_t UCA_NEXT_FIRST_PRIMARY_IGNORABLE; /*0x00008905*/ |
|
808 uint32_t UCA_NEXT_LAST_PRIMARY_IGNORABLE; /*0x03000303*/ |
|
809 uint32_t UCA_NEXT_FIRST_SECONDARY_IGNORABLE; /*0x00008705*/ |
|
810 uint32_t UCA_NEXT_LAST_SECONDARY_IGNORABLE; /*0x00000500*/ |
|
811 uint32_t UCA_NEXT_FIRST_TERTIARY_IGNORABLE; /*0x00000000*/ |
|
812 uint32_t UCA_NEXT_LAST_TERTIARY_IGNORABLE; /*0x00000000*/ |
|
813 uint32_t UCA_NEXT_FIRST_VARIABLE; /*0x05090505*/ |
|
814 uint32_t UCA_NEXT_LAST_VARIABLE; /*0x16200505*/ |
|
815 #endif |
|
816 |
|
817 uint32_t UCA_PRIMARY_TOP_MIN; |
|
818 uint32_t UCA_PRIMARY_IMPLICIT_MIN; /*0xE8000000*/ |
|
819 uint32_t UCA_PRIMARY_IMPLICIT_MAX; /*0xF0000000*/ |
|
820 uint32_t UCA_PRIMARY_TRAILING_MIN; /*0xE8000000*/ |
|
821 uint32_t UCA_PRIMARY_TRAILING_MAX; /*0xF0000000*/ |
|
822 uint32_t UCA_PRIMARY_SPECIAL_MIN; /*0xE8000000*/ |
|
823 uint32_t UCA_PRIMARY_SPECIAL_MAX; /*0xF0000000*/ |
|
824 } UCAConstants; |
|
825 |
|
826 /* definition of UCATableHeader moved to common/ucol_data.h */ |
|
827 |
|
/*
 * State kinds: 0 = unknown, 1 = collator; U_STATE_LIMIT (2) is one past the
 * last defined kind.
 * NOTE(review): presumably these are the values stored in UStateStruct.type -
 * confirm against the code that writes that field.
 */
828 #define U_UNKNOWN_STATE 0 |
|
829 #define U_COLLATOR_STATE 0x01 |
|
830 #define U_STATE_LIMIT 0x02 |
|
831 |
|
832 /* This is the first structure in a state */ |
|
833 /* it should be machine independent */ |
|
/*
 * Every member is a uint8_t or an array of uint8_t, so the layout has no
 * multi-byte fields whose byte order could differ between platforms. The
 * members add up to 2 + 1 + 1 + 4 + 1 + 7 = 16 bytes.
 */
834 typedef struct { |
|
835 /* this structure is supposed to be readable on all the platforms. */ |
|
836 /* first 2 fields hold the size of the structure in a platform independent way */ |
|
837 uint8_t sizeLo; |
|
838 uint8_t sizeHi; |
|
839 /* identifying the writing platform */ |
|
840 uint8_t isBigEndian; |
|
841 /* see U_CHARSET_FAMILY values in utypes.h */ |
|
842 uint8_t charsetFamily; |
|
843 /* version of ICU this state structure comes from */ |
|
844 uint8_t icuVersion[4]; |
|
845 /* What is the data following this state */ |
|
846 uint8_t type; |
|
847 /* more stuff to come, keep it on 16 byte boundary */ |
|
848 uint8_t reserved[7]; |
|
849 } UStateStruct; |
|
850 |
|
851 /* This structure follows UStateStruct */ |
|
852 /* and contains data specific for the collators */ |
|
853 /* Endianness needs to be decided before accessing this structure */ |
|
854 /* However, its size IS endianness independent */ |
|
/*
 * The members add up to 8 + 32 + 32 + 7*4 + 12 = 112 bytes (7 * 16), and the
 * first uint32_t member starts at offset 72, a multiple of 4.
 */
855 typedef struct { |
|
856 /* size of this structure */ |
|
857 uint8_t sizeLo; |
|
858 uint8_t sizeHi; |
|
859 /* This state is followed by the frozen tailoring */ |
|
860 uint8_t containsTailoring; |
|
861 /* This state is followed by the frozen UCA */ |
|
862 uint8_t containsUCA; |
|
863 /* Version info - the same one */ |
|
864 uint8_t versionInfo[4]; |
|
865 |
|
866 /* for charset CEs */ |
|
867 uint8_t charsetName[32]; |
|
868 /* this is the resolved locale name */ |
|
869 uint8_t locale[32]; |
|
870 |
|
871 /* Attributes. Open ended */ |
|
872 /* all the following will be moved to uint32_t because of portability */ |
|
873 /* variable top value */ |
|
874 uint32_t variableTopValue; |
|
875 /* attribute for handling variable elements */ |
|
876 uint32_t /*UColAttributeValue*/ alternateHandling; |
|
877 /* how to handle secondary weights */ |
|
878 uint32_t /*UColAttributeValue*/ frenchCollation; |
|
879 /* who goes first, lower case or uppercase */ |
|
880 uint32_t /*UColAttributeValue*/ caseFirst; |
|
881 /* do we have an extra case level */ |
|
882 uint32_t /*UColAttributeValue*/ caseLevel; |
|
883 /* attribute for normalization */ |
|
884 uint32_t /*UColAttributeValue*/ normalizationMode; |
|
885 /* attribute for strength */ |
|
886 uint32_t /*UColAttributeValue*/ strength; |
|
887 /* padding so that the structure size is a multiple of 16 bytes */ |
|
888 uint8_t reserved[12]; |
|
889 } UColStateStruct; |
|
890 |
|
/*
 * Bit layout of a packed 32-bit value: the top 12 bits (SIZEMASK) and the
 * low 20 bits (OFFSETMASK) are complementary, and SHIFTVALUE (20) is the
 * width of the offset part, i.e. the shift that brings the size part down
 * to bit 0.
 */
891 #define UCOL_INV_SIZEMASK 0xFFF00000 |
|
892 #define UCOL_INV_OFFSETMASK 0x000FFFFF |
|
893 #define UCOL_INV_SHIFTVALUE 20 |
|
894 |
|
895 U_CDECL_BEGIN |
|
896 |
|
897 /* definition of InverseUCATableHeader moved to common/ucol_data.h */ |
|
898 |
|
/*
 * Function type (U_CALLCONV calling convention) that takes a UCollator* and
 * returns nothing. No use of this type is visible in this part of the file.
 */
899 typedef void U_CALLCONV |
|
900 ResourceCleaner(UCollator *coll); |
|
901 |
|
902 |
|
/*
 * The collator object. Members, in order:
 *  - pointers to options, rules, the UCA collator and the binary image,
 *    plus pointers to the mapping/expansion/contraction tables;
 *  - the current attribute values, each paired with a `...isDefault` flag;
 *  - free*OnClose flags (NOTE(review): presumably record which members this
 *    object owns and must release when closed - confirm in ucol_close);
 *  - latinOne* table state;
 *  - tertiary-weight / case-switch parameters;
 *  - data version, reorder codes and the lead-byte permutation table;
 *  - an optional C++ delegate object.
 */
903 struct UCollator { |
|
904 UColOptionSet *options; |
|
905 SortKeyGenerator *sortKeyGen; |
|
906 uint32_t *latinOneCEs; |
|
907 char* actualLocale; |
|
908 char* validLocale; |
|
909 char* requestedLocale; |
|
910 const UChar *rules; |
|
911 const UChar *ucaRules; |
|
912 const UCollator *UCA; |
|
913 const UCATableHeader *image; |
|
914 UTrie mapping; |
|
915 const uint32_t *latinOneMapping; |
|
916 const uint32_t *expansion; |
|
917 const UChar *contractionIndex; |
|
918 const uint32_t *contractionCEs; |
|
919 |
|
920 const uint32_t *endExpansionCE; /* array of last ces in an expansion ce. |
|
921 corresponds to expansionCESize */ |
|
922 const uint32_t *lastEndExpansionCE;/* pointer to the last element in endExpansionCE */ |
|
923 const uint8_t *expansionCESize; /* array of the maximum size of a |
|
924 expansion ce with the last ce |
|
925 corresponding to endExpansionCE, |
|
926 terminated with a null */ |
|
927 const uint8_t *unsafeCP; /* unsafe code points hashtable */ |
|
928 const uint8_t *contrEndCP; /* Contraction ending chars hash table */ |
|
929 UChar minUnsafeCP; /* Smallest unsafe Code Point. */ |
|
930 UChar minContrEndCP; /* Smallest code point at end of a contraction */ |
|
931 |
|
932 int32_t rulesLength; |
|
933 int32_t latinOneTableLen; |
|
934 |
|
935 uint32_t variableTopValue; |
|
936 UColAttributeValue frenchCollation; |
|
937 UColAttributeValue alternateHandling; /* attribute for handling variable elements*/ |
|
938 UColAttributeValue caseFirst; /* who goes first, lower case or uppercase */ |
|
939 UColAttributeValue caseLevel; /* do we have an extra case level */ |
|
940 UColAttributeValue normalizationMode; /* attribute for normalization */ |
|
941 UColAttributeValue strength; /* attribute for strength */ |
|
942 UColAttributeValue hiraganaQ; /* attribute for Hiragana */ |
|
943 UColAttributeValue numericCollation; |
|
944 UBool variableTopValueisDefault; |
|
945 UBool frenchCollationisDefault; |
|
946 UBool alternateHandlingisDefault; /* attribute for handling variable elements*/ |
|
947 UBool caseFirstisDefault; /* who goes first, lower case or uppercase */ |
|
948 UBool caseLevelisDefault; /* do we have an extra case level */ |
|
949 UBool normalizationModeisDefault; /* attribute for normalization */ |
|
950 UBool strengthisDefault; /* attribute for strength */ |
|
951 UBool hiraganaQisDefault; /* attribute for Hiragana */ |
|
952 UBool numericCollationisDefault; |
|
953 UBool hasRealData; /* some collators have only options, like French, no rules */ |
|
954 /* to speed up things, we use the UCA image, but we don't want it */ |
|
955 /* to run around */ |
|
956 |
|
957 UBool freeOnClose; |
|
958 UBool freeOptionsOnClose; |
|
959 UBool freeRulesOnClose; |
|
960 UBool freeImageOnClose; |
|
961 UBool freeDefaultReorderCodesOnClose; |
|
962 UBool freeReorderCodesOnClose; |
|
963 UBool freeLeadBytePermutationTableOnClose; |
|
964 |
|
965 UBool latinOneUse; |
|
966 UBool latinOneRegenTable; |
|
967 UBool latinOneFailed; |
|
968 |
|
969 int8_t tertiaryAddition; /* when switching case, we need to add or subtract different values */ |
|
970 uint8_t caseSwitch; |
|
971 uint8_t tertiaryCommon; |
|
972 uint8_t tertiaryMask; |
|
973 uint8_t tertiaryTop; /* Upper range when compressing */ |
|
974 uint8_t tertiaryBottom; /* Lower range when compressing (the earlier "Upper" wording was a copy of the tertiaryTop comment) */ |
|
975 uint8_t tertiaryTopCount; |
|
976 uint8_t tertiaryBottomCount; |
|
977 |
|
978 UVersionInfo dataVersion; /* Data info of UCA table */ |
|
979 int32_t* defaultReorderCodes; |
|
980 int32_t defaultReorderCodesLength; |
|
981 int32_t* reorderCodes; |
|
982 int32_t reorderCodesLength; |
|
983 uint8_t* leadBytePermutationTable; |
|
984 void *delegate; /* if non-null: C++ object to delegate all API calls to. */ |
|
985 }; |
|
986 |
|
987 U_CDECL_END |
|
988 |
|
989 /* various internal functions */ |
|
/*
 * Collator construction and option handling. Every function here takes a
 * UErrorCode* status parameter; the implementations are not in this header.
 */
990 |
|
991 /* do not close UCA returned by ucol_initUCA! */ |
|
992 U_CFUNC |
|
993 UCollator* ucol_initUCA(UErrorCode *status); |
|
994 |
|
/* NOTE(review): `fillIn` looks like an optional caller-supplied object to initialize - confirm NULL handling in the implementation. */
995 U_CFUNC |
|
996 UCollator* ucol_initCollator(const UCATableHeader *image, UCollator *fillIn, const UCollator *UCA, UErrorCode *status); |
|
997 |
|
998 U_CFUNC |
|
999 void ucol_setOptionsFromHeader(UCollator* result, UColOptionSet * opts, UErrorCode *status); |
|
1000 |
|
1001 U_CFUNC |
|
1002 UCollator* ucol_open_internal(const char* loc, UErrorCode* status); |
|
1003 |
|
/* Compiled out: the counterpart of ucol_setOptionsFromHeader is not declared. */
1004 #if 0 |
|
1005 U_CFUNC |
|
1006 void ucol_putOptionsToHeader(UCollator* result, UColOptionSet * opts, UErrorCode *status); |
|
1007 #endif |
|
1008 |
|
1009 U_CFUNC |
|
1010 void ucol_updateInternalState(UCollator *coll, UErrorCode *status); |
|
1011 |
|
/* Per-code-unit queries against a collator. Note the different linkage macros: U_CFUNC here, U_CAPI below. */
1012 U_CFUNC uint32_t U_EXPORT2 ucol_getFirstCE(const UCollator *coll, UChar u, UErrorCode *status); |
|
1013 U_CAPI UBool U_EXPORT2 ucol_isTailored(const UCollator *coll, const UChar u, UErrorCode *status); |
|
1014 |
|
1015 U_CAPI const InverseUCATableHeader* U_EXPORT2 ucol_initInverseUCA(UErrorCode *status); |
|
1016 |
|
/*
 * Implicit-weight helpers. Going by the names, these convert between code
 * points, "raw" values and implicit weights in both directions.
 * NOTE(review): exact value ranges and error behavior are not visible here -
 * confirm in the implementation before relying on them.
 */
1017 U_CAPI void U_EXPORT2 |
|
1018 uprv_uca_initImplicitConstants(UErrorCode *status); |
|
1019 |
|
1020 U_CAPI uint32_t U_EXPORT2 |
|
1021 uprv_uca_getImplicitFromRaw(UChar32 cp); |
|
1022 |
|
1023 /*U_CFUNC uint32_t U_EXPORT2 |
|
1024 uprv_uca_getImplicitPrimary(UChar32 cp);*/ |
|
1025 |
|
1026 U_CAPI UChar32 U_EXPORT2 |
|
1027 uprv_uca_getRawFromImplicit(uint32_t implicit); |
|
1028 |
|
1029 U_CAPI UChar32 U_EXPORT2 |
|
1030 uprv_uca_getRawFromCodePoint(UChar32 i); |
|
1031 |
|
1032 U_CAPI UChar32 U_EXPORT2 |
|
1033 uprv_uca_getCodePointFromRaw(UChar32 i); |
|
1034 |
|
/*
 * Callback type: given an opaque context, a locale and a type string, it
 * returns a UChar pointer and has a length out-parameter (pLength) plus a
 * UErrorCode* status.
 * NOTE(review): ownership of the returned buffer is not visible here -
 * confirm who frees it.
 */
1035 typedef const UChar* GetCollationRulesFunction(void* context, const char* locale, const char* type, int32_t* pLength, UErrorCode* status); |
|
1036 |
|
/*
 * Opens a collator from rules, taking an import callback (importFunc) and
 * the context pointer to go with it.
 * NOTE(review): presumably `context` is passed through to importFunc and the
 * callback resolves rule imports - confirm in the implementation.
 */
1037 U_CAPI UCollator* U_EXPORT2 |
|
1038 ucol_openRulesForImport( const UChar *rules, |
|
1039 int32_t rulesLength, |
|
1040 UColAttributeValue normalizationMode, |
|
1041 UCollationStrength strength, |
|
1042 UParseError *parseError, |
|
1043 GetCollationRulesFunction importFunc, |
|
1044 void* context, |
|
1045 UErrorCode *status); |
|
1046 |
|
1047 |
|
/*
 * Reordering support.
 * NOTE(review): presumably ucol_buildPermutationTable fills
 * coll->leadBytePermutationTable from coll->reorderCodes - confirm.
 */
1048 U_CFUNC void U_EXPORT2 |
|
1049 ucol_buildPermutationTable(UCollator *coll, UErrorCode *status); |
|
1050 |
|
/*
 * The two lookups below take an output array plus its capacity and return an
 * int. Note the differing element types: uint16_t for lead bytes, int16_t
 * for reorder codes.
 * NOTE(review): the return value is presumably the number of entries
 * written - confirm, including behavior when returnCapacity is too small.
 */
1051 U_CFUNC int U_EXPORT2 |
|
1052 ucol_getLeadBytesForReorderCode(const UCollator *uca, int reorderCode, uint16_t* returnLeadBytes, int returnCapacity); |
|
1053 |
|
1054 U_CFUNC int U_EXPORT2 |
|
1055 ucol_getReorderCodesForLeadByte(const UCollator *uca, int leadByte, int16_t* returnReorderCodes, int returnCapacity); |
|
1056 |
|
1057 #ifdef __cplusplus |
|
1058 /* |
|
1059 * Test whether a character is potentially "unsafe" for use as a collation |
|
1060 * starting point. Unsafe chars are those with combining class != 0 plus |
|
1061 * those that are the 2nd thru nth character in a contraction sequence. |
|
1062 * |
|
1063 * Function is in header file because it's used in both collation and string search, |
|
1064 * and needs to be inline for performance. |
|
1065 */ |
|
1066 static inline UBool ucol_unsafeCP(UChar c, const UCollator *coll) { |
|
1067 int32_t hash; |
|
1068 uint8_t htbyte; |
|
1069 |
|
1070 if (c < coll->minUnsafeCP) { |
|
1071 return FALSE; |
|
1072 } |
|
1073 |
|
1074 hash = c; |
|
1075 if (hash >= UCOL_UNSAFECP_TABLE_SIZE*8) { |
|
1076 if(U16_IS_SURROGATE(c)) { |
|
1077 /* Lead or trail surrogate */ |
|
1078 /* These are always considered unsafe. */ |
|
1079 return TRUE; |
|
1080 } |
|
1081 hash = (hash & UCOL_UNSAFECP_TABLE_MASK) + 256; |
|
1082 } |
|
1083 htbyte = coll->unsafeCP[hash>>3]; |
|
1084 return ((htbyte >> (hash & 7)) & 1); |
|
1085 } |
|
1086 #endif /* __cplusplus */ |
|
1087 |
|
1088 /* The offsetBuffer in collIterate might need to be freed to avoid memory leaks. */ |
|
/* NOTE(review): unlike the declarations above, this one carries no U_CFUNC/U_CAPI linkage macro - confirm that is intended. */
1089 void ucol_freeOffsetBuffer(U_NAMESPACE_QUALIFIER collIterate *s); |
|
1090 |
|
1091 #endif /* #if !UCONFIG_NO_COLLATION */ |
|
1092 |
|
1093 #endif |