1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/common/unormimp.h Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,486 @@ 1.4 +/* 1.5 +******************************************************************************* 1.6 +* 1.7 +* Copyright (C) 2001-2011, International Business Machines 1.8 +* Corporation and others. All Rights Reserved. 1.9 +* 1.10 +******************************************************************************* 1.11 +* file name: unormimp.h 1.12 +* encoding: US-ASCII 1.13 +* tab size: 8 (not used) 1.14 +* indentation:4 1.15 +* 1.16 +* created on: 2001may25 1.17 +* created by: Markus W. Scherer 1.18 +*/ 1.19 + 1.20 +#ifndef __UNORMIMP_H__ 1.21 +#define __UNORMIMP_H__ 1.22 + 1.23 +#include "unicode/utypes.h" 1.24 + 1.25 +#if !UCONFIG_NO_NORMALIZATION 1.26 + 1.27 +#include "udataswp.h" 1.28 + 1.29 +/* 1.30 + * The 2001-2010 implementation of the normalization code loads its data from 1.31 + * unorm.icu, which is generated with the gennorm tool. 1.32 + * The format of that file is described at the end of this file. 1.33 + */ 1.34 + 1.35 +/* norm32 value constants */ 1.36 +enum { 1.37 + /* quick check flags 0..3 set mean "no" for their forms */ 1.38 + _NORM_QC_NFC=0x11, /* no|maybe */ 1.39 + _NORM_QC_NFKC=0x22, /* no|maybe */ 1.40 + _NORM_QC_NFD=4, /* no */ 1.41 + _NORM_QC_NFKD=8, /* no */ 1.42 + 1.43 + _NORM_QC_ANY_NO=0xf, 1.44 + 1.45 + /* quick check flags 4..5 mean "maybe" for their forms; test flags>=_NORM_QC_MAYBE */ 1.46 + _NORM_QC_MAYBE=0x10, 1.47 + _NORM_QC_ANY_MAYBE=0x30, 1.48 + 1.49 + _NORM_QC_MASK=0x3f, 1.50 + 1.51 + _NORM_COMBINES_FWD=0x40, 1.52 + _NORM_COMBINES_BACK=0x80, 1.53 + _NORM_COMBINES_ANY=0xc0, 1.54 + 1.55 + _NORM_CC_SHIFT=8, /* UnicodeData.txt combining class in bits 15..8 */ 1.56 + _NORM_CC_MASK=0xff00, 1.57 + 1.58 + _NORM_EXTRA_SHIFT=16, /* 16 bits for the index to UChars and other extra data */ 1.59 + _NORM_EXTRA_INDEX_TOP=0xfc00, /* start of surrogate specials after shift */ 1.60 + 1.61 + _NORM_EXTRA_SURROGATE_MASK=0x3ff, 1.62 + _NORM_EXTRA_SURROGATE_TOP=0x3f0, /* hangul etc. */ 1.63 + 1.64 + _NORM_EXTRA_HANGUL=_NORM_EXTRA_SURROGATE_TOP, 1.65 + _NORM_EXTRA_JAMO_L, 1.66 + _NORM_EXTRA_JAMO_V, 1.67 + _NORM_EXTRA_JAMO_T 1.68 +}; 1.69 + 1.70 +/* norm32 value constants using >16 bits */ 1.71 +#define _NORM_MIN_SPECIAL 0xfc000000 1.72 +#define _NORM_SURROGATES_TOP 0xfff00000 1.73 +#define _NORM_MIN_HANGUL 0xfff00000 1.74 +#define _NORM_MIN_JAMO_V 0xfff20000 1.75 +#define _NORM_JAMO_V_TOP 0xfff30000 1.76 + 1.77 +/* value constants for auxTrie */ 1.78 +enum { 1.79 + _NORM_AUX_COMP_EX_SHIFT=10, 1.80 + _NORM_AUX_UNSAFE_SHIFT=11, 1.81 + _NORM_AUX_NFC_SKIPPABLE_F_SHIFT=12 1.82 +}; 1.83 + 1.84 +#define _NORM_AUX_MAX_FNC ((int32_t)1<<_NORM_AUX_COMP_EX_SHIFT) 1.85 + 1.86 +#define _NORM_AUX_FNC_MASK (uint32_t)(_NORM_AUX_MAX_FNC-1) 1.87 +#define _NORM_AUX_COMP_EX_MASK ((uint32_t)1<<_NORM_AUX_COMP_EX_SHIFT) 1.88 +#define _NORM_AUX_UNSAFE_MASK ((uint32_t)1<<_NORM_AUX_UNSAFE_SHIFT) 1.89 +#define _NORM_AUX_NFC_SKIP_F_MASK ((uint32_t)1<<_NORM_AUX_NFC_SKIPPABLE_F_SHIFT) 1.90 + 1.91 +/* canonStartSets[0..31] contains indexes for what is in the array */ 1.92 +enum { 1.93 + _NORM_SET_INDEX_CANON_SETS_LENGTH, /* number of uint16_t in canonical starter sets */ 1.94 + _NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH, /* number of uint16_t in the BMP search table (contains pairs) */ 1.95 + _NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH,/* number of uint16_t in the supplementary search table (contains triplets) */ 1.96 + 1.97 + /* from formatVersion 2.3: */ 1.98 + _NORM_SET_INDEX_NX_CJK_COMPAT_OFFSET, /* uint16_t offset from canonStartSets[0] to the 1.99 + exclusion set for CJK compatibility characters */ 1.100 + _NORM_SET_INDEX_NX_UNICODE32_OFFSET, /* uint16_t offset from canonStartSets[0] to the 1.101 + exclusion set for Unicode 3.2 characters */ 1.102 + _NORM_SET_INDEX_NX_RESERVED_OFFSET, /* uint16_t offset from canonStartSets[0] to the 1.103 + end of the previous exclusion set */ 1.104 + 1.105 + _NORM_SET_INDEX_TOP=32 /* changing this requires a new formatVersion */ 1.106 +}; 1.107 + 1.108 +/* more constants for canonical starter sets */ 1.109 + 1.110 +/* 14 bit indexes to canonical USerializedSets */ 1.111 +#define _NORM_MAX_CANON_SETS 0x4000 1.112 + 1.113 +/* single-code point BMP sets are encoded directly in the search table except if result=0x4000..0x7fff */ 1.114 +#define _NORM_CANON_SET_BMP_MASK 0xc000 1.115 +#define _NORM_CANON_SET_BMP_IS_INDEX 0x4000 1.116 + 1.117 +/* indexes[] value names */ 1.118 +enum { 1.119 + _NORM_INDEX_TRIE_SIZE, /* number of bytes in normalization trie */ 1.120 + _NORM_INDEX_UCHAR_COUNT, /* number of UChars in extra data */ 1.121 + 1.122 + _NORM_INDEX_COMBINE_DATA_COUNT, /* number of uint16_t words for combining data */ 1.123 + _NORM_INDEX_COMBINE_FWD_COUNT, /* number of code points that combine forward */ 1.124 + _NORM_INDEX_COMBINE_BOTH_COUNT, /* number of code points that combine forward and backward */ 1.125 + _NORM_INDEX_COMBINE_BACK_COUNT, /* number of code points that combine backward */ 1.126 + 1.127 + _NORM_INDEX_MIN_NFC_NO_MAYBE, /* first code point with quick check NFC NO/MAYBE */ 1.128 + _NORM_INDEX_MIN_NFKC_NO_MAYBE, /* first code point with quick check NFKC NO/MAYBE */ 1.129 + _NORM_INDEX_MIN_NFD_NO_MAYBE, /* first code point with quick check NFD NO/MAYBE */ 1.130 + _NORM_INDEX_MIN_NFKD_NO_MAYBE, /* first code point with quick check NFKD NO/MAYBE */ 1.131 + 1.132 + _NORM_INDEX_FCD_TRIE_SIZE, /* number of bytes in FCD trie */ 1.133 + 1.134 + _NORM_INDEX_AUX_TRIE_SIZE, /* number of bytes in the auxiliary trie */ 1.135 + _NORM_INDEX_CANON_SET_COUNT, /* number of uint16_t in the array of serialized USet */ 1.136 + 1.137 + _NORM_INDEX_TOP=32 /* changing this requires a new formatVersion */ 1.138 +}; 1.139 + 1.140 +enum { 1.141 + /* FCD check: everything below this code point is known to have a 0 lead combining class */ 1.142 + _NORM_MIN_WITH_LEAD_CC=0x300 1.143 +}; 1.144 + 1.145 +enum { 1.146 + /** 1.147 + * Bit 7 of the length byte for a decomposition string in extra data is 1.148 + * a flag indicating whether the decomposition string is 1.149 + * preceded by a 16-bit word with the leading and trailing cc 1.150 + * of the decomposition (like for A-umlaut); 1.151 + * if not, then both cc's are zero (like for compatibility ideographs). 1.152 + */ 1.153 + _NORM_DECOMP_FLAG_LENGTH_HAS_CC=0x80, 1.154 + /** 1.155 + * Bits 6..0 of the length byte contain the actual length. 1.156 + */ 1.157 + _NORM_DECOMP_LENGTH_MASK=0x7f 1.158 +}; 1.159 + 1.160 +/** Constants for options flags for normalization. */ 1.161 +enum { 1.162 + /** Options bit 0, do not decompose Hangul syllables. */ 1.163 + UNORM_NX_HANGUL=1, 1.164 + /** Options bit 1, do not decompose CJK compatibility characters. */ 1.165 + UNORM_NX_CJK_COMPAT=2 1.166 +}; 1.167 + 1.168 +/** 1.169 + * Description of the format of unorm.icu version 2.3. 1.170 + * 1.171 + * Main change from version 1 to version 2: 1.172 + * Use of new, common UTrie instead of normalization-specific tries. 1.173 + * Change to version 2.1: add third/auxiliary trie with associated data. 1.174 + * Change to version 2.2: add skippable (f) flag data (_NORM_AUX_NFC_SKIP_F_MASK). 1.175 + * Change to version 2.3: add serialized sets for normalization exclusions 1.176 + * stored inside canonStartSets[] 1.177 + * 1.178 + * For more details of how to use the data structures see the code 1.179 + * in unorm.cpp (runtime normalization code) and 1.180 + * in gennorm.c and gennorm/store.c (build-time data generation). 1.181 + * 1.182 + * For the serialized format of UTrie see utrie.c/UTrieHeader. 1.183 + * 1.184 + * - Overall partition 1.185 + * 1.186 + * unorm.dat customarily begins with a UDataInfo structure, see udata.h and .c. 1.187 + * After that there are the following structures: 1.188 + * 1.189 + * int32_t indexes[_NORM_INDEX_TOP]; -- _NORM_INDEX_TOP=32, see enum in this file 1.190 + * 1.191 + * UTrie normTrie; -- size in bytes=indexes[_NORM_INDEX_TRIE_SIZE] 1.192 + * 1.193 + * uint16_t extraData[extraDataTop]; -- extraDataTop=indexes[_NORM_INDEX_UCHAR_COUNT] 1.194 + * extraData[0] contains the number of units for 1.195 + * FC_NFKC_Closure (formatVersion>=2.1) 1.196 + * 1.197 + * uint16_t combiningTable[combiningTableTop]; -- combiningTableTop=indexes[_NORM_INDEX_COMBINE_DATA_COUNT] 1.198 + * combiningTableTop may include one 16-bit padding unit 1.199 + * to make sure that fcdTrie is 32-bit-aligned 1.200 + * 1.201 + * UTrie fcdTrie; -- size in bytes=indexes[_NORM_INDEX_FCD_TRIE_SIZE] 1.202 + * 1.203 + * UTrie auxTrie; -- size in bytes=indexes[_NORM_INDEX_AUX_TRIE_SIZE] 1.204 + * 1.205 + * uint16_t canonStartSets[canonStartSetsTop] -- canonStartSetsTop=indexes[_NORM_INDEX_CANON_SET_COUNT] 1.206 + * serialized USets and binary search tables, see below 1.207 + * 1.208 + * 1.209 + * The indexes array contains lengths and sizes of the following arrays and structures 1.210 + * as well as the following values: 1.211 + * indexes[_NORM_INDEX_COMBINE_FWD_COUNT]=combineFwdTop 1.212 + * -- one more than the highest combining index computed for forward-only-combining characters 1.213 + * indexes[_NORM_INDEX_COMBINE_BOTH_COUNT]=combineBothTop-combineFwdTop 1.214 + * -- number of combining indexes computed for both-ways-combining characters 1.215 + * indexes[_NORM_INDEX_COMBINE_BACK_COUNT]=combineBackTop-combineBothTop 1.216 + * -- number of combining indexes computed for backward-only-combining characters 1.217 + * 1.218 + * indexes[_NORM_INDEX_MIN_NF*_NO_MAYBE] (where *={ C, D, KC, KD }) 1.219 + * -- first code point with a quick check NF* value of NO/MAYBE 1.220 + * 1.221 + * 1.222 + * - Tries 1.223 + * 1.224 + * The main structures are two UTrie tables ("compact arrays"), 1.225 + * each with one index array and one data array. 1.226 + * See utrie.h and utrie.c. 1.227 + * 1.228 + * 1.229 + * - Tries in unorm.dat 1.230 + * 1.231 + * The first trie (normTrie above) 1.232 + * provides data for the NF* quick checks and normalization. 1.233 + * The second trie (fcdTrie above) provides data just for FCD checks. 1.234 + * 1.235 + * 1.236 + * - norm32 data words from the first trie 1.237 + * 1.238 + * The norm32Table contains one 32-bit word "norm32" per code point. 1.239 + * It contains the following bit fields: 1.240 + * 31..16 extra data index, _NORM_EXTRA_SHIFT is used to shift this field down 1.241 + * if this index is <_NORM_EXTRA_INDEX_TOP then it is an index into 1.242 + * extraData[] where variable-length normalization data for this 1.243 + * code point is found 1.244 + * if this index is <_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_SURROGATE_TOP 1.245 + * then this is a norm32 for a leading surrogate, and the index 1.246 + * value is used together with the following trailing surrogate 1.247 + * code unit in the second trie access 1.248 + * if this index is >=_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_SURROGATE_TOP 1.249 + * then this is a norm32 for a "special" character, 1.250 + * i.e., the character is a Hangul syllable or a Jamo 1.251 + * see _NORM_EXTRA_HANGUL etc. 1.252 + * generally, instead of extracting this index from the norm32 and 1.253 + * comparing it with the above constants, 1.254 + * the normalization code compares the entire norm32 value 1.255 + * with _NORM_MIN_SPECIAL, _NORM_SURROGATES_TOP, _NORM_MIN_HANGUL etc. 1.256 + * 1.257 + * 15..8 combining class (cc) according to UnicodeData.txt 1.258 + * 1.259 + * 7..6 _NORM_COMBINES_ANY flags, used in composition to see if a character 1.260 + * combines with any following or preceding character(s) 1.261 + * at all 1.262 + * 7 _NORM_COMBINES_BACK 1.263 + * 6 _NORM_COMBINES_FWD 1.264 + * 1.265 + * 5..0 quick check flags, set for "no" or "maybe", with separate flags for 1.266 + * each normalization form 1.267 + * the higher bits are "maybe" flags; for NF*D there are no such flags 1.268 + * the lower bits are "no" flags for all forms, in the same order 1.269 + * as the "maybe" flags, 1.270 + * which is (MSB to LSB): NFKD NFD NFKC NFC 1.271 + * 5..4 _NORM_QC_ANY_MAYBE 1.272 + * 3..0 _NORM_QC_ANY_NO 1.273 + * see further related constants 1.274 + * 1.275 + * 1.276 + * - Extra data per code point 1.277 + * 1.278 + * "Extra data" is referenced by the index in norm32. 1.279 + * It is variable-length data. It is only present, and only those parts 1.280 + * of it are, as needed for a given character. 1.281 + * The norm32 extra data index is added to the beginning of extraData[] 1.282 + * to get to a vector of 16-bit words with data at the following offsets: 1.283 + * 1.284 + * [-1] Combining index for composition. 1.285 + * Stored only if norm32&_NORM_COMBINES_ANY . 1.286 + * [0] Lengths of the canonical and compatibility decomposition strings. 1.287 + * Stored only if there are decompositions, i.e., 1.288 + * if norm32&(_NORM_QC_NFD|_NORM_QC_NFKD) 1.289 + * High byte: length of NFKD, or 0 if none 1.290 + * Low byte: length of NFD, or 0 if none 1.291 + * Each length byte also has another flag: 1.292 + * Bit 7 of a length byte is set if there are non-zero 1.293 + * combining classes (cc's) associated with the respective 1.294 + * decomposition. If this flag is set, then the decomposition 1.295 + * is preceded by a 16-bit word that contains the 1.296 + * leading and trailing cc's. 1.297 + * Bits 6..0 of a length byte are the length of the 1.298 + * decomposition string, not counting the cc word. 1.299 + * [1..n] NFD 1.300 + * [n+1..] NFKD 1.301 + * 1.302 + * Each of the two decompositions consists of up to two parts: 1.303 + * - The 16-bit words with the leading and trailing cc's. 1.304 + * This is only stored if bit 7 of the corresponding length byte 1.305 + * is set. In this case, at least one of the cc's is not zero. 1.306 + * High byte: leading cc==cc of the first code point in the decomposition string 1.307 + * Low byte: trailing cc==cc of the last code point in the decomposition string 1.308 + * - The decomposition string in UTF-16, with length code units. 1.309 + * 1.310 + * 1.311 + * - Combining indexes and combiningTable[] 1.312 + * 1.313 + * Combining indexes are stored at the [-1] offset of the extra data 1.314 + * if the character combines forward or backward with any other characters. 1.315 + * They are used for (re)composition in NF*C. 1.316 + * Values of combining indexes are arranged according to whether a character 1.317 + * combines forward, backward, or both ways: 1.318 + * forward-only < both ways < backward-only 1.319 + * 1.320 + * The index values for forward-only and both-ways combining characters 1.321 + * are indexes into the combiningTable[]. 1.322 + * The index values for backward-only combining characters are simply 1.323 + * incremented from the preceding index values to be unique. 1.324 + * 1.325 + * In the combiningTable[], a variable-length list 1.326 + * of variable-length (back-index, code point) pair entries is stored 1.327 + * for each forward-combining character. 1.328 + * 1.329 + * These back-indexes are the combining indexes of both-ways or backward-only 1.330 + * combining characters that the forward-combining character combines with. 1.331 + * 1.332 + * Each list is sorted in ascending order of back-indexes. 1.333 + * Each list is terminated with the last back-index having bit 15 set. 1.334 + * 1.335 + * Each pair (back-index, code point) takes up either 2 or 3 1.336 + * 16-bit words. 1.337 + * The first word of a list entry is the back-index, with its bit 15 set if 1.338 + * this is the last pair in the list. 1.339 + * 1.340 + * The second word contains flags in bits 15..13 that determine 1.341 + * if there is a third word and how the combined character is encoded: 1.342 + * 15 set if there is a third word in this list entry 1.343 + * 14 set if the result is a supplementary character 1.344 + * 13 set if the result itself combines forward 1.345 + * 1.346 + * According to these bits 15..14 of the second word, 1.347 + * the result character is encoded as follows: 1.348 + * 00 or 01 The result is <=0x1fff and stored in bits 12..0 of 1.349 + * the second word. 1.350 + * 10 The result is 0x2000..0xffff and stored in the third word. 1.351 + * Bits 12..0 of the second word are not used. 1.352 + * 11 The result is a supplementary character. 1.353 + * Bits 9..0 of the leading surrogate are in bits 9..0 of 1.354 + * the second word. 1.355 + * Add 0xd800 to these bits to get the complete surrogate. 1.356 + * Bits 12..10 of the second word are not used. 1.357 + * The trailing surrogate is stored in the third word. 1.358 + * 1.359 + * 1.360 + * - FCD trie 1.361 + * 1.362 + * The FCD trie is very simple. 1.363 + * It is a folded trie with 16-bit data words. 1.364 + * In each word, the high byte contains the leading cc of the character, 1.365 + * and the low byte contains the trailing cc of the character. 1.366 + * These cc's are the cc's of the first and last code points in the 1.367 + * canonical decomposition of the character. 1.368 + * 1.369 + * Since all 16 bits are used for cc's, lead surrogates must be tested 1.370 + * by checking the code unit instead of the trie data. 1.371 + * This is done only if the 16-bit data word is not zero. 1.372 + * If the code unit is a leading surrogate and the data word is not zero, 1.373 + * then instead of cc's it contains the offset for the second trie lookup. 1.374 + * 1.375 + * 1.376 + * - Auxiliary trie and data 1.377 + * 1.378 + * The auxiliary 16-bit trie contains data for additional properties. 1.379 + * Bits 1.380 + * 15..13 reserved 1.381 + * 12 not NFC_Skippable (f) (formatVersion>=2.2) 1.382 + * 11 flag: not a safe starter for canonical closure 1.383 + * 10 composition exclusion 1.384 + * 9.. 0 index into extraData[] to FC_NFKC_Closure string 1.385 + * (not for lead surrogate), 1.386 + * or lead surrogate offset (for lead surrogate, if 9..0 not zero) 1.387 + * 1.388 + * - FC_NFKC_Closure strings in extraData[] 1.389 + * 1.390 + * Strings are either stored as a single code unit or as the length 1.391 + * followed by that many units. 1.392 + * const UChar *s=extraData+(index from auxTrie data bits 9..0); 1.393 + * int32_t length; 1.394 + * if(*s<0xff00) { 1.395 + * // s points to the single-unit string 1.396 + * length=1; 1.397 + * } else { 1.398 + * length=*s&0xff; 1.399 + * ++s; 1.400 + * } 1.401 + * 1.402 + * Conditions for "NF* Skippable" from Mark Davis' com.ibm.text.UCD.NFSkippable: 1.403 + * (used in NormalizerTransliterator) 1.404 + * 1.405 + * A skippable character is 1.406 + * a) unassigned, or ALL of the following: 1.407 + * b) of combining class 0. 1.408 + * c) not decomposed by this normalization form. 1.409 + * AND if NFC or NFKC, 1.410 + * d) can never compose with a previous character. 1.411 + * e) can never compose with a following character. 1.412 + * f) can never change if another character is added. 1.413 + * Example: a-breve might satisfy all but f, but if you 1.414 + * add an ogonek it changes to a-ogonek + breve 1.415 + * 1.416 + * a)..e) must be tested from norm32. 1.417 + * Since f) is more complicated, the (not-)NFC_Skippable flag (f) is built 1.418 + * into the auxiliary trie. 1.419 + * The same bit is used for NFC and NFKC; (c) differs for them. 1.420 + * As usual, we build the "not skippable" flags so that unassigned 1.421 + * code points get a 0 bit. 1.422 + * This bit is only valid after (a)..(e) test FALSE; test NFD_NO before (f) as well. 1.423 + * Test Hangul LV syllables entirely in code. 1.424 + * 1.425 + * 1.426 + * - structure inside canonStartSets[] 1.427 + * 1.428 + * This array maps from code points c to sets of code points (USerializedSet). 1.429 + * The result sets are the code points whose canonical decompositions start 1.430 + * with c. 1.431 + * 1.432 + * canonStartSets[] contains the following sub-arrays: 1.433 + * 1.434 + * indexes[_NORM_SET_INDEX_TOP] 1.435 + * - contains lengths of sub-arrays etc. 1.436 + * 1.437 + * startSets[indexes[_NORM_SET_INDEX_CANON_SETS_LENGTH]-_NORM_SET_INDEX_TOP] 1.438 + * - contains serialized sets (USerializedSet) of canonical starters for 1.439 + * enumerating canonically equivalent strings 1.440 + * indexes[_NORM_SET_INDEX_CANON_SETS_LENGTH] includes _NORM_SET_INDEX_TOP 1.441 + * for details about the structure see uset.c 1.442 + * 1.443 + * bmpTable[indexes[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH]] 1.444 + * - a sorted search table for BMP code points whose results are 1.445 + * either indexes to USerializedSets or single code points for 1.446 + * single-code point sets; 1.447 + * each entry is a pair of { code point, result } with result=(binary) yy xxxxxx xxxxxxxx 1.448 + * if yy==01 then there is a USerializedSet at canonStartSets+x 1.449 + * else build a USerializedSet with result as the single code point 1.450 + * 1.451 + * suppTable[indexes[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH]] 1.452 + * - a sorted search table for supplementary code points whose results are 1.453 + * either indexes to USerializedSets or single code points for 1.454 + * single-code point sets; 1.455 + * each entry is a triplet of { high16(cp), low16(cp), result } 1.456 + * each code point's high-word may contain extra data in bits 15..5: 1.457 + * if the high word has bit 15 set, then build a set with a single code point 1.458 + * which is (((high16(cp)&0x1f00)<<8)|result; 1.459 + * else there is a USerializedSet at canonStartSets+result 1.460 + * 1.461 + * FormatVersion 2.3 adds 2 serialized sets for normalization exclusions. 1.462 + * They are stored in the data file so that the runtime normalization code need 1.463 + * not depend on other properties and their data and implementation files. 1.464 + * The _NORM_SET_INDEX_NX_..._OFFSET offsets in the canonStartSets index table 1.465 + * give the location for each set. 1.466 + * There is no set stored for UNORM_NX_HANGUL because it's trivial to create 1.467 + * without using properties. 1.468 + * 1.469 + * Set contents: 1.470 + * 1.471 + * _NORM_SET_INDEX_NX_CJK_COMPAT_OFFSET (for UNORM_NX_CJK_COMPAT) 1.472 + * [[:Ideographic:]&[:NFD_QC=No:]] 1.473 + * =[CJK Ideographs]&[has canonical decomposition] 1.474 + * 1.475 + * _NORM_SET_INDEX_NX_UNICODE32_OFFSET (for UNORM_UNICODE_3_2) 1.476 + * [:^Age=3.2:] 1.477 + * =set with all code points that were not designated by the specified Unicode version 1.478 + * 1.479 + * _NORM_SET_INDEX_NX_RESERVED_OFFSET 1.480 + * This is an offset that points to where the next, future set would start. 1.481 + * Currently it indicates where the previous set ends, and thus its length. 1.482 + * The name for this enum constant may in the future be applied to different 1.483 + * index slots. In order to get the limit of a set, use its index slot and 1.484 + * the immediately following one regardless of that one's enum name. 1.485 + */ 1.486 + 1.487 +#endif /* #if !UCONFIG_NO_NORMALIZATION */ 1.488 + 1.489 +#endif