|
/*
*******************************************************************************
*
*   Copyright (C) 2002-2012, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  uprops.h
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2002feb24
*   created by: Markus W. Scherer
*
*   Constants for mostly non-core Unicode character properties
*   stored in uprops.icu.
*/
|
19 |
|
20 #ifndef __UPROPS_H__ |
|
21 #define __UPROPS_H__ |
|
22 |
|
23 #include "unicode/utypes.h" |
|
24 #include "unicode/uset.h" |
|
25 #include "uset_imp.h" |
|
26 #include "udataswp.h" |
|
27 |
|
/*
 * indexes[] entries.
 * Each constant names one slot of the indexes[] array; the values are
 * spelled out because they are positions in stored data and must not shift
 * when an entry is inserted or removed.
 */
enum {
    UPROPS_PROPS32_INDEX=0,
    UPROPS_EXCEPTIONS_INDEX=1,
    UPROPS_EXCEPTIONS_TOP_INDEX=2,

    UPROPS_ADDITIONAL_TRIE_INDEX=3,
    UPROPS_ADDITIONAL_VECTORS_INDEX=4,
    UPROPS_ADDITIONAL_VECTORS_COLUMNS_INDEX=5,

    UPROPS_SCRIPT_EXTENSIONS_INDEX=6,

    UPROPS_RESERVED_INDEX_7=7,
    UPROPS_RESERVED_INDEX_8=8,

    /* size of the data file (number of 32-bit units after the header) */
    UPROPS_DATA_TOP_INDEX=9,

    /* maximum values for code values in vector word 0 */
    UPROPS_MAX_VALUES_INDEX=10,
    /* maximum values for code values in vector word 2 */
    UPROPS_MAX_VALUES_2_INDEX=11,

    /* number of indexes[] slots; slots 12..15 have no named constant here */
    UPROPS_INDEX_COUNT=16
};
|
53 |
|
/*
 * Definitions for the main properties words.
 * Bit layout (lowest bit first):
 *   general category      starts at bit 0 (5 bits)
 *   reserved              bit 5           (1 bit)
 *   numeric type & value  starts at bit 6 (10 bits)
 */
enum {
    UPROPS_NUMERIC_TYPE_VALUE_SHIFT=6
};

/* Extracts the general category (low 5 bits) from a main properties word. */
#define GET_CATEGORY(props) ((props)&0x1f)

/* Turns the general category of a main properties word into a U_MASK() bit. */
#define CAT_MASK(props) U_MASK(GET_CATEGORY(props))

/*
 * Extracts the stored numeric type/value code (UPROPS_NTV_xyz form)
 * from a main properties word.
 */
#define GET_NUMERIC_TYPE_VALUE(props) ((props)>>UPROPS_NUMERIC_TYPE_VALUE_SHIFT)
|
65 |
|
/*
 * Constants for the storage form of numeric types and values (ntv).
 * Each xyz_START constant is the first ntv code of its range; a range ends
 * just before the next START constant.
 */
enum {
    /** No numeric value. */
    UPROPS_NTV_NONE=0,
    /** Decimal digits: nv=0..9 */
    UPROPS_NTV_DECIMAL_START=1,
    /** Other digits: nv=0..9 */
    UPROPS_NTV_DIGIT_START=11,
    /** Small integers: nv=0..154 */
    UPROPS_NTV_NUMERIC_START=21,
    /** Fractions: ((ntv>>4)-12) / ((ntv&0xf)+1) = -1..17 / 1..16 */
    UPROPS_NTV_FRACTION_START=0xb0,
    /**
     * Large integers:
     * ((ntv>>5)-14) * 10^((ntv&0x1f)+2) = (1..9)*(10^2..10^33)
     * (only one significant decimal digit)
     */
    UPROPS_NTV_LARGE_START=0x1e0,
    /**
     * Sexagesimal numbers:
     * ((ntv>>2)-0xbf) * 60^((ntv&3)+1) = (1..9)*(60^1..60^4)
     */
    UPROPS_NTV_BASE60_START=0x300,
    /** No numeric value (yet). */
    UPROPS_NTV_RESERVED_START=UPROPS_NTV_BASE60_START+36,  /* 0x300+9*4=0x324 */

    /** Largest value that fits into the small-integer range (==154). */
    UPROPS_NTV_MAX_SMALL_INT=UPROPS_NTV_FRACTION_START-UPROPS_NTV_NUMERIC_START-1
};
|
94 |
|
/*
 * Maps a stored numeric type/value code (ntv) to its numeric type:
 *   UPROPS_NTV_NONE                         -> U_NT_NONE
 *   below UPROPS_NTV_DIGIT_START            -> U_NT_DECIMAL
 *   below UPROPS_NTV_NUMERIC_START          -> U_NT_DIGIT
 *   everything else                         -> U_NT_NUMERIC
 *
 * The argument is parenthesized at every use so that an expression such as
 * (x & 0xff) is compared as a whole; without the parentheses the comparison
 * operators would bind tighter than operators inside the argument.
 * The argument may be evaluated up to three times, so it must not have
 * side effects.
 */
#define UPROPS_NTV_GET_TYPE(ntv) \
    (((ntv)==UPROPS_NTV_NONE) ? U_NT_NONE : \
    ((ntv)<UPROPS_NTV_DIGIT_START) ? U_NT_DECIMAL : \
    ((ntv)<UPROPS_NTV_NUMERIC_START) ? U_NT_DIGIT : \
    U_NT_NUMERIC)
|
100 |
|
/* number of properties vector words (vector words 0, 1 and 2 are described below) */
#define UPROPS_VECTOR_WORDS     3
|
103 |
|
/*
 * Properties in vector word 0
 * Bits
 * 31..24   DerivedAge version major/minor one nibble each
 * 23..22   3..1: Bits 7..0 = Script_Extensions index
 *             3: Script value from Script_Extensions
 *             2: Script=Inherited
 *             1: Script=Common
 *             0: Script=bits 7..0
 * 21..20   reserved
 * 19..17   East Asian Width
 * 16.. 8   UBlockCode
 *  7.. 0   UScriptCode, or index to Script_Extensions
 */

/* derived age: one nibble each for major and minor version numbers */
#define UPROPS_AGE_MASK         0xff000000
#define UPROPS_AGE_SHIFT        24

/* Script_Extensions: mask includes Script (bits 23..22 plus bits 7..0) */
#define UPROPS_SCRIPT_X_MASK    0x00c000ff
#define UPROPS_SCRIPT_X_SHIFT   22

/* East Asian Width: bits 19..17 */
#define UPROPS_EA_MASK          0x000e0000
#define UPROPS_EA_SHIFT         17

/* UBlockCode: bits 16..8 */
#define UPROPS_BLOCK_MASK       0x0001ff00
#define UPROPS_BLOCK_SHIFT      8

/* UScriptCode, or index to Script_Extensions: bits 7..0 (no shift needed) */
#define UPROPS_SCRIPT_MASK      0x000000ff

/*
 * Values of the two-bit field at bits 23..22, already shifted into place.
 * UPROPS_SCRIPT_X_WITH_COMMON must be the lowest value that involves Script_Extensions.
 */
#define UPROPS_SCRIPT_X_WITH_COMMON     0x400000
#define UPROPS_SCRIPT_X_WITH_INHERITED  0x800000
#define UPROPS_SCRIPT_X_WITH_OTHER      0xc00000
|
139 |
|
/*
 * Properties in vector word 1
 * Each bit encodes one binary property.
 * The following constants represent the bit number, use 1<<UPROPS_XYZ.
 * UPROPS_BINARY_1_TOP<=32!
 *
 * Keep this list of property enums in sync with
 * propListNames[] in icu/source/tools/genprops/props2.c!
 *
 * ICU 2.6/uprops format version 3.2 stores full properties instead of "Other_".
 */
enum {
    UPROPS_WHITE_SPACE,                     /* bit 0 */
    UPROPS_DASH,
    UPROPS_HYPHEN,
    UPROPS_QUOTATION_MARK,
    UPROPS_TERMINAL_PUNCTUATION,
    UPROPS_MATH,
    UPROPS_HEX_DIGIT,
    UPROPS_ASCII_HEX_DIGIT,
    UPROPS_ALPHABETIC,                      /* bit 8 */
    UPROPS_IDEOGRAPHIC,
    UPROPS_DIACRITIC,
    UPROPS_EXTENDER,
    UPROPS_NONCHARACTER_CODE_POINT,
    UPROPS_GRAPHEME_EXTEND,
    UPROPS_GRAPHEME_LINK,
    UPROPS_IDS_BINARY_OPERATOR,
    UPROPS_IDS_TRINARY_OPERATOR,            /* bit 16 */
    UPROPS_RADICAL,
    UPROPS_UNIFIED_IDEOGRAPH,
    UPROPS_DEFAULT_IGNORABLE_CODE_POINT,
    UPROPS_DEPRECATED,
    UPROPS_LOGICAL_ORDER_EXCEPTION,
    UPROPS_XID_START,
    UPROPS_XID_CONTINUE,
    UPROPS_ID_START,                        /* bit 24; ICU 2.6, uprops format version 3.2 */
    UPROPS_ID_CONTINUE,
    UPROPS_GRAPHEME_BASE,
    UPROPS_S_TERM,                          /* new in ICU 3.0 and Unicode 4.0.1 */
    UPROPS_VARIATION_SELECTOR,
    UPROPS_PATTERN_SYNTAX,                  /* new in ICU 3.4 and Unicode 4.1 */
    UPROPS_PATTERN_WHITE_SPACE,
    UPROPS_RESERVED,                        /* bit 31; reserved & unused */
    UPROPS_BINARY_1_TOP                     /* ==32 - full! */
};
|
186 |
|
/*
 * Properties in vector word 2
 * Bits
 * 31..26   reserved
 * 25..20   Line Break
 * 19..15   Sentence Break
 * 14..10   Word Break
 *  9.. 5   Grapheme Cluster Break
 *  4.. 0   Decomposition Type
 */

/* Line Break: bits 25..20 */
#define UPROPS_LB_MASK          0x03f00000
#define UPROPS_LB_SHIFT         20

/* Sentence Break: bits 19..15 */
#define UPROPS_SB_MASK          0x000f8000
#define UPROPS_SB_SHIFT         15

/* Word Break: bits 14..10 */
#define UPROPS_WB_MASK          0x00007c00
#define UPROPS_WB_SHIFT         10

/* Grapheme Cluster Break: bits 9..5 */
#define UPROPS_GCB_MASK         0x000003e0
#define UPROPS_GCB_SHIFT        5

/* Decomposition Type: bits 4..0 (no shift needed) */
#define UPROPS_DT_MASK          0x0000001f
|
210 |
|
211 /** |
|
212 * Gets the main properties value for a code point. |
|
213 * Implemented in uchar.c for uprops.cpp. |
|
214 */ |
|
215 U_CFUNC uint32_t |
|
216 u_getMainProperties(UChar32 c); |
|
217 |
|
218 /** |
|
219 * Get a properties vector word for a code point. |
|
220 * Implemented in uchar.c for uprops.cpp. |
|
221 * @return 0 if no data or illegal argument |
|
222 */ |
|
223 U_CFUNC uint32_t |
|
224 u_getUnicodeProperties(UChar32 c, int32_t column); |
|
225 |
|
226 /** |
|
227 * Get the the maximum values for some enum/int properties. |
|
228 * Use the same column numbers as for u_getUnicodeProperties(). |
|
229 * The returned value will contain maximum values stored in the same bit fields |
|
230 * as where the enum values are stored in the u_getUnicodeProperties() |
|
231 * return values for the same columns. |
|
232 * |
|
233 * Valid columns are those for properties words that contain enumerated values. |
|
234 * (ICU 2.6: columns 0 and 2) |
|
235 * For other column numbers, this function will return 0. |
|
236 * |
|
237 * @internal |
|
238 */ |
|
239 U_CFUNC int32_t |
|
240 uprv_getMaxValues(int32_t column); |
|
241 |
|
242 /** |
|
243 * Checks if c is alphabetic, or a decimal digit; implements UCHAR_POSIX_ALNUM. |
|
244 * @internal |
|
245 */ |
|
246 U_CFUNC UBool |
|
247 u_isalnumPOSIX(UChar32 c); |
|
248 |
|
249 /** |
|
250 * Checks if c is in |
|
251 * [^\p{space}\p{gc=Control}\p{gc=Surrogate}\p{gc=Unassigned}] |
|
252 * with space=\p{Whitespace} and Control=Cc. |
|
253 * Implements UCHAR_POSIX_GRAPH. |
|
254 * @internal |
|
255 */ |
|
256 U_CFUNC UBool |
|
257 u_isgraphPOSIX(UChar32 c); |
|
258 |
|
259 /** |
|
260 * Checks if c is in \p{graph}\p{blank} - \p{cntrl}. |
|
261 * Implements UCHAR_POSIX_PRINT. |
|
262 * @internal |
|
263 */ |
|
264 U_CFUNC UBool |
|
265 u_isprintPOSIX(UChar32 c); |
|
266 |
|
/**
 * Turn a bit index into a bit flag.
 * The 1 is cast to uint32_t before shifting, so bit index 31 is usable;
 * n must be in the range 0..31.
 * @internal
 */
#define FLAG(n) ((uint32_t)1<<(n))
|
269 |
|
/**
 * Flags for general categories in the order of UCharCategory.
 *
 * These names start with an underscore followed by an uppercase letter,
 * which C and C++ reserve for the implementation. That is why some of them
 * collide with compiler headers and are commented out below; the remaining
 * names are kept unchanged because existing code uses them.
 * @internal
 */
#define _Cn     FLAG(U_GENERAL_OTHER_TYPES)
#define _Lu     FLAG(U_UPPERCASE_LETTER)
#define _Ll     FLAG(U_LOWERCASE_LETTER)
#define _Lt     FLAG(U_TITLECASE_LETTER)
#define _Lm     FLAG(U_MODIFIER_LETTER)
/* #define _Lo     FLAG(U_OTHER_LETTER) -- conflicts with MS Visual Studio 9.0 xiosbase */
#define _Mn     FLAG(U_NON_SPACING_MARK)
#define _Me     FLAG(U_ENCLOSING_MARK)
#define _Mc     FLAG(U_COMBINING_SPACING_MARK)
#define _Nd     FLAG(U_DECIMAL_DIGIT_NUMBER)
#define _Nl     FLAG(U_LETTER_NUMBER)
#define _No     FLAG(U_OTHER_NUMBER)
#define _Zs     FLAG(U_SPACE_SEPARATOR)
#define _Zl     FLAG(U_LINE_SEPARATOR)
#define _Zp     FLAG(U_PARAGRAPH_SEPARATOR)
#define _Cc     FLAG(U_CONTROL_CHAR)
#define _Cf     FLAG(U_FORMAT_CHAR)
#define _Co     FLAG(U_PRIVATE_USE_CHAR)
#define _Cs     FLAG(U_SURROGATE)
#define _Pd     FLAG(U_DASH_PUNCTUATION)
#define _Ps     FLAG(U_START_PUNCTUATION)
/* #define _Pe     FLAG(U_END_PUNCTUATION) -- conflicts with MS Visual Studio 9.0 xlocnum */
/* #define _Pc     FLAG(U_CONNECTOR_PUNCTUATION) -- conflicts with MS Visual Studio 9.0 streambuf */
#define _Po     FLAG(U_OTHER_PUNCTUATION)
#define _Sm     FLAG(U_MATH_SYMBOL)
#define _Sc     FLAG(U_CURRENCY_SYMBOL)
#define _Sk     FLAG(U_MODIFIER_SYMBOL)
#define _So     FLAG(U_OTHER_SYMBOL)
#define _Pi     FLAG(U_INITIAL_PUNCTUATION)
/* #define _Pf     FLAG(U_FINAL_PUNCTUATION) -- conflicts with MS Visual Studio 9.0 streambuf */
|
301 |
|
/**
 * Some code points, as short names for their numeric values.
 * The U_FW_ constants are in the U+FFxx range, at a fixed offset of 0xfee0
 * from the corresponding ASCII letter constants.
 * @internal
 */
enum {
    TAB     =0x0009,
    LF      =0x000a,
    FF      =0x000c,
    CR      =0x000d,
    U_A     =0x0041,
    U_F     =0x0046,
    U_Z     =0x005a,
    U_a     =0x0061,
    U_f     =0x0066,
    U_z     =0x007a,
    DEL     =0x007f,
    NL      =0x0085,
    NBSP    =0x00a0,
    CGJ     =0x034f,
    FIGURESP=0x2007,
    HAIRSP  =0x200a,
    ZWNJ    =0x200c,
    ZWJ     =0x200d,
    RLM     =0x200f,
    NNBSP   =0x202f,
    WJ      =0x2060,
    INHSWAP =0x206a,
    NOMDIG  =0x206f,
    U_FW_A  =0xff21,
    U_FW_F  =0xff26,
    U_FW_Z  =0xff3a,
    U_FW_a  =0xff41,
    U_FW_f  =0xff46,
    U_FW_z  =0xff5a,
    ZWNBSP  =0xfeff
};
|
335 |
|
336 /** |
|
337 * Get the maximum length of a (regular/1.0/extended) character name. |
|
338 * @return 0 if no character names available. |
|
339 */ |
|
340 U_CAPI int32_t U_EXPORT2 |
|
341 uprv_getMaxCharNameLength(void); |
|
342 |
|
343 /** |
|
344 * Fills set with characters that are used in Unicode character names. |
|
345 * Includes all characters that are used in regular/Unicode 1.0/extended names. |
|
346 * Just empties the set if no character names are available. |
|
347 * @param sa USetAdder to receive characters. |
|
348 */ |
|
349 U_CAPI void U_EXPORT2 |
|
350 uprv_getCharNameCharacters(const USetAdder *sa); |
|
351 |
|
/**
 * Constants for which data and implementation files provide which properties.
 * Used by UnicodeSet for service-specific property enumeration.
 * The constants are numbered consecutively from 0, so UPROPS_SRC_COUNT
 * is the number of sources.
 * @internal
 */
enum UPropertySource {
    /** No source, not a supported property. */
    UPROPS_SRC_NONE,
    /** From uchar.c/uprops.icu main trie */
    UPROPS_SRC_CHAR,
    /** From uchar.c/uprops.icu properties vectors trie */
    UPROPS_SRC_PROPSVEC,
    /** From unames.c/unames.icu */
    UPROPS_SRC_NAMES,
    /** From ucase.c/ucase.icu */
    UPROPS_SRC_CASE,
    /** From ubidi_props.c/ubidi.icu */
    UPROPS_SRC_BIDI,
    /** From uchar.c/uprops.icu main trie as well as properties vectors trie */
    UPROPS_SRC_CHAR_AND_PROPSVEC,
    /** From ucase.c/ucase.icu as well as unorm.cpp/unorm.icu */
    UPROPS_SRC_CASE_AND_NORM,
    /** From normalizer2impl.cpp/nfc.nrm */
    UPROPS_SRC_NFC,
    /** From normalizer2impl.cpp/nfkc.nrm */
    UPROPS_SRC_NFKC,
    /** From normalizer2impl.cpp/nfkc_cf.nrm */
    UPROPS_SRC_NFKC_CF,
    /** From normalizer2impl.cpp/nfc.nrm canonical iterator data */
    UPROPS_SRC_NFC_CANON_ITER,
    /** One more than the highest UPropertySource (UPROPS_SRC_) constant. */
    UPROPS_SRC_COUNT
};
typedef enum UPropertySource UPropertySource;
|
386 |
|
387 /** |
|
388 * @see UPropertySource |
|
389 * @internal |
|
390 */ |
|
391 U_CFUNC UPropertySource U_EXPORT2 |
|
392 uprops_getSource(UProperty which); |
|
393 |
|
394 /** |
|
395 * Enumerate uprops.icu's main data trie and add the |
|
396 * start of each range of same properties to the set. |
|
397 * @internal |
|
398 */ |
|
399 U_CFUNC void U_EXPORT2 |
|
400 uchar_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode); |
|
401 |
|
402 /** |
|
403 * Enumerate uprops.icu's properties vectors trie and add the |
|
404 * start of each range of same properties to the set. |
|
405 * @internal |
|
406 */ |
|
407 U_CFUNC void U_EXPORT2 |
|
408 upropsvec_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode); |
|
409 |
|
/**
 * Return a set of characters for property enumeration.
 * For each two consecutive characters (start, limit) in the set,
 * the properties for start..limit-1 are all the same.
 *
 * @param sa USetAdder to receive result. Existing contents are lost.
 * @internal
 */
/*U_CFUNC void U_EXPORT2
uprv_getInclusions(const USetAdder *sa, UErrorCode *pErrorCode);
*/
|
421 |
|
422 /** |
|
423 * Swap the ICU Unicode character names file. See uchar.c. |
|
424 * @internal |
|
425 */ |
|
426 U_CAPI int32_t U_EXPORT2 |
|
427 uchar_swapNames(const UDataSwapper *ds, |
|
428 const void *inData, int32_t length, void *outData, |
|
429 UErrorCode *pErrorCode); |
|
430 |
|
431 #ifdef __cplusplus |
|
432 |
|
433 U_NAMESPACE_BEGIN |
|
434 |
|
435 class UnicodeSet; |
|
436 |
|
437 // implemented in uniset_props.cpp |
|
438 U_CFUNC UnicodeSet * |
|
439 uniset_getUnicode32Instance(UErrorCode &errorCode); |
|
440 |
|
441 U_NAMESPACE_END |
|
442 |
|
443 #endif |
|
444 |
|
445 #endif |