| |
/*
*******************************************************************************
*
*   Copyright (C) 2002-2012, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  uprops.h
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2002feb24
*   created by: Markus W. Scherer
*
*   Constants for mostly non-core Unicode character properties
*   stored in uprops.icu.
*/
| |
19 |
| |
20 #ifndef __UPROPS_H__ |
| |
21 #define __UPROPS_H__ |
| |
22 |
| |
23 #include "unicode/utypes.h" |
| |
24 #include "unicode/uset.h" |
| |
25 #include "uset_imp.h" |
| |
26 #include "udataswp.h" |
| |
27 |
| |
/*
 * indexes[] entries.
 * Enumerators without an explicit value continue counting from the previous
 * one, so the first ten occupy slots 0..9.
 */
enum {
    UPROPS_PROPS32_INDEX,
    UPROPS_EXCEPTIONS_INDEX,
    UPROPS_EXCEPTIONS_TOP_INDEX,

    UPROPS_ADDITIONAL_TRIE_INDEX,
    UPROPS_ADDITIONAL_VECTORS_INDEX,
    UPROPS_ADDITIONAL_VECTORS_COLUMNS_INDEX,

    UPROPS_SCRIPT_EXTENSIONS_INDEX,

    UPROPS_RESERVED_INDEX_7,
    UPROPS_RESERVED_INDEX_8,

    /* size of the data file (number of 32-bit units after the header) */
    UPROPS_DATA_TOP_INDEX,

    /* maximum values for code values in vector word 0 */
    UPROPS_MAX_VALUES_INDEX=10,
    /* maximum values for code values in vector word 2 */
    UPROPS_MAX_VALUES_2_INDEX,

    /* total number of indexes[] slots */
    UPROPS_INDEX_COUNT=16
};
| |
53 |
| |
/*
 * Definitions for the main properties words.
 * The right-hand column gives the starting bit and width of each field.
 */
enum {
    /* general category shift==0                   0 (5 bits) */
    /* reserved                                    5 (1 bit) */
    UPROPS_NUMERIC_TYPE_VALUE_SHIFT=6           /* 6 (10 bits) */
};
| |
60 |
| |
/* Extracts the general category: the low 5 bits of a main properties word. */
#define GET_CATEGORY(props) ((props)&0x1f)
/* Passes the general category of props to U_MASK(). */
#define CAT_MASK(props) U_MASK(GET_CATEGORY(props))

/* Extracts the numeric type/value field: all bits from UPROPS_NUMERIC_TYPE_VALUE_SHIFT upward. */
#define GET_NUMERIC_TYPE_VALUE(props) ((props)>>UPROPS_NUMERIC_TYPE_VALUE_SHIFT)
| |
65 |
| |
/*
 * Constants for the storage form of numeric types and values.
 * Each *_START constant is the first stored value (ntv) of its range.
 */
enum {
    /** No numeric value. */
    UPROPS_NTV_NONE=0,
    /** Decimal digits: nv=0..9 */
    UPROPS_NTV_DECIMAL_START=1,
    /** Other digits: nv=0..9 */
    UPROPS_NTV_DIGIT_START=11,
    /** Small integers: nv=0..154 */
    UPROPS_NTV_NUMERIC_START=21,
    /** Fractions: ((ntv>>4)-12) / ((ntv&0xf)+1) = -1..17 / 1..16 */
    UPROPS_NTV_FRACTION_START=0xb0,
    /**
     * Large integers:
     * ((ntv>>5)-14) * 10^((ntv&0x1f)+2) = (1..9)*(10^2..10^33)
     * (only one significant decimal digit)
     */
    UPROPS_NTV_LARGE_START=0x1e0,
    /**
     * Sexagesimal numbers:
     * ((ntv>>2)-0xbf) * 60^((ntv&3)+1) = (1..9)*(60^1..60^4)
     */
    UPROPS_NTV_BASE60_START=0x300,
    /** No numeric value (yet). */
    UPROPS_NTV_RESERVED_START=UPROPS_NTV_BASE60_START+36,  /* 0x300+9*4=0x324 */

    /** Largest small integer: the last value before the fractions range, i.e. 154. */
    UPROPS_NTV_MAX_SMALL_INT=UPROPS_NTV_FRACTION_START-UPROPS_NTV_NUMERIC_START-1
};
| |
94 |
| |
/*
 * Maps a stored numeric type/value (ntv) to a numeric type constant:
 *   ntv==UPROPS_NTV_NONE            -> U_NT_NONE
 *   ntv<UPROPS_NTV_DIGIT_START      -> U_NT_DECIMAL
 *   ntv<UPROPS_NTV_NUMERIC_START    -> U_NT_DIGIT
 *   anything else                   -> U_NT_NUMERIC
 *
 * The parameter is parenthesized at every use so that an expression argument
 * such as (x & 0xff) is compared as a whole rather than being split by
 * operator precedence. The argument may be evaluated up to three times,
 * so it must not have side effects.
 */
#define UPROPS_NTV_GET_TYPE(ntv) \
    (((ntv)==UPROPS_NTV_NONE) ? U_NT_NONE : \
    ((ntv)<UPROPS_NTV_DIGIT_START) ? U_NT_DECIMAL : \
    ((ntv)<UPROPS_NTV_NUMERIC_START) ? U_NT_DIGIT : \
    U_NT_NUMERIC)
| |
100 |
| |
/* number of properties vector words */
#define UPROPS_VECTOR_WORDS     3
| |
103 |
| |
/*
 * Properties in vector word 0
 * Bits
 * 31..24   DerivedAge version major/minor one nibble each
 * 23..22   3..1: Bits 7..0 = Script_Extensions index
 *             3: Script value from Script_Extensions
 *             2: Script=Inherited
 *             1: Script=Common
 *             0: Script=bits 7..0
 * 21..20   reserved
 * 19..17   East Asian Width
 * 16.. 8   UBlockCode
 *  7.. 0   UScriptCode, or index to Script_Extensions
 */

/* derived age: one nibble each for major and minor version numbers */
#define UPROPS_AGE_MASK         0xff000000
#define UPROPS_AGE_SHIFT        24

/* Script_Extensions: mask includes Script (bits 23..22 plus bits 7..0) */
#define UPROPS_SCRIPT_X_MASK    0x00c000ff
#define UPROPS_SCRIPT_X_SHIFT   22

/* East Asian Width: bits 19..17 */
#define UPROPS_EA_MASK          0x000e0000
#define UPROPS_EA_SHIFT         17

/* UBlockCode: bits 16..8 */
#define UPROPS_BLOCK_MASK       0x0001ff00
#define UPROPS_BLOCK_SHIFT      8

/* UScriptCode, or index to Script_Extensions: bits 7..0 (no shift needed) */
#define UPROPS_SCRIPT_MASK      0x000000ff

/* UPROPS_SCRIPT_X_WITH_COMMON must be the lowest value that involves Script_Extensions. */
/* These are the values 1, 2 and 3 of the two-bit field at bits 23..22. */
#define UPROPS_SCRIPT_X_WITH_COMMON     0x400000
#define UPROPS_SCRIPT_X_WITH_INHERITED  0x800000
#define UPROPS_SCRIPT_X_WITH_OTHER      0xc00000
| |
139 |
| |
/*
 * Properties in vector word 1
 * Each bit encodes one binary property.
 * The following constants represent the bit number, use 1<<UPROPS_XYZ.
 * UPROPS_BINARY_1_TOP<=32!
 *
 * Keep this list of property enums in sync with
 * propListNames[] in icu/source/tools/genprops/props2.c!
 *
 * ICU 2.6/uprops format version 3.2 stores full properties instead of "Other_".
 */
enum {
    UPROPS_WHITE_SPACE,
    UPROPS_DASH,
    UPROPS_HYPHEN,
    UPROPS_QUOTATION_MARK,
    UPROPS_TERMINAL_PUNCTUATION,
    UPROPS_MATH,
    UPROPS_HEX_DIGIT,
    UPROPS_ASCII_HEX_DIGIT,
    UPROPS_ALPHABETIC,
    UPROPS_IDEOGRAPHIC,
    UPROPS_DIACRITIC,
    UPROPS_EXTENDER,
    UPROPS_NONCHARACTER_CODE_POINT,
    UPROPS_GRAPHEME_EXTEND,
    UPROPS_GRAPHEME_LINK,
    UPROPS_IDS_BINARY_OPERATOR,
    UPROPS_IDS_TRINARY_OPERATOR,
    UPROPS_RADICAL,
    UPROPS_UNIFIED_IDEOGRAPH,
    UPROPS_DEFAULT_IGNORABLE_CODE_POINT,
    UPROPS_DEPRECATED,
    UPROPS_LOGICAL_ORDER_EXCEPTION,
    UPROPS_XID_START,
    UPROPS_XID_CONTINUE,
    UPROPS_ID_START,                            /* ICU 2.6, uprops format version 3.2 */
    UPROPS_ID_CONTINUE,
    UPROPS_GRAPHEME_BASE,
    UPROPS_S_TERM,                              /* new in ICU 3.0 and Unicode 4.0.1 */
    UPROPS_VARIATION_SELECTOR,
    UPROPS_PATTERN_SYNTAX,                      /* new in ICU 3.4 and Unicode 4.1 */
    UPROPS_PATTERN_WHITE_SPACE,
    UPROPS_RESERVED,                            /* reserved & unused */
    UPROPS_BINARY_1_TOP                         /* ==32 - full! */
};
| |
186 |
| |
/*
 * Properties in vector word 2
 * Bits
 * 31..26   reserved
 * 25..20   Line Break
 * 19..15   Sentence Break
 * 14..10   Word Break
 *  9.. 5   Grapheme Cluster Break
 *  4.. 0   Decomposition Type
 *
 * Each *_MASK selects its field in place; the matching *_SHIFT is the
 * bit position of the field's lowest bit.
 */
#define UPROPS_LB_MASK          0x03f00000
#define UPROPS_LB_SHIFT         20

#define UPROPS_SB_MASK          0x000f8000
#define UPROPS_SB_SHIFT         15

#define UPROPS_WB_MASK          0x00007c00
#define UPROPS_WB_SHIFT         10

#define UPROPS_GCB_MASK         0x000003e0
#define UPROPS_GCB_SHIFT        5

/* Decomposition Type occupies the lowest bits, so it has no shift constant. */
#define UPROPS_DT_MASK          0x0000001f
| |
210 |
| |
211 /** |
| |
212 * Gets the main properties value for a code point. |
| |
213 * Implemented in uchar.c for uprops.cpp. |
| |
214 */ |
| |
215 U_CFUNC uint32_t |
| |
216 u_getMainProperties(UChar32 c); |
| |
217 |
| |
218 /** |
| |
219 * Get a properties vector word for a code point. |
| |
220 * Implemented in uchar.c for uprops.cpp. |
| |
221 * @return 0 if no data or illegal argument |
| |
222 */ |
| |
223 U_CFUNC uint32_t |
| |
224 u_getUnicodeProperties(UChar32 c, int32_t column); |
| |
225 |
| |
226 /** |
| |
227 * Get the the maximum values for some enum/int properties. |
| |
228 * Use the same column numbers as for u_getUnicodeProperties(). |
| |
229 * The returned value will contain maximum values stored in the same bit fields |
| |
230 * as where the enum values are stored in the u_getUnicodeProperties() |
| |
231 * return values for the same columns. |
| |
232 * |
| |
233 * Valid columns are those for properties words that contain enumerated values. |
| |
234 * (ICU 2.6: columns 0 and 2) |
| |
235 * For other column numbers, this function will return 0. |
| |
236 * |
| |
237 * @internal |
| |
238 */ |
| |
239 U_CFUNC int32_t |
| |
240 uprv_getMaxValues(int32_t column); |
| |
241 |
| |
242 /** |
| |
243 * Checks if c is alphabetic, or a decimal digit; implements UCHAR_POSIX_ALNUM. |
| |
244 * @internal |
| |
245 */ |
| |
246 U_CFUNC UBool |
| |
247 u_isalnumPOSIX(UChar32 c); |
| |
248 |
| |
249 /** |
| |
250 * Checks if c is in |
| |
251 * [^\p{space}\p{gc=Control}\p{gc=Surrogate}\p{gc=Unassigned}] |
| |
252 * with space=\p{Whitespace} and Control=Cc. |
| |
253 * Implements UCHAR_POSIX_GRAPH. |
| |
254 * @internal |
| |
255 */ |
| |
256 U_CFUNC UBool |
| |
257 u_isgraphPOSIX(UChar32 c); |
| |
258 |
| |
259 /** |
| |
260 * Checks if c is in \p{graph}\p{blank} - \p{cntrl}. |
| |
261 * Implements UCHAR_POSIX_PRINT. |
| |
262 * @internal |
| |
263 */ |
| |
264 U_CFUNC UBool |
| |
265 u_isprintPOSIX(UChar32 c); |
| |
266 |
| |
/**
 * Turn a bit index into a bit flag.
 * The shift is done on a uint32_t, so n may be 0..31.
 * @internal
 */
#define FLAG(n) ((uint32_t)1<<(n))

/**
 * Flags for general categories in the order of UCharCategory.
 * _Lo, _Pe, _Pc and _Pf are deliberately left undefined because those names
 * conflict with MS Visual Studio 9.0 headers (see the notes below).
 * @internal
 */
#define _Cn     FLAG(U_GENERAL_OTHER_TYPES)
#define _Lu     FLAG(U_UPPERCASE_LETTER)
#define _Ll     FLAG(U_LOWERCASE_LETTER)
#define _Lt     FLAG(U_TITLECASE_LETTER)
#define _Lm     FLAG(U_MODIFIER_LETTER)
/* #define _Lo     FLAG(U_OTHER_LETTER) -- conflicts with MS Visual Studio 9.0 xiosbase */
#define _Mn     FLAG(U_NON_SPACING_MARK)
#define _Me     FLAG(U_ENCLOSING_MARK)
#define _Mc     FLAG(U_COMBINING_SPACING_MARK)
#define _Nd     FLAG(U_DECIMAL_DIGIT_NUMBER)
#define _Nl     FLAG(U_LETTER_NUMBER)
#define _No     FLAG(U_OTHER_NUMBER)
#define _Zs     FLAG(U_SPACE_SEPARATOR)
#define _Zl     FLAG(U_LINE_SEPARATOR)
#define _Zp     FLAG(U_PARAGRAPH_SEPARATOR)
#define _Cc     FLAG(U_CONTROL_CHAR)
#define _Cf     FLAG(U_FORMAT_CHAR)
#define _Co     FLAG(U_PRIVATE_USE_CHAR)
#define _Cs     FLAG(U_SURROGATE)
#define _Pd     FLAG(U_DASH_PUNCTUATION)
#define _Ps     FLAG(U_START_PUNCTUATION)
/* #define _Pe     FLAG(U_END_PUNCTUATION) -- conflicts with MS Visual Studio 9.0 xlocnum */
/* #define _Pc     FLAG(U_CONNECTOR_PUNCTUATION) -- conflicts with MS Visual Studio 9.0 streambuf */
#define _Po     FLAG(U_OTHER_PUNCTUATION)
#define _Sm     FLAG(U_MATH_SYMBOL)
#define _Sc     FLAG(U_CURRENCY_SYMBOL)
#define _Sk     FLAG(U_MODIFIER_SYMBOL)
#define _So     FLAG(U_OTHER_SYMBOL)
#define _Pi     FLAG(U_INITIAL_PUNCTUATION)
/* #define _Pf     FLAG(U_FINAL_PUNCTUATION) -- conflicts with MS Visual Studio 9.0 streambuf */
| |
301 |
| |
/**
 * Some code points, named for use in the property implementation.
 * The U_FW_* constants are in the U+FFxx range; the U_* constants next to
 * them are the ASCII letters A/F/Z and a/f/z.
 * @internal
 */
enum {
    TAB     =0x0009,
    LF      =0x000a,
    FF      =0x000c,
    CR      =0x000d,
    U_A     =0x0041,
    U_F     =0x0046,
    U_Z     =0x005a,
    U_a     =0x0061,
    U_f     =0x0066,
    U_z     =0x007a,
    DEL     =0x007f,
    NL      =0x0085,
    NBSP    =0x00a0,
    CGJ     =0x034f,
    FIGURESP=0x2007,
    HAIRSP  =0x200a,
    ZWNJ    =0x200c,
    ZWJ     =0x200d,
    RLM     =0x200f,
    NNBSP   =0x202f,
    WJ      =0x2060,
    INHSWAP =0x206a,
    NOMDIG  =0x206f,
    U_FW_A  =0xff21,
    U_FW_F  =0xff26,
    U_FW_Z  =0xff3a,
    U_FW_a  =0xff41,
    U_FW_f  =0xff46,
    U_FW_z  =0xff5a,
    ZWNBSP  =0xfeff
};
| |
335 |
| |
336 /** |
| |
337 * Get the maximum length of a (regular/1.0/extended) character name. |
| |
338 * @return 0 if no character names available. |
| |
339 */ |
| |
340 U_CAPI int32_t U_EXPORT2 |
| |
341 uprv_getMaxCharNameLength(void); |
| |
342 |
| |
343 /** |
| |
344 * Fills set with characters that are used in Unicode character names. |
| |
345 * Includes all characters that are used in regular/Unicode 1.0/extended names. |
| |
346 * Just empties the set if no character names are available. |
| |
347 * @param sa USetAdder to receive characters. |
| |
348 */ |
| |
349 U_CAPI void U_EXPORT2 |
| |
350 uprv_getCharNameCharacters(const USetAdder *sa); |
| |
351 |
| |
/**
 * Constants for which data and implementation files provide which properties.
 * Used by UnicodeSet for service-specific property enumeration.
 * Values are consecutive starting at 0, so UPROPS_SRC_COUNT is the number
 * of real sources including UPROPS_SRC_NONE.
 * @internal
 */
enum UPropertySource {
    /** No source, not a supported property. */
    UPROPS_SRC_NONE,
    /** From uchar.c/uprops.icu main trie */
    UPROPS_SRC_CHAR,
    /** From uchar.c/uprops.icu properties vectors trie */
    UPROPS_SRC_PROPSVEC,
    /** From unames.c/unames.icu */
    UPROPS_SRC_NAMES,
    /** From ucase.c/ucase.icu */
    UPROPS_SRC_CASE,
    /** From ubidi_props.c/ubidi.icu */
    UPROPS_SRC_BIDI,
    /** From uchar.c/uprops.icu main trie as well as properties vectors trie */
    UPROPS_SRC_CHAR_AND_PROPSVEC,
    /** From ucase.c/ucase.icu as well as unorm.cpp/unorm.icu */
    UPROPS_SRC_CASE_AND_NORM,
    /** From normalizer2impl.cpp/nfc.nrm */
    UPROPS_SRC_NFC,
    /** From normalizer2impl.cpp/nfkc.nrm */
    UPROPS_SRC_NFKC,
    /** From normalizer2impl.cpp/nfkc_cf.nrm */
    UPROPS_SRC_NFKC_CF,
    /** From normalizer2impl.cpp/nfc.nrm canonical iterator data */
    UPROPS_SRC_NFC_CANON_ITER,
    /** One more than the highest UPropertySource (UPROPS_SRC_) constant. */
    UPROPS_SRC_COUNT
};
typedef enum UPropertySource UPropertySource;
| |
386 |
| |
387 /** |
| |
388 * @see UPropertySource |
| |
389 * @internal |
| |
390 */ |
| |
391 U_CFUNC UPropertySource U_EXPORT2 |
| |
392 uprops_getSource(UProperty which); |
| |
393 |
| |
394 /** |
| |
395 * Enumerate uprops.icu's main data trie and add the |
| |
396 * start of each range of same properties to the set. |
| |
397 * @internal |
| |
398 */ |
| |
399 U_CFUNC void U_EXPORT2 |
| |
400 uchar_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode); |
| |
401 |
| |
402 /** |
| |
403 * Enumerate uprops.icu's properties vectors trie and add the |
| |
404 * start of each range of same properties to the set. |
| |
405 * @internal |
| |
406 */ |
| |
407 U_CFUNC void U_EXPORT2 |
| |
408 upropsvec_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode); |
| |
409 |
| |
/**
 * Return a set of characters for property enumeration.
 * For each two consecutive characters (start, limit) in the set,
 * the properties for start..limit-1 are all the same.
 *
 * @param sa USetAdder to receive result. Existing contents are lost.
 * @internal
 */
/*U_CFUNC void U_EXPORT2
uprv_getInclusions(const USetAdder *sa, UErrorCode *pErrorCode);
*/
| |
421 |
| |
422 /** |
| |
423 * Swap the ICU Unicode character names file. See uchar.c. |
| |
424 * @internal |
| |
425 */ |
| |
426 U_CAPI int32_t U_EXPORT2 |
| |
427 uchar_swapNames(const UDataSwapper *ds, |
| |
428 const void *inData, int32_t length, void *outData, |
| |
429 UErrorCode *pErrorCode); |
| |
430 |
| |
/* C++-only declarations; skipped entirely when this header is compiled as C. */
#ifdef __cplusplus

U_NAMESPACE_BEGIN

class UnicodeSet;

// implemented in uniset_props.cpp
U_CFUNC UnicodeSet *
uniset_getUnicode32Instance(UErrorCode &errorCode);

U_NAMESPACE_END

#endif
| |
444 |
| |
445 #endif |