1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/common/uprops.h Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,445 @@ 1.4 +/* 1.5 +******************************************************************************* 1.6 +* 1.7 +* Copyright (C) 2002-2012, International Business Machines 1.8 +* Corporation and others. All Rights Reserved. 1.9 +* 1.10 +******************************************************************************* 1.11 +* file name: uprops.h 1.12 +* encoding: US-ASCII 1.13 +* tab size: 8 (not used) 1.14 +* indentation:4 1.15 +* 1.16 +* created on: 2002feb24 1.17 +* created by: Markus W. Scherer 1.18 +* 1.19 +* Constants for mostly non-core Unicode character properties 1.20 +* stored in uprops.icu. 1.21 +*/ 1.22 + 1.23 +#ifndef __UPROPS_H__ 1.24 +#define __UPROPS_H__ 1.25 + 1.26 +#include "unicode/utypes.h" 1.27 +#include "unicode/uset.h" 1.28 +#include "uset_imp.h" 1.29 +#include "udataswp.h" 1.30 + 1.31 +/* indexes[] entries */ 1.32 +enum { 1.33 + UPROPS_PROPS32_INDEX, 1.34 + UPROPS_EXCEPTIONS_INDEX, 1.35 + UPROPS_EXCEPTIONS_TOP_INDEX, 1.36 + 1.37 + UPROPS_ADDITIONAL_TRIE_INDEX, 1.38 + UPROPS_ADDITIONAL_VECTORS_INDEX, 1.39 + UPROPS_ADDITIONAL_VECTORS_COLUMNS_INDEX, 1.40 + 1.41 + UPROPS_SCRIPT_EXTENSIONS_INDEX, 1.42 + 1.43 + UPROPS_RESERVED_INDEX_7, 1.44 + UPROPS_RESERVED_INDEX_8, 1.45 + 1.46 + /* size of the data file (number of 32-bit units after the header) */ 1.47 + UPROPS_DATA_TOP_INDEX, 1.48 + 1.49 + /* maximum values for code values in vector word 0 */ 1.50 + UPROPS_MAX_VALUES_INDEX=10, 1.51 + /* maximum values for code values in vector word 2 */ 1.52 + UPROPS_MAX_VALUES_2_INDEX, 1.53 + 1.54 + UPROPS_INDEX_COUNT=16 1.55 +}; 1.56 + 1.57 +/* definitions for the main properties words */ 1.58 +enum { 1.59 + /* general category shift==0 0 (5 bits) */ 1.60 + /* reserved 5 (1 bit) */ 1.61 + UPROPS_NUMERIC_TYPE_VALUE_SHIFT=6 /* 6 (10 bits) */ 1.62 +}; 1.63 + 1.64 +#define GET_CATEGORY(props) ((props)&0x1f) 1.65 +#define CAT_MASK(props) U_MASK(GET_CATEGORY(props)) 1.66 + 1.67 +#define GET_NUMERIC_TYPE_VALUE(props) ((props)>>UPROPS_NUMERIC_TYPE_VALUE_SHIFT) 1.68 + 1.69 +/* constants for the storage form of numeric types and values */ 1.70 +enum { 1.71 + /** No numeric value. */ 1.72 + UPROPS_NTV_NONE=0, 1.73 + /** Decimal digits: nv=0..9 */ 1.74 + UPROPS_NTV_DECIMAL_START=1, 1.75 + /** Other digits: nv=0..9 */ 1.76 + UPROPS_NTV_DIGIT_START=11, 1.77 + /** Small integers: nv=0..154 */ 1.78 + UPROPS_NTV_NUMERIC_START=21, 1.79 + /** Fractions: ((ntv>>4)-12) / ((ntv&0xf)+1) = -1..17 / 1..16 */ 1.80 + UPROPS_NTV_FRACTION_START=0xb0, 1.81 + /** 1.82 + * Large integers: 1.83 + * ((ntv>>5)-14) * 10^((ntv&0x1f)+2) = (1..9)*(10^2..10^33) 1.84 + * (only one significant decimal digit) 1.85 + */ 1.86 + UPROPS_NTV_LARGE_START=0x1e0, 1.87 + /** 1.88 + * Sexagesimal numbers: 1.89 + * ((ntv>>2)-0xbf) * 60^((ntv&3)+1) = (1..9)*(60^1..60^4) 1.90 + */ 1.91 + UPROPS_NTV_BASE60_START=0x300, 1.92 + /** No numeric value (yet). */ 1.93 + UPROPS_NTV_RESERVED_START=UPROPS_NTV_BASE60_START+36, /* 0x300+9*4=0x324 */ 1.94 + 1.95 + UPROPS_NTV_MAX_SMALL_INT=UPROPS_NTV_FRACTION_START-UPROPS_NTV_NUMERIC_START-1 1.96 +}; 1.97 + 1.98 +#define UPROPS_NTV_GET_TYPE(ntv) \ 1.99 + ((ntv==UPROPS_NTV_NONE) ? U_NT_NONE : \ 1.100 + (ntv<UPROPS_NTV_DIGIT_START) ? U_NT_DECIMAL : \ 1.101 + (ntv<UPROPS_NTV_NUMERIC_START) ? U_NT_DIGIT : \ 1.102 + U_NT_NUMERIC) 1.103 + 1.104 +/* number of properties vector words */ 1.105 +#define UPROPS_VECTOR_WORDS 3 1.106 + 1.107 +/* 1.108 + * Properties in vector word 0 1.109 + * Bits 1.110 + * 31..24 DerivedAge version major/minor one nibble each 1.111 + * 23..22 3..1: Bits 7..0 = Script_Extensions index 1.112 + * 3: Script value from Script_Extensions 1.113 + * 2: Script=Inherited 1.114 + * 1: Script=Common 1.115 + * 0: Script=bits 7..0 1.116 + * 21..20 reserved 1.117 + * 19..17 East Asian Width 1.118 + * 16.. 8 UBlockCode 1.119 + * 7.. 0 UScriptCode, or index to Script_Extensions 1.120 + */ 1.121 + 1.122 +/* derived age: one nibble each for major and minor version numbers */ 1.123 +#define UPROPS_AGE_MASK 0xff000000 1.124 +#define UPROPS_AGE_SHIFT 24 1.125 + 1.126 +/* Script_Extensions: mask includes Script */ 1.127 +#define UPROPS_SCRIPT_X_MASK 0x00c000ff 1.128 +#define UPROPS_SCRIPT_X_SHIFT 22 1.129 + 1.130 +#define UPROPS_EA_MASK 0x000e0000 1.131 +#define UPROPS_EA_SHIFT 17 1.132 + 1.133 +#define UPROPS_BLOCK_MASK 0x0001ff00 1.134 +#define UPROPS_BLOCK_SHIFT 8 1.135 + 1.136 +#define UPROPS_SCRIPT_MASK 0x000000ff 1.137 + 1.138 +/* UPROPS_SCRIPT_X_WITH_COMMON must be the lowest value that involves Script_Extensions. */ 1.139 +#define UPROPS_SCRIPT_X_WITH_COMMON 0x400000 1.140 +#define UPROPS_SCRIPT_X_WITH_INHERITED 0x800000 1.141 +#define UPROPS_SCRIPT_X_WITH_OTHER 0xc00000 1.142 + 1.143 +/* 1.144 + * Properties in vector word 1 1.145 + * Each bit encodes one binary property. 1.146 + * The following constants represent the bit number, use 1<<UPROPS_XYZ. 1.147 + * UPROPS_BINARY_1_TOP<=32! 1.148 + * 1.149 + * Keep this list of property enums in sync with 1.150 + * propListNames[] in icu/source/tools/genprops/props2.c! 1.151 + * 1.152 + * ICU 2.6/uprops format version 3.2 stores full properties instead of "Other_". 1.153 + */ 1.154 +enum { 1.155 + UPROPS_WHITE_SPACE, 1.156 + UPROPS_DASH, 1.157 + UPROPS_HYPHEN, 1.158 + UPROPS_QUOTATION_MARK, 1.159 + UPROPS_TERMINAL_PUNCTUATION, 1.160 + UPROPS_MATH, 1.161 + UPROPS_HEX_DIGIT, 1.162 + UPROPS_ASCII_HEX_DIGIT, 1.163 + UPROPS_ALPHABETIC, 1.164 + UPROPS_IDEOGRAPHIC, 1.165 + UPROPS_DIACRITIC, 1.166 + UPROPS_EXTENDER, 1.167 + UPROPS_NONCHARACTER_CODE_POINT, 1.168 + UPROPS_GRAPHEME_EXTEND, 1.169 + UPROPS_GRAPHEME_LINK, 1.170 + UPROPS_IDS_BINARY_OPERATOR, 1.171 + UPROPS_IDS_TRINARY_OPERATOR, 1.172 + UPROPS_RADICAL, 1.173 + UPROPS_UNIFIED_IDEOGRAPH, 1.174 + UPROPS_DEFAULT_IGNORABLE_CODE_POINT, 1.175 + UPROPS_DEPRECATED, 1.176 + UPROPS_LOGICAL_ORDER_EXCEPTION, 1.177 + UPROPS_XID_START, 1.178 + UPROPS_XID_CONTINUE, 1.179 + UPROPS_ID_START, /* ICU 2.6, uprops format version 3.2 */ 1.180 + UPROPS_ID_CONTINUE, 1.181 + UPROPS_GRAPHEME_BASE, 1.182 + UPROPS_S_TERM, /* new in ICU 3.0 and Unicode 4.0.1 */ 1.183 + UPROPS_VARIATION_SELECTOR, 1.184 + UPROPS_PATTERN_SYNTAX, /* new in ICU 3.4 and Unicode 4.1 */ 1.185 + UPROPS_PATTERN_WHITE_SPACE, 1.186 + UPROPS_RESERVED, /* reserved & unused */ 1.187 + UPROPS_BINARY_1_TOP /* ==32 - full! */ 1.188 +}; 1.189 + 1.190 +/* 1.191 + * Properties in vector word 2 1.192 + * Bits 1.193 + * 31..26 reserved 1.194 + * 25..20 Line Break 1.195 + * 19..15 Sentence Break 1.196 + * 14..10 Word Break 1.197 + * 9.. 5 Grapheme Cluster Break 1.198 + * 4.. 0 Decomposition Type 1.199 + */ 1.200 +#define UPROPS_LB_MASK 0x03f00000 1.201 +#define UPROPS_LB_SHIFT 20 1.202 + 1.203 +#define UPROPS_SB_MASK 0x000f8000 1.204 +#define UPROPS_SB_SHIFT 15 1.205 + 1.206 +#define UPROPS_WB_MASK 0x00007c00 1.207 +#define UPROPS_WB_SHIFT 10 1.208 + 1.209 +#define UPROPS_GCB_MASK 0x000003e0 1.210 +#define UPROPS_GCB_SHIFT 5 1.211 + 1.212 +#define UPROPS_DT_MASK 0x0000001f 1.213 + 1.214 +/** 1.215 + * Gets the main properties value for a code point. 1.216 + * Implemented in uchar.c for uprops.cpp. 1.217 + */ 1.218 +U_CFUNC uint32_t 1.219 +u_getMainProperties(UChar32 c); 1.220 + 1.221 +/** 1.222 + * Get a properties vector word for a code point. 1.223 + * Implemented in uchar.c for uprops.cpp. 1.224 + * @return 0 if no data or illegal argument 1.225 + */ 1.226 +U_CFUNC uint32_t 1.227 +u_getUnicodeProperties(UChar32 c, int32_t column); 1.228 + 1.229 +/** 1.230 + * Get the the maximum values for some enum/int properties. 1.231 + * Use the same column numbers as for u_getUnicodeProperties(). 1.232 + * The returned value will contain maximum values stored in the same bit fields 1.233 + * as where the enum values are stored in the u_getUnicodeProperties() 1.234 + * return values for the same columns. 1.235 + * 1.236 + * Valid columns are those for properties words that contain enumerated values. 1.237 + * (ICU 2.6: columns 0 and 2) 1.238 + * For other column numbers, this function will return 0. 1.239 + * 1.240 + * @internal 1.241 + */ 1.242 +U_CFUNC int32_t 1.243 +uprv_getMaxValues(int32_t column); 1.244 + 1.245 +/** 1.246 + * Checks if c is alphabetic, or a decimal digit; implements UCHAR_POSIX_ALNUM. 1.247 + * @internal 1.248 + */ 1.249 +U_CFUNC UBool 1.250 +u_isalnumPOSIX(UChar32 c); 1.251 + 1.252 +/** 1.253 + * Checks if c is in 1.254 + * [^\p{space}\p{gc=Control}\p{gc=Surrogate}\p{gc=Unassigned}] 1.255 + * with space=\p{Whitespace} and Control=Cc. 1.256 + * Implements UCHAR_POSIX_GRAPH. 1.257 + * @internal 1.258 + */ 1.259 +U_CFUNC UBool 1.260 +u_isgraphPOSIX(UChar32 c); 1.261 + 1.262 +/** 1.263 + * Checks if c is in \p{graph}\p{blank} - \p{cntrl}. 1.264 + * Implements UCHAR_POSIX_PRINT. 1.265 + * @internal 1.266 + */ 1.267 +U_CFUNC UBool 1.268 +u_isprintPOSIX(UChar32 c); 1.269 + 1.270 +/** Turn a bit index into a bit flag. @internal */ 1.271 +#define FLAG(n) ((uint32_t)1<<(n)) 1.272 + 1.273 +/** Flags for general categories in the order of UCharCategory. @internal */ 1.274 +#define _Cn FLAG(U_GENERAL_OTHER_TYPES) 1.275 +#define _Lu FLAG(U_UPPERCASE_LETTER) 1.276 +#define _Ll FLAG(U_LOWERCASE_LETTER) 1.277 +#define _Lt FLAG(U_TITLECASE_LETTER) 1.278 +#define _Lm FLAG(U_MODIFIER_LETTER) 1.279 +/* #define _Lo FLAG(U_OTHER_LETTER) -- conflicts with MS Visual Studio 9.0 xiosbase */ 1.280 +#define _Mn FLAG(U_NON_SPACING_MARK) 1.281 +#define _Me FLAG(U_ENCLOSING_MARK) 1.282 +#define _Mc FLAG(U_COMBINING_SPACING_MARK) 1.283 +#define _Nd FLAG(U_DECIMAL_DIGIT_NUMBER) 1.284 +#define _Nl FLAG(U_LETTER_NUMBER) 1.285 +#define _No FLAG(U_OTHER_NUMBER) 1.286 +#define _Zs FLAG(U_SPACE_SEPARATOR) 1.287 +#define _Zl FLAG(U_LINE_SEPARATOR) 1.288 +#define _Zp FLAG(U_PARAGRAPH_SEPARATOR) 1.289 +#define _Cc FLAG(U_CONTROL_CHAR) 1.290 +#define _Cf FLAG(U_FORMAT_CHAR) 1.291 +#define _Co FLAG(U_PRIVATE_USE_CHAR) 1.292 +#define _Cs FLAG(U_SURROGATE) 1.293 +#define _Pd FLAG(U_DASH_PUNCTUATION) 1.294 +#define _Ps FLAG(U_START_PUNCTUATION) 1.295 +/* #define _Pe FLAG(U_END_PUNCTUATION) -- conflicts with MS Visual Studio 9.0 xlocnum */ 1.296 +/* #define _Pc FLAG(U_CONNECTOR_PUNCTUATION) -- conflicts with MS Visual Studio 9.0 streambuf */ 1.297 +#define _Po FLAG(U_OTHER_PUNCTUATION) 1.298 +#define _Sm FLAG(U_MATH_SYMBOL) 1.299 +#define _Sc FLAG(U_CURRENCY_SYMBOL) 1.300 +#define _Sk FLAG(U_MODIFIER_SYMBOL) 1.301 +#define _So FLAG(U_OTHER_SYMBOL) 1.302 +#define _Pi FLAG(U_INITIAL_PUNCTUATION) 1.303 +/* #define _Pf FLAG(U_FINAL_PUNCTUATION) -- conflicts with MS Visual Studio 9.0 streambuf */ 1.304 + 1.305 +/** Some code points. @internal */ 1.306 +enum { 1.307 + TAB =0x0009, 1.308 + LF =0x000a, 1.309 + FF =0x000c, 1.310 + CR =0x000d, 1.311 + U_A =0x0041, 1.312 + U_F =0x0046, 1.313 + U_Z =0x005a, 1.314 + U_a =0x0061, 1.315 + U_f =0x0066, 1.316 + U_z =0x007a, 1.317 + DEL =0x007f, 1.318 + NL =0x0085, 1.319 + NBSP =0x00a0, 1.320 + CGJ =0x034f, 1.321 + FIGURESP=0x2007, 1.322 + HAIRSP =0x200a, 1.323 + ZWNJ =0x200c, 1.324 + ZWJ =0x200d, 1.325 + RLM =0x200f, 1.326 + NNBSP =0x202f, 1.327 + WJ =0x2060, 1.328 + INHSWAP =0x206a, 1.329 + NOMDIG =0x206f, 1.330 + U_FW_A =0xff21, 1.331 + U_FW_F =0xff26, 1.332 + U_FW_Z =0xff3a, 1.333 + U_FW_a =0xff41, 1.334 + U_FW_f =0xff46, 1.335 + U_FW_z =0xff5a, 1.336 + ZWNBSP =0xfeff 1.337 +}; 1.338 + 1.339 +/** 1.340 + * Get the maximum length of a (regular/1.0/extended) character name. 1.341 + * @return 0 if no character names available. 1.342 + */ 1.343 +U_CAPI int32_t U_EXPORT2 1.344 +uprv_getMaxCharNameLength(void); 1.345 + 1.346 +/** 1.347 + * Fills set with characters that are used in Unicode character names. 1.348 + * Includes all characters that are used in regular/Unicode 1.0/extended names. 1.349 + * Just empties the set if no character names are available. 1.350 + * @param sa USetAdder to receive characters. 1.351 + */ 1.352 +U_CAPI void U_EXPORT2 1.353 +uprv_getCharNameCharacters(const USetAdder *sa); 1.354 + 1.355 +/** 1.356 + * Constants for which data and implementation files provide which properties. 1.357 + * Used by UnicodeSet for service-specific property enumeration. 1.358 + * @internal 1.359 + */ 1.360 +enum UPropertySource { 1.361 + /** No source, not a supported property. */ 1.362 + UPROPS_SRC_NONE, 1.363 + /** From uchar.c/uprops.icu main trie */ 1.364 + UPROPS_SRC_CHAR, 1.365 + /** From uchar.c/uprops.icu properties vectors trie */ 1.366 + UPROPS_SRC_PROPSVEC, 1.367 + /** From unames.c/unames.icu */ 1.368 + UPROPS_SRC_NAMES, 1.369 + /** From ucase.c/ucase.icu */ 1.370 + UPROPS_SRC_CASE, 1.371 + /** From ubidi_props.c/ubidi.icu */ 1.372 + UPROPS_SRC_BIDI, 1.373 + /** From uchar.c/uprops.icu main trie as well as properties vectors trie */ 1.374 + UPROPS_SRC_CHAR_AND_PROPSVEC, 1.375 + /** From ucase.c/ucase.icu as well as unorm.cpp/unorm.icu */ 1.376 + UPROPS_SRC_CASE_AND_NORM, 1.377 + /** From normalizer2impl.cpp/nfc.nrm */ 1.378 + UPROPS_SRC_NFC, 1.379 + /** From normalizer2impl.cpp/nfkc.nrm */ 1.380 + UPROPS_SRC_NFKC, 1.381 + /** From normalizer2impl.cpp/nfkc_cf.nrm */ 1.382 + UPROPS_SRC_NFKC_CF, 1.383 + /** From normalizer2impl.cpp/nfc.nrm canonical iterator data */ 1.384 + UPROPS_SRC_NFC_CANON_ITER, 1.385 + /** One more than the highest UPropertySource (UPROPS_SRC_) constant. */ 1.386 + UPROPS_SRC_COUNT 1.387 +}; 1.388 +typedef enum UPropertySource UPropertySource; 1.389 + 1.390 +/** 1.391 + * @see UPropertySource 1.392 + * @internal 1.393 + */ 1.394 +U_CFUNC UPropertySource U_EXPORT2 1.395 +uprops_getSource(UProperty which); 1.396 + 1.397 +/** 1.398 + * Enumerate uprops.icu's main data trie and add the 1.399 + * start of each range of same properties to the set. 1.400 + * @internal 1.401 + */ 1.402 +U_CFUNC void U_EXPORT2 1.403 +uchar_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode); 1.404 + 1.405 +/** 1.406 + * Enumerate uprops.icu's properties vectors trie and add the 1.407 + * start of each range of same properties to the set. 1.408 + * @internal 1.409 + */ 1.410 +U_CFUNC void U_EXPORT2 1.411 +upropsvec_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode); 1.412 + 1.413 +/** 1.414 + * Return a set of characters for property enumeration. 1.415 + * For each two consecutive characters (start, limit) in the set, 1.416 + * all of the properties for start..limit-1 are all the same. 1.417 + * 1.418 + * @param sa USetAdder to receive result. Existing contents are lost. 1.419 + * @internal 1.420 + */ 1.421 +/*U_CFUNC void U_EXPORT2 1.422 +uprv_getInclusions(const USetAdder *sa, UErrorCode *pErrorCode); 1.423 +*/ 1.424 + 1.425 +/** 1.426 + * Swap the ICU Unicode character names file. See uchar.c. 1.427 + * @internal 1.428 + */ 1.429 +U_CAPI int32_t U_EXPORT2 1.430 +uchar_swapNames(const UDataSwapper *ds, 1.431 + const void *inData, int32_t length, void *outData, 1.432 + UErrorCode *pErrorCode); 1.433 + 1.434 +#ifdef __cplusplus 1.435 + 1.436 +U_NAMESPACE_BEGIN 1.437 + 1.438 +class UnicodeSet; 1.439 + 1.440 +// implemented in uniset_props.cpp 1.441 +U_CFUNC UnicodeSet * 1.442 +uniset_getUnicode32Instance(UErrorCode &errorCode); 1.443 + 1.444 +U_NAMESPACE_END 1.445 + 1.446 +#endif 1.447 + 1.448 +#endif