intl/icu/source/common/uprops.h

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/common/uprops.h	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,445 @@
     1.4 +/*
     1.5 +*******************************************************************************
     1.6 +*
     1.7 +*   Copyright (C) 2002-2012, International Business Machines
     1.8 +*   Corporation and others.  All Rights Reserved.
     1.9 +*
    1.10 +*******************************************************************************
    1.11 +*   file name:  uprops.h
    1.12 +*   encoding:   US-ASCII
    1.13 +*   tab size:   8 (not used)
    1.14 +*   indentation:4
    1.15 +*
    1.16 +*   created on: 2002feb24
    1.17 +*   created by: Markus W. Scherer
    1.18 +*
    1.19 +*   Constants for mostly non-core Unicode character properties
    1.20 +*   stored in uprops.icu.
    1.21 +*/
    1.22 +
    1.23 +#ifndef __UPROPS_H__
    1.24 +#define __UPROPS_H__
    1.25 +
    1.26 +#include "unicode/utypes.h"
    1.27 +#include "unicode/uset.h"
    1.28 +#include "uset_imp.h"
    1.29 +#include "udataswp.h"
    1.30 +
    1.31 +/* indexes[] entries; constants without an explicit value count up from 0 */
    1.32 +enum {
    1.33 +    UPROPS_PROPS32_INDEX,
    1.34 +    UPROPS_EXCEPTIONS_INDEX,
    1.35 +    UPROPS_EXCEPTIONS_TOP_INDEX,
    1.36 +
    1.37 +    UPROPS_ADDITIONAL_TRIE_INDEX,
    1.38 +    UPROPS_ADDITIONAL_VECTORS_INDEX,
    1.39 +    UPROPS_ADDITIONAL_VECTORS_COLUMNS_INDEX,
    1.40 +
    1.41 +    UPROPS_SCRIPT_EXTENSIONS_INDEX,
    1.42 +
    1.43 +    UPROPS_RESERVED_INDEX_7,
    1.44 +    UPROPS_RESERVED_INDEX_8,
    1.45 +
    1.46 +    /* size of the data file (number of 32-bit units after the header) */
    1.47 +    UPROPS_DATA_TOP_INDEX,                      /* ==9 */
    1.48 +
    1.49 +    /* maximum values for code values in vector word 0 */
    1.50 +    UPROPS_MAX_VALUES_INDEX=10,
    1.51 +    /* maximum values for code values in vector word 2 */
    1.52 +    UPROPS_MAX_VALUES_2_INDEX,                  /* ==11 */
    1.53 +
    1.54 +    UPROPS_INDEX_COUNT=16                       /* 12..15 have no named constant in this enum */
    1.55 +};
    1.56 +
    1.57 +/* definitions for the main properties words; enum comments give field, start bit, (width) */
    1.58 +enum {
    1.59 +    /* general category shift==0                                0 (5 bits) */
    1.60 +    /* reserved                                                 5 (1 bit) */
    1.61 +    UPROPS_NUMERIC_TYPE_VALUE_SHIFT=6                       /*  6 (10 bits) */
    1.62 +};
    1.63 +
    1.64 +#define GET_CATEGORY(props) ((props)&0x1f) /* low 5 bits: general category */
    1.65 +#define CAT_MASK(props) U_MASK(GET_CATEGORY(props)) /* U_MASK() is defined outside this header */
    1.66 +
    1.67 +#define GET_NUMERIC_TYPE_VALUE(props) ((props)>>UPROPS_NUMERIC_TYPE_VALUE_SHIFT) /* no mask applied: yields all bits from bit 6 upward */
    1.68 +
    1.69 +/* constants for the storage form of numeric types and values (ntv=stored code, nv=numeric value) */
    1.70 +enum {
    1.71 +    /** No numeric value. */
    1.72 +    UPROPS_NTV_NONE=0,
    1.73 +    /** Decimal digits: nv=0..9 (ntv=1..10) */
    1.74 +    UPROPS_NTV_DECIMAL_START=1,
    1.75 +    /** Other digits: nv=0..9 (ntv=11..20) */
    1.76 +    UPROPS_NTV_DIGIT_START=11,
    1.77 +    /** Small integers: nv=0..154 (ntv=21..0xaf) */
    1.78 +    UPROPS_NTV_NUMERIC_START=21,
    1.79 +    /** Fractions (ntv=0xb0..0x1df): ((ntv>>4)-12) / ((ntv&0xf)+1) = -1..17 / 1..16 */
    1.80 +    UPROPS_NTV_FRACTION_START=0xb0,
    1.81 +    /**
    1.82 +     * Large integers (ntv=0x1e0..0x2ff):
    1.83 +     * ((ntv>>5)-14) * 10^((ntv&0x1f)+2) = (1..9)*(10^2..10^33)
    1.84 +     * (only one significant decimal digit)
    1.85 +     */
    1.86 +    UPROPS_NTV_LARGE_START=0x1e0,
    1.87 +    /**
    1.88 +     * Sexagesimal numbers (ntv=0x300..0x323):
    1.89 +     * ((ntv>>2)-0xbf) * 60^((ntv&3)+1) = (1..9)*(60^1..60^4)
    1.90 +     */
    1.91 +    UPROPS_NTV_BASE60_START=0x300,
    1.92 +    /** No numeric value (yet): ntv>=0x324. */
    1.93 +    UPROPS_NTV_RESERVED_START=UPROPS_NTV_BASE60_START+36,  /* 0x300+9*4=0x324 */
    1.94 +
    1.95 +    UPROPS_NTV_MAX_SMALL_INT=UPROPS_NTV_FRACTION_START-UPROPS_NTV_NUMERIC_START-1 /* 0xb0-21-1=154 */
    1.96 +};
    1.97 +/* Map a stored numeric type/value code to a U_NT_* type; everything from UPROPS_NTV_NUMERIC_START up yields U_NT_NUMERIC. ntv is parenthesized so any expression is safe, but it may be evaluated up to three times. */
    1.98 +#define UPROPS_NTV_GET_TYPE(ntv) \
    1.99 +    (((ntv)==UPROPS_NTV_NONE) ? U_NT_NONE : \
   1.100 +    ((ntv)<UPROPS_NTV_DIGIT_START) ?  U_NT_DECIMAL : \
   1.101 +    ((ntv)<UPROPS_NTV_NUMERIC_START) ? U_NT_DIGIT : \
   1.102 +    U_NT_NUMERIC)
   1.103 +
   1.104 +/* number of properties vector words (words 0, 1 and 2 are described below) */
   1.105 +#define UPROPS_VECTOR_WORDS     3
   1.106 +
   1.107 +/*
   1.108 + * Properties in vector word 0
   1.109 + * Bits
   1.110 + * 31..24   DerivedAge version major/minor one nibble each
   1.111 + * 23..22   3..1: Bits 7..0 = Script_Extensions index
   1.112 + *             3: Script value from Script_Extensions
   1.113 + *             2: Script=Inherited
   1.114 + *             1: Script=Common
   1.115 + *             0: Script=bits 7..0
   1.116 + * 21..20   reserved
   1.117 + * 19..17   East Asian Width
   1.118 + * 16.. 8   UBlockCode
   1.119 + *  7.. 0   UScriptCode, or index to Script_Extensions
   1.120 + */
   1.121 +
   1.122 +/* derived age: one nibble each for major and minor version numbers (bits 31..24) */
   1.123 +#define UPROPS_AGE_MASK         0xff000000
   1.124 +#define UPROPS_AGE_SHIFT        24
   1.125 +
   1.126 +/* Script_Extensions: mask includes Script (bits 23..22 plus bits 7..0) */
   1.127 +#define UPROPS_SCRIPT_X_MASK    0x00c000ff
   1.128 +#define UPROPS_SCRIPT_X_SHIFT   22
   1.129 +
   1.130 +#define UPROPS_EA_MASK          0x000e0000 /* bits 19..17: East Asian Width */
   1.131 +#define UPROPS_EA_SHIFT         17
   1.132 +
   1.133 +#define UPROPS_BLOCK_MASK       0x0001ff00 /* bits 16..8: UBlockCode */
   1.134 +#define UPROPS_BLOCK_SHIFT      8
   1.135 +
   1.136 +#define UPROPS_SCRIPT_MASK      0x000000ff /* bits 7..0, shift==0 so there is no shift constant */
   1.137 +
   1.138 +/* UPROPS_SCRIPT_X_WITH_COMMON must be the lowest value that involves Script_Extensions. */
   1.139 +#define UPROPS_SCRIPT_X_WITH_COMMON     0x400000 /* bits 23..22 == 1 */
   1.140 +#define UPROPS_SCRIPT_X_WITH_INHERITED  0x800000 /* bits 23..22 == 2 */
   1.141 +#define UPROPS_SCRIPT_X_WITH_OTHER      0xc00000 /* bits 23..22 == 3 */
   1.142 +
   1.143 +/*
   1.144 + * Properties in vector word 1
   1.145 + * Each bit encodes one binary property.
   1.146 + * The following constants represent the bit number, use 1<<UPROPS_XYZ.
   1.147 + * UPROPS_BINARY_1_TOP<=32!
   1.148 + *
   1.149 + * Keep this list of property enums in sync with
   1.150 + * propListNames[] in icu/source/tools/genprops/props2.c!
   1.151 + *
   1.152 + * ICU 2.6/uprops format version 3.2 stores full properties instead of "Other_".
   1.153 + */
   1.154 +enum {
   1.155 +    UPROPS_WHITE_SPACE,                         /* bit 0 */
   1.156 +    UPROPS_DASH,
   1.157 +    UPROPS_HYPHEN,
   1.158 +    UPROPS_QUOTATION_MARK,
   1.159 +    UPROPS_TERMINAL_PUNCTUATION,
   1.160 +    UPROPS_MATH,
   1.161 +    UPROPS_HEX_DIGIT,
   1.162 +    UPROPS_ASCII_HEX_DIGIT,
   1.163 +    UPROPS_ALPHABETIC,
   1.164 +    UPROPS_IDEOGRAPHIC,
   1.165 +    UPROPS_DIACRITIC,
   1.166 +    UPROPS_EXTENDER,
   1.167 +    UPROPS_NONCHARACTER_CODE_POINT,
   1.168 +    UPROPS_GRAPHEME_EXTEND,
   1.169 +    UPROPS_GRAPHEME_LINK,
   1.170 +    UPROPS_IDS_BINARY_OPERATOR,
   1.171 +    UPROPS_IDS_TRINARY_OPERATOR,
   1.172 +    UPROPS_RADICAL,
   1.173 +    UPROPS_UNIFIED_IDEOGRAPH,
   1.174 +    UPROPS_DEFAULT_IGNORABLE_CODE_POINT,
   1.175 +    UPROPS_DEPRECATED,
   1.176 +    UPROPS_LOGICAL_ORDER_EXCEPTION,
   1.177 +    UPROPS_XID_START,
   1.178 +    UPROPS_XID_CONTINUE,
   1.179 +    UPROPS_ID_START,                            /* ICU 2.6, uprops format version 3.2 */
   1.180 +    UPROPS_ID_CONTINUE,
   1.181 +    UPROPS_GRAPHEME_BASE,
   1.182 +    UPROPS_S_TERM,                              /* new in ICU 3.0 and Unicode 4.0.1 */
   1.183 +    UPROPS_VARIATION_SELECTOR,
   1.184 +    UPROPS_PATTERN_SYNTAX,                      /* new in ICU 3.4 and Unicode 4.1 */
   1.185 +    UPROPS_PATTERN_WHITE_SPACE,
   1.186 +    UPROPS_RESERVED,                            /* bit 31; reserved & unused */
   1.187 +    UPROPS_BINARY_1_TOP                         /* ==32 - full! */
   1.188 +};
   1.189 +
   1.190 +/*
   1.191 + * Properties in vector word 2
   1.192 + * Bits
   1.193 + * 31..26   reserved
   1.194 + * 25..20   Line Break
   1.195 + * 19..15   Sentence Break
   1.196 + * 14..10   Word Break
   1.197 + *  9.. 5   Grapheme Cluster Break
   1.198 + *  4.. 0   Decomposition Type
   1.199 + */
   1.200 +#define UPROPS_LB_MASK          0x03f00000 /* bits 25..20: Line Break */
   1.201 +#define UPROPS_LB_SHIFT         20
   1.202 +
   1.203 +#define UPROPS_SB_MASK          0x000f8000 /* bits 19..15: Sentence Break */
   1.204 +#define UPROPS_SB_SHIFT         15
   1.205 +
   1.206 +#define UPROPS_WB_MASK          0x00007c00 /* bits 14..10: Word Break */
   1.207 +#define UPROPS_WB_SHIFT         10
   1.208 +
   1.209 +#define UPROPS_GCB_MASK         0x000003e0 /* bits 9..5: Grapheme Cluster Break */
   1.210 +#define UPROPS_GCB_SHIFT        5
   1.211 +
   1.212 +#define UPROPS_DT_MASK          0x0000001f /* bits 4..0: Decomposition Type; shift==0 so there is no shift constant */
   1.213 +
   1.214 +/**
   1.215 + * Gets the main properties value for a code point.
   1.216 + * Implemented in uchar.c for uprops.cpp.
   1.217 + */
   1.218 +U_CFUNC uint32_t
   1.219 +u_getMainProperties(UChar32 c);
   1.220 +
   1.221 +/**
   1.222 + * Get a properties vector word for a code point.
   1.223 + * Implemented in uchar.c for uprops.cpp.
   1.224 + * @return 0 if no data or illegal argument
   1.225 + */
   1.226 +U_CFUNC uint32_t
   1.227 +u_getUnicodeProperties(UChar32 c, int32_t column);
   1.228 +
   1.229 +/**
   1.230 + * Get the maximum values for some enum/int properties.
   1.231 + * Use the same column numbers as for u_getUnicodeProperties().
   1.232 + * The returned value will contain maximum values stored in the same bit fields
   1.233 + * as where the enum values are stored in the u_getUnicodeProperties()
   1.234 + * return values for the same columns.
   1.235 + *
   1.236 + * Valid columns are those for properties words that contain enumerated values.
   1.237 + * (ICU 2.6: columns 0 and 2; cf. UPROPS_MAX_VALUES_INDEX and UPROPS_MAX_VALUES_2_INDEX)
   1.238 + * For other column numbers, this function will return 0.
   1.239 + *
   1.240 + * @internal
   1.241 + */
   1.242 +U_CFUNC int32_t
   1.243 +uprv_getMaxValues(int32_t column);
   1.244 +
   1.245 +/**
   1.246 + * Checks if c is alphabetic, or a decimal digit; implements UCHAR_POSIX_ALNUM.
   1.247 + * @internal
   1.248 + */
   1.249 +U_CFUNC UBool
   1.250 +u_isalnumPOSIX(UChar32 c);
   1.251 +
   1.252 +/**
   1.253 + * Checks if c is in
   1.254 + * [^\p{space}\p{gc=Control}\p{gc=Surrogate}\p{gc=Unassigned}]
   1.255 + * with space=\p{Whitespace} and Control=Cc.
   1.256 + * Implements UCHAR_POSIX_GRAPH.
   1.257 + * @internal
   1.258 + */
   1.259 +U_CFUNC UBool
   1.260 +u_isgraphPOSIX(UChar32 c);
   1.261 +
   1.262 +/**
   1.263 + * Checks if c is in \p{graph}\p{blank} - \p{cntrl}.
   1.264 + * Implements UCHAR_POSIX_PRINT.
   1.265 + * @internal
   1.266 + */
   1.267 +U_CFUNC UBool
   1.268 +u_isprintPOSIX(UChar32 c);
   1.269 +
   1.270 +/** Turn a bit index into a bit flag (a uint32_t with only bit n set). @internal */
   1.271 +#define FLAG(n) ((uint32_t)1<<(n))
   1.272 +
   1.273 +/** Flags for general categories in the order of UCharCategory; _Lo, _Pe, _Pc and _Pf are deliberately left undefined (see the commented-out lines). @internal */
   1.274 +#define _Cn     FLAG(U_GENERAL_OTHER_TYPES)
   1.275 +#define _Lu     FLAG(U_UPPERCASE_LETTER)
   1.276 +#define _Ll     FLAG(U_LOWERCASE_LETTER)
   1.277 +#define _Lt     FLAG(U_TITLECASE_LETTER)
   1.278 +#define _Lm     FLAG(U_MODIFIER_LETTER)
   1.279 +/* #define _Lo     FLAG(U_OTHER_LETTER) -- conflicts with MS Visual Studio 9.0 xiosbase */
   1.280 +#define _Mn     FLAG(U_NON_SPACING_MARK)
   1.281 +#define _Me     FLAG(U_ENCLOSING_MARK)
   1.282 +#define _Mc     FLAG(U_COMBINING_SPACING_MARK)
   1.283 +#define _Nd     FLAG(U_DECIMAL_DIGIT_NUMBER)
   1.284 +#define _Nl     FLAG(U_LETTER_NUMBER)
   1.285 +#define _No     FLAG(U_OTHER_NUMBER)
   1.286 +#define _Zs     FLAG(U_SPACE_SEPARATOR)
   1.287 +#define _Zl     FLAG(U_LINE_SEPARATOR)
   1.288 +#define _Zp     FLAG(U_PARAGRAPH_SEPARATOR)
   1.289 +#define _Cc     FLAG(U_CONTROL_CHAR)
   1.290 +#define _Cf     FLAG(U_FORMAT_CHAR)
   1.291 +#define _Co     FLAG(U_PRIVATE_USE_CHAR)
   1.292 +#define _Cs     FLAG(U_SURROGATE)
   1.293 +#define _Pd     FLAG(U_DASH_PUNCTUATION)
   1.294 +#define _Ps     FLAG(U_START_PUNCTUATION)
   1.295 +/* #define _Pe     FLAG(U_END_PUNCTUATION) -- conflicts with MS Visual Studio 9.0 xlocnum */
   1.296 +/* #define _Pc     FLAG(U_CONNECTOR_PUNCTUATION) -- conflicts with MS Visual Studio 9.0 streambuf */
   1.297 +#define _Po     FLAG(U_OTHER_PUNCTUATION)
   1.298 +#define _Sm     FLAG(U_MATH_SYMBOL)
   1.299 +#define _Sc     FLAG(U_CURRENCY_SYMBOL)
   1.300 +#define _Sk     FLAG(U_MODIFIER_SYMBOL)
   1.301 +#define _So     FLAG(U_OTHER_SYMBOL)
   1.302 +#define _Pi     FLAG(U_INITIAL_PUNCTUATION)
   1.303 +/* #define _Pf     FLAG(U_FINAL_PUNCTUATION) -- conflicts with MS Visual Studio 9.0 streambuf */
   1.304 +
   1.305 +/** Some code points, in ascending order except ZWNBSP (0xfeff), which is listed last. @internal */
   1.306 +enum {
   1.307 +    TAB     =0x0009,
   1.308 +    LF      =0x000a,
   1.309 +    FF      =0x000c,
   1.310 +    CR      =0x000d,
   1.311 +    U_A     =0x0041,
   1.312 +    U_F     =0x0046,
   1.313 +    U_Z     =0x005a,
   1.314 +    U_a     =0x0061,
   1.315 +    U_f     =0x0066,
   1.316 +    U_z     =0x007a,
   1.317 +    DEL     =0x007f,
   1.318 +    NL      =0x0085,
   1.319 +    NBSP    =0x00a0,
   1.320 +    CGJ     =0x034f,
   1.321 +    FIGURESP=0x2007,
   1.322 +    HAIRSP  =0x200a,
   1.323 +    ZWNJ    =0x200c,
   1.324 +    ZWJ     =0x200d,
   1.325 +    RLM     =0x200f,
   1.326 +    NNBSP   =0x202f,
   1.327 +    WJ      =0x2060,
   1.328 +    INHSWAP =0x206a,
   1.329 +    NOMDIG  =0x206f,
   1.330 +    U_FW_A  =0xff21,
   1.331 +    U_FW_F  =0xff26,
   1.332 +    U_FW_Z  =0xff3a,
   1.333 +    U_FW_a  =0xff41,
   1.334 +    U_FW_f  =0xff46,
   1.335 +    U_FW_z  =0xff5a,
   1.336 +    ZWNBSP  =0xfeff
   1.337 +};
   1.338 +
   1.339 +/**
   1.340 + * Get the maximum length of a (regular/Unicode 1.0/extended) character name.
   1.341 + * @return 0 if no character names available.
   1.342 + */
   1.343 +U_CAPI int32_t U_EXPORT2
   1.344 +uprv_getMaxCharNameLength(void);
   1.345 +
   1.346 +/**
   1.347 + * Fills set with characters that are used in Unicode character names.
   1.348 + * Includes all characters that are used in regular/Unicode 1.0/extended names.
   1.349 + * Just empties the set if no character names are available.
   1.350 + * @param sa USetAdder to receive characters.
   1.351 + */
   1.352 +U_CAPI void U_EXPORT2
   1.353 +uprv_getCharNameCharacters(const USetAdder *sa);
   1.354 +
   1.355 +/**
   1.356 + * Constants for which data and implementation files provide which properties.
   1.357 + * Used by UnicodeSet for service-specific property enumeration.
   1.358 + * @internal
   1.359 + */
   1.360 +enum UPropertySource {
   1.361 +    /** No source, not a supported property. */
   1.362 +    UPROPS_SRC_NONE,
   1.363 +    /** From uchar.c/uprops.icu main trie */
   1.364 +    UPROPS_SRC_CHAR,
   1.365 +    /** From uchar.c/uprops.icu properties vectors trie */
   1.366 +    UPROPS_SRC_PROPSVEC,
   1.367 +    /** From unames.c/unames.icu */
   1.368 +    UPROPS_SRC_NAMES,
   1.369 +    /** From ucase.c/ucase.icu */
   1.370 +    UPROPS_SRC_CASE,
   1.371 +    /** From ubidi_props.c/ubidi.icu */
   1.372 +    UPROPS_SRC_BIDI,
   1.373 +    /** From uchar.c/uprops.icu main trie as well as properties vectors trie */
   1.374 +    UPROPS_SRC_CHAR_AND_PROPSVEC,
   1.375 +    /** From ucase.c/ucase.icu as well as unorm.cpp/unorm.icu (NOTE(review): the NFC/NFKC sources below cite normalizer2impl.cpp and .nrm files -- confirm unorm.* is still accurate) */
   1.376 +    UPROPS_SRC_CASE_AND_NORM,
   1.377 +    /** From normalizer2impl.cpp/nfc.nrm */
   1.378 +    UPROPS_SRC_NFC,
   1.379 +    /** From normalizer2impl.cpp/nfkc.nrm */
   1.380 +    UPROPS_SRC_NFKC,
   1.381 +    /** From normalizer2impl.cpp/nfkc_cf.nrm */
   1.382 +    UPROPS_SRC_NFKC_CF,
   1.383 +    /** From normalizer2impl.cpp/nfc.nrm canonical iterator data */
   1.384 +    UPROPS_SRC_NFC_CANON_ITER,
   1.385 +    /** One more than the highest UPropertySource (UPROPS_SRC_) constant (==12). */
   1.386 +    UPROPS_SRC_COUNT
   1.387 +};
   1.388 +typedef enum UPropertySource UPropertySource;
   1.389 +
   1.390 +/**
   1.391 + * @see UPropertySource
   1.392 + * @internal
   1.393 + */
   1.394 +U_CFUNC UPropertySource U_EXPORT2
   1.395 +uprops_getSource(UProperty which);
   1.396 +
   1.397 +/**
   1.398 + * Enumerate uprops.icu's main data trie and add the
   1.399 + * start of each range of same properties to the set.
   1.400 + * @internal
   1.401 + */
   1.402 +U_CFUNC void U_EXPORT2
   1.403 +uchar_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode);
   1.404 +
   1.405 +/**
   1.406 + * Enumerate uprops.icu's properties vectors trie and add the
   1.407 + * start of each range of same properties to the set.
   1.408 + * @internal
   1.409 + */
   1.410 +U_CFUNC void U_EXPORT2
   1.411 +upropsvec_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode);
   1.412 +
   1.413 +/**
   1.414 + * Return a set of characters for property enumeration (the declaration below is currently commented out).
   1.415 + * For each two consecutive characters (start, limit) in the set,
   1.416 + * the properties for start..limit-1 are all the same.
   1.417 + *
   1.418 + * @param sa USetAdder to receive result. Existing contents are lost.
   1.419 + * @internal
   1.420 + */
   1.421 +/*U_CFUNC void U_EXPORT2
   1.422 +uprv_getInclusions(const USetAdder *sa, UErrorCode *pErrorCode);
   1.423 +*/
   1.424 +
   1.425 +/**
   1.426 + * Swap the ICU Unicode character names file. See uchar.c. (NOTE(review): UPROPS_SRC_NAMES attributes names data to unames.c/unames.icu -- confirm which file implements this.)
   1.427 + * @internal
   1.428 + */
   1.429 +U_CAPI int32_t U_EXPORT2
   1.430 +uchar_swapNames(const UDataSwapper *ds,
   1.431 +                const void *inData, int32_t length, void *outData,
   1.432 +                UErrorCode *pErrorCode);
   1.433 +
   1.434 +#ifdef __cplusplus
   1.435 +
   1.436 +U_NAMESPACE_BEGIN
   1.437 +
   1.438 +class UnicodeSet;
   1.439 +
   1.440 +// implemented in uniset_props.cpp
   1.441 +U_CFUNC UnicodeSet *
   1.442 +uniset_getUnicode32Instance(UErrorCode &errorCode);
   1.443 +
   1.444 +U_NAMESPACE_END
   1.445 +
   1.446 +#endif /* __cplusplus */
   1.447 +
   1.448 +#endif /* __UPROPS_H__ */

mercurial