intl/icu/source/common/ucase.h

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/common/ucase.h	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,409 @@
     1.4 +/*
     1.5 +*******************************************************************************
     1.6 +*
     1.7 +*   Copyright (C) 2004-2012, International Business Machines
     1.8 +*   Corporation and others.  All Rights Reserved.
     1.9 +*
    1.10 +*******************************************************************************
    1.11 +*   file name:  ucase.h
    1.12 +*   encoding:   US-ASCII
    1.13 +*   tab size:   8 (not used)
    1.14 +*   indentation:4
    1.15 +*
    1.16 +*   created on: 2004aug30
    1.17 +*   created by: Markus W. Scherer
    1.18 +*
    1.19 +*   Low-level Unicode character/string case mapping code.
    1.20 +*/
    1.21 +
    1.22 +#ifndef __UCASE_H__
    1.23 +#define __UCASE_H__
    1.24 +
    1.25 +#include "unicode/utypes.h"
    1.26 +#include "unicode/uset.h"
    1.27 +#include "putilimp.h"
    1.28 +#include "uset_imp.h"
    1.29 +#include "udataswp.h"
    1.30 +
    1.31 +#ifdef __cplusplus
    1.32 +U_NAMESPACE_BEGIN
    1.33 +
    1.34 +class UnicodeString;
    1.35 +
    1.36 +U_NAMESPACE_END
    1.37 +#endif
    1.38 +
    1.39 +/* library API -------------------------------------------------------------- */
    1.40 +
    1.41 +U_CDECL_BEGIN
    1.42 +
    1.43 +struct UCaseProps;
    1.44 +typedef struct UCaseProps UCaseProps;
    1.45 +
    1.46 +U_CDECL_END
    1.47 +
    1.48 +U_CAPI const UCaseProps * U_EXPORT2
    1.49 +ucase_getSingleton(void);
    1.50 +
    1.51 +U_CFUNC void U_EXPORT2
    1.52 +ucase_addPropertyStarts(const UCaseProps *csp, const USetAdder *sa, UErrorCode *pErrorCode);
    1.53 +
    1.54 +/**
    1.55 + * Requires a non-NULL locale ID but otherwise does the equivalent of checking for
    1.56 + * language codes as if uloc_getLanguage() were called: accepts both 2- and 3-letter codes and case variants.
    1.57 + * NOTE(review): the result is presumably one of the UCASE_LOC_ casing locale types, and locCache presumably a caller-owned cache of that result -- confirm against the implementation.
    1.58 + */
    1.59 +U_CFUNC int32_t
    1.60 +ucase_getCaseLocale(const char *locale, int32_t *locCache);
    1.61 +
    1.62 +/* Casing locale types for ucase_getCaseLocale */
    1.63 +enum {
    1.64 +    UCASE_LOC_UNKNOWN,
    1.65 +    UCASE_LOC_ROOT,
    1.66 +    UCASE_LOC_TURKISH,
    1.67 +    UCASE_LOC_LITHUANIAN,
    1.68 +    UCASE_LOC_DUTCH
    1.69 +};
    1.70 +
    1.71 +/**
    1.72 + * Bit mask for getting just the options from a string compare options word
    1.73 + * that are relevant for case-insensitive string comparison.
    1.74 + * See uchar.h. Also include _STRNCMP_STYLE and U_COMPARE_CODE_POINT_ORDER.
    1.75 + * @internal
    1.76 + */
    1.77 +#define _STRCASECMP_OPTIONS_MASK 0xffff
    1.78 +
    1.79 +/**
    1.80 + * Bit mask for getting just the options from a string compare options word
    1.81 + * that are relevant for case folding (of a single string or code point).
    1.82 + * See uchar.h.
    1.83 + * @internal
    1.84 + */
    1.85 +#define _FOLD_CASE_OPTIONS_MASK 0xff
    1.86 +
    1.87 +/* single-code point functions */
    1.88 +
    1.89 +U_CAPI UChar32 U_EXPORT2
    1.90 +ucase_tolower(const UCaseProps *csp, UChar32 c);
    1.91 +
    1.92 +U_CAPI UChar32 U_EXPORT2
    1.93 +ucase_toupper(const UCaseProps *csp, UChar32 c);
    1.94 +
    1.95 +U_CAPI UChar32 U_EXPORT2
    1.96 +ucase_totitle(const UCaseProps *csp, UChar32 c);
    1.97 +
    1.98 +U_CAPI UChar32 U_EXPORT2
    1.99 +ucase_fold(const UCaseProps *csp, UChar32 c, uint32_t options);
   1.100 +
   1.101 +/**
   1.102 + * Adds all simple case mappings and the full case folding for c to sa,
   1.103 + * and also adds special case closure mappings.
   1.104 + * c itself is not added.
   1.105 + * For example, the mappings
   1.106 + * - for s include long s
   1.107 + * - for sharp s include ss
   1.108 + * - for k include the Kelvin sign
   1.109 + */
   1.110 +U_CFUNC void U_EXPORT2
   1.111 +ucase_addCaseClosure(const UCaseProps *csp, UChar32 c, const USetAdder *sa);
   1.112 +
   1.113 +/**
   1.114 + * Maps the string to single code points and adds the associated case closure
   1.115 + * mappings.
   1.116 + * The string is mapped to code points if it is their full case folding string.
   1.117 + * In other words, this performs a reverse full case folding and then
   1.118 + * adds the case closure items of the resulting code points.
   1.119 + * If the string is found and its closure applied, then
   1.120 + * the string itself is added as well as part of its code points' closure.
   1.121 + * The caller must pass length>=0.
   1.122 + *
   1.123 + * @return TRUE if the string was found
   1.124 + */
   1.125 +U_CFUNC UBool U_EXPORT2
   1.126 +ucase_addStringCaseClosure(const UCaseProps *csp, const UChar *s, int32_t length, const USetAdder *sa);
   1.127 +
   1.128 +#ifdef __cplusplus
   1.129 +U_NAMESPACE_BEGIN
   1.130 +
   1.131 +/**
   1.132 + * Iterator over characters with more than one code point in the full default Case_Folding.
   1.133 + */
   1.134 +class U_COMMON_API FullCaseFoldingIterator {
   1.135 +public:
   1.136 +    /** Constructor. */
   1.137 +    FullCaseFoldingIterator();
   1.138 +    /**
   1.139 +     * Returns the next (cp, full) pair where "full" is cp's full default Case_Folding.
   1.140 +     * Returns a negative cp value at the end of the iteration.
   1.141 +     */
   1.142 +    UChar32 next(UnicodeString &full);
   1.143 +private:
   1.144 +    FullCaseFoldingIterator(const FullCaseFoldingIterator &);  // no copy
   1.145 +    FullCaseFoldingIterator &operator=(const FullCaseFoldingIterator &);  // no assignment
   1.146 +
   1.147 +    const UChar *unfold;
   1.148 +    int32_t unfoldRows;
   1.149 +    int32_t unfoldRowWidth;
   1.150 +    int32_t unfoldStringWidth;
   1.151 +    int32_t currentRow;
   1.152 +    int32_t rowCpIndex;
   1.153 +};
   1.154 +
   1.155 +U_NAMESPACE_END
   1.156 +#endif
   1.157 +
   1.158 +/** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */
   1.159 +U_CAPI int32_t U_EXPORT2
   1.160 +ucase_getType(const UCaseProps *csp, UChar32 c);
   1.161 +
   1.162 +/** @return same as ucase_getType(), or <0 if c is case-ignorable. NOTE(review): the UCASE_GET_TYPE_AND_IGNORABLE macro ((props)&7) suggests the ignorable state may instead be reported by setting UCASE_IGNORABLE (4) in the result -- confirm against the implementation. */
   1.163 +U_CAPI int32_t U_EXPORT2
   1.164 +ucase_getTypeOrIgnorable(const UCaseProps *csp, UChar32 c);
   1.165 +
   1.166 +U_CAPI UBool U_EXPORT2
   1.167 +ucase_isSoftDotted(const UCaseProps *csp, UChar32 c);
   1.168 +
   1.169 +U_CAPI UBool U_EXPORT2
   1.170 +ucase_isCaseSensitive(const UCaseProps *csp, UChar32 c);
   1.171 +
   1.172 +/* string case mapping functions */
   1.173 +
   1.174 +U_CDECL_BEGIN
   1.175 +
   1.176 +/**
   1.177 + * Iterator function for string case mappings, which need to look at the
   1.178 + * context (surrounding text) of a given character for conditional mappings.
   1.179 + *
   1.180 + * The iterator only needs to go backward or forward away from the
   1.181 + * character in question. It does not use any indexes on this interface.
   1.182 + * It does not support random access or an arbitrary change of
   1.183 + * iteration direction.
   1.184 + *
   1.185 + * The code point being case-mapped itself is never returned by
   1.186 + * this iterator.
   1.187 + *
   1.188 + * @param context A pointer to the iterator's working data.
   1.189 + * @param dir If <0 then start iterating backward from the character;
   1.190 + *            if >0 then start iterating forward from the character;
   1.191 + *            if 0 then continue iterating in the current direction.
   1.192 + * @return Next code point, or <0 when the iteration is done.
   1.193 + */
   1.194 +typedef UChar32 U_CALLCONV
   1.195 +UCaseContextIterator(void *context, int8_t dir);
   1.196 +
   1.197 +/**
   1.198 + * Sample struct which may be used by some implementations of
   1.199 + * UCaseContextIterator.
   1.200 + */
   1.201 +struct UCaseContext {
   1.202 +    void *p;
   1.203 +    int32_t start, index, limit;
   1.204 +    int32_t cpStart, cpLimit;
   1.205 +    int8_t dir;
   1.206 +    int8_t b1, b2, b3;
   1.207 +};
   1.208 +typedef struct UCaseContext UCaseContext;
   1.209 +
   1.210 +U_CDECL_END
   1.211 +
   1.212 +#define UCASECONTEXT_INITIALIZER { NULL,  0, 0, 0,  0, 0,  0,  0, 0, 0 } /* zero-initializes struct UCaseContext in field order: p;  start, index, limit;  cpStart, cpLimit;  dir;  b1, b2, b3 */
   1.213 +
   1.214 +enum {
   1.215 +    /**
   1.216 +     * For string case mappings, a single character (a code point) is mapped
   1.217 +     * either to itself (in which case in-place mapping functions do nothing),
   1.218 +     * or to another single code point, or to a string.
   1.219 +     * Aside from the string contents, these are indicated with a single int32_t
   1.220 +     * value as follows:
   1.221 +     *
   1.222 +     * Mapping to self: Negative values (~self instead of -self to support U+0000)
   1.223 +     *
   1.224 +     * Mapping to another code point: Positive values >UCASE_MAX_STRING_LENGTH
   1.225 +     *
   1.226 +     * Mapping to a string: The string length (0..UCASE_MAX_STRING_LENGTH) is
   1.227 +     * returned. Note that the string result may indeed have zero length.
   1.228 +     */
   1.229 +    UCASE_MAX_STRING_LENGTH=0x1f
   1.230 +};
   1.231 +
   1.232 +/**
   1.233 + * Get the full lowercase mapping for c.
   1.234 + *
   1.235 + * @param csp Case mapping properties.
   1.236 + * @param c Character to be mapped.
   1.237 + * @param iter Character iterator, used for context-sensitive mappings.
   1.238 + *             See UCaseContextIterator for details.
   1.239 + *             If iter==NULL then a context-independent result is returned.
   1.240 + * @param context Pointer to be passed into iter.
   1.241 + * @param pString If the mapping result is a string, then the pointer is
   1.242 + *                written to *pString.
   1.243 + * @param locale Locale ID for locale-dependent mappings.
   1.244 + * @param locCache Initialize to 0; may be used to cache the result of parsing
   1.245 + *                 the locale ID for subsequent calls.
   1.246 + *                 Can be NULL.
   1.247 + * @return Output code point or string length, see UCASE_MAX_STRING_LENGTH.
   1.248 + *
   1.249 + * @see UCaseContextIterator
   1.250 + * @see UCASE_MAX_STRING_LENGTH
   1.251 + * @internal
   1.252 + */
   1.253 +U_CAPI int32_t U_EXPORT2
   1.254 +ucase_toFullLower(const UCaseProps *csp, UChar32 c,
   1.255 +                  UCaseContextIterator *iter, void *context,
   1.256 +                  const UChar **pString,
   1.257 +                  const char *locale, int32_t *locCache);
   1.258 +
   1.259 +U_CAPI int32_t U_EXPORT2 /* Same parameter list as ucase_toFullLower(); presumably its uppercase counterpart -- see the parameter and return documentation there. */
   1.260 +ucase_toFullUpper(const UCaseProps *csp, UChar32 c,
   1.261 +                  UCaseContextIterator *iter, void *context,
   1.262 +                  const UChar **pString,
   1.263 +                  const char *locale, int32_t *locCache);
   1.264 +
   1.265 +U_CAPI int32_t U_EXPORT2 /* Same parameter list as ucase_toFullLower(); presumably its titlecase counterpart -- see the parameter and return documentation there. */
   1.266 +ucase_toFullTitle(const UCaseProps *csp, UChar32 c,
   1.267 +                  UCaseContextIterator *iter, void *context,
   1.268 +                  const UChar **pString,
   1.269 +                  const char *locale, int32_t *locCache);
   1.270 +
   1.271 +U_CAPI int32_t U_EXPORT2 /* Unlike ucase_toFullLower(), takes no context iterator or locale; options is presumably a case folding options word (cf. _FOLD_CASE_OPTIONS_MASK). TODO confirm the result follows the UCASE_MAX_STRING_LENGTH convention. */
   1.272 +ucase_toFullFolding(const UCaseProps *csp, UChar32 c,
   1.273 +                    const UChar **pString,
   1.274 +                    uint32_t options);
   1.275 +
   1.276 +U_CFUNC int32_t U_EXPORT2
   1.277 +ucase_hasBinaryProperty(UChar32 c, UProperty which);
   1.278 +
   1.279 +
   1.280 +U_CDECL_BEGIN
   1.281 +
   1.282 +/**
   1.283 + * @internal
   1.284 + */
   1.285 +typedef int32_t U_CALLCONV
   1.286 +UCaseMapFull(const UCaseProps *csp, UChar32 c,
   1.287 +             UCaseContextIterator *iter, void *context,
   1.288 +             const UChar **pString,
   1.289 +             const char *locale, int32_t *locCache);
   1.290 +
   1.291 +U_CDECL_END
   1.292 +
   1.293 +/* file definitions --------------------------------------------------------- */
   1.294 +
   1.295 +#define UCASE_DATA_NAME "ucase"
   1.296 +#define UCASE_DATA_TYPE "icu"
   1.297 +
   1.298 +/* format "cAsE" (note: the byte values below are ASCII "cASE" -- 0x53 is 'S'; a lowercase 's' would be 0x73) */
   1.299 +#define UCASE_FMT_0 0x63 /* 'c' */
   1.300 +#define UCASE_FMT_1 0x41 /* 'A' */
   1.301 +#define UCASE_FMT_2 0x53 /* 'S' */
   1.302 +#define UCASE_FMT_3 0x45 /* 'E' */
   1.303 +
   1.304 +/* indexes into indexes[]; values 5..14 have no named constant here */
   1.305 +enum {
   1.306 +    UCASE_IX_INDEX_TOP,         /* 0 */
   1.307 +    UCASE_IX_LENGTH,            /* 1 */
   1.308 +    UCASE_IX_TRIE_SIZE,         /* 2 */
   1.309 +    UCASE_IX_EXC_LENGTH,        /* 3 */
   1.310 +    UCASE_IX_UNFOLD_LENGTH,     /* 4 */
   1.311 +
   1.312 +    UCASE_IX_MAX_FULL_LENGTH=15,
   1.313 +    UCASE_IX_TOP=16             /* one past the last index */
   1.314 +};
   1.315 +
   1.316 +/* definitions for 16-bit case properties word ------------------------------ */
   1.317 +
   1.318 +/* 2-bit constants for types of cased characters (bits 1..0 of the properties word) */
   1.319 +#define UCASE_TYPE_MASK     3
   1.320 +enum {
   1.321 +    UCASE_NONE,     /* 0 */
   1.322 +    UCASE_LOWER,    /* 1 */
   1.323 +    UCASE_UPPER,    /* 2 */
   1.324 +    UCASE_TITLE     /* 3 */
   1.325 +};
   1.326 +
   1.327 +#define UCASE_GET_TYPE(props) ((props)&UCASE_TYPE_MASK)
   1.328 +#define UCASE_GET_TYPE_AND_IGNORABLE(props) ((props)&7) /* 7 == UCASE_TYPE_MASK|UCASE_IGNORABLE */
   1.329 +
   1.330 +#define UCASE_IGNORABLE         4       /* bit 2 */
   1.331 +#define UCASE_SENSITIVE         8       /* bit 3 */
   1.332 +#define UCASE_EXCEPTION         0x10    /* bit 4 */
   1.333 +
   1.334 +#define UCASE_DOT_MASK      0x60
   1.335 +enum {
   1.336 +    UCASE_NO_DOT=0,         /* normal characters with cc=0 */
   1.337 +    UCASE_SOFT_DOTTED=0x20, /* soft-dotted characters with cc=0 */
   1.338 +    UCASE_ABOVE=0x40,       /* "above" accents with cc=230 */
   1.339 +    UCASE_OTHER_ACCENT=0x60 /* other accent character (0<cc!=230) */
   1.340 +};
   1.341 +
   1.342 +/* no exception: bits 15..7 are a 9-bit signed case mapping delta */
   1.343 +#define UCASE_DELTA_SHIFT   7
   1.344 +#define UCASE_DELTA_MASK    0xff80 /* bits 15..7 */
   1.345 +#define UCASE_MAX_DELTA     0xff /* 255, the largest 9-bit signed value */
   1.346 +#define UCASE_MIN_DELTA     (-UCASE_MAX_DELTA-1) /* -256 */
   1.347 +/* UCASE_GET_DELTA(props): extract the signed delta from bits 15..7 of a 16-bit properties word. */
   1.348 +#if U_SIGNED_RIGHT_SHIFT_IS_ARITHMETIC
   1.349 +#   define UCASE_GET_DELTA(props) ((int16_t)(props)>>UCASE_DELTA_SHIFT)
   1.350 +#else /* >> may not sign-extend: when bit 15 (0x8000) is set, OR 0xfe00 into the shifted value to fill bits 15..9 by hand */
   1.351 +#   define UCASE_GET_DELTA(props) (int16_t)(((props)&0x8000) ? (((props)>>UCASE_DELTA_SHIFT)|0xfe00) : ((uint16_t)(props)>>UCASE_DELTA_SHIFT))
   1.352 +#endif
   1.353 +
   1.354 +/* exception: bits 15..5 are an unsigned 11-bit index into the exceptions array */
   1.355 +#define UCASE_EXC_SHIFT     5
   1.356 +#define UCASE_EXC_MASK      0xffe0
   1.357 +#define UCASE_MAX_EXCEPTIONS ((UCASE_EXC_MASK>>UCASE_EXC_SHIFT)+1) /* (0xffe0>>5)+1 == 0x800 (2048) */
   1.358 +
   1.359 +/* definitions for 16-bit main exceptions word ------------------------------ */
   1.360 +
   1.361 +/* first 8 bits indicate values in optional slots; the constants below number the slots 0..7 */
   1.362 +enum {
   1.363 +    UCASE_EXC_LOWER,        /* 0 */
   1.364 +    UCASE_EXC_FOLD,         /* 1 */
   1.365 +    UCASE_EXC_UPPER,        /* 2 */
   1.366 +    UCASE_EXC_TITLE,        /* 3 */
   1.367 +    UCASE_EXC_4,            /* 4: reserved */
   1.368 +    UCASE_EXC_5,            /* 5: reserved */
   1.369 +    UCASE_EXC_CLOSURE,      /* 6 */
   1.370 +    UCASE_EXC_FULL_MAPPINGS, /* 7 */
   1.371 +    UCASE_EXC_ALL_SLOTS     /* 8: one past the last slot */
   1.372 +};
   1.373 +
   1.374 +/* each slot is 2 uint16_t instead of 1 */
   1.375 +#define UCASE_EXC_DOUBLE_SLOTS      0x100
   1.376 +
   1.377 +/* reserved: exception bits 11..9 */
   1.378 +
   1.379 +/* UCASE_EXC_DOT_MASK=UCASE_DOT_MASK<<UCASE_EXC_DOT_SHIFT (0x60<<7 == 0x3000, exception word bits 13..12) */
   1.380 +#define UCASE_EXC_DOT_SHIFT     7
   1.381 +
   1.382 +/* dot/accent bits: normally stored in the main word (UCASE_DOT_MASK), but pushed out for larger exception indexes */
   1.383 +#define UCASE_EXC_DOT_MASK      0x3000
   1.384 +enum {
   1.385 +    UCASE_EXC_NO_DOT=0,             /* UCASE_NO_DOT<<UCASE_EXC_DOT_SHIFT */
   1.386 +    UCASE_EXC_SOFT_DOTTED=0x1000,   /* UCASE_SOFT_DOTTED<<UCASE_EXC_DOT_SHIFT */
   1.387 +    UCASE_EXC_ABOVE=0x2000,         /* "above" accents with cc=230 */
   1.388 +    UCASE_EXC_OTHER_ACCENT=0x3000   /* other accent character (0<cc!=230) */
   1.389 +};
   1.390 +
   1.391 +/* complex/conditional mappings */
   1.392 +#define UCASE_EXC_CONDITIONAL_SPECIAL   0x4000
   1.393 +#define UCASE_EXC_CONDITIONAL_FOLD      0x8000
   1.394 +
   1.395 +/* definitions for lengths word for full case mappings: four 4-bit masks, so each length is at most 0xf */
   1.396 +#define UCASE_FULL_LOWER    0xf     /* bits 3..0 */
   1.397 +#define UCASE_FULL_FOLDING  0xf0    /* bits 7..4 */
   1.398 +#define UCASE_FULL_UPPER    0xf00   /* bits 11..8 */
   1.399 +#define UCASE_FULL_TITLE    0xf000  /* bits 15..12 */
   1.400 +
   1.401 +/* maximum lengths */
   1.402 +#define UCASE_FULL_MAPPINGS_MAX_LENGTH (4*0xf) /* 60: four lengths of at most 0xf each */
   1.403 +#define UCASE_CLOSURE_MAX_LENGTH 0xf
   1.404 +
   1.405 +/* constants for reverse case folding ("unfold") data */
   1.406 +enum {
   1.407 +    UCASE_UNFOLD_ROWS,
   1.408 +    UCASE_UNFOLD_ROW_WIDTH,
   1.409 +    UCASE_UNFOLD_STRING_WIDTH
   1.410 +};
   1.411 +
   1.412 +#endif

mercurial