intl/icu/source/common/ucase.h

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

michael@0 1 /*
michael@0 2 *******************************************************************************
michael@0 3 *
michael@0 4 * Copyright (C) 2004-2012, International Business Machines
michael@0 5 * Corporation and others. All Rights Reserved.
michael@0 6 *
michael@0 7 *******************************************************************************
michael@0 8 * file name: ucase.h
michael@0 9 * encoding: US-ASCII
michael@0 10 * tab size: 8 (not used)
michael@0 11 * indentation:4
michael@0 12 *
michael@0 13 * created on: 2004aug30
michael@0 14 * created by: Markus W. Scherer
michael@0 15 *
michael@0 16 * Low-level Unicode character/string case mapping code.
michael@0 17 */
michael@0 18
michael@0 19 #ifndef __UCASE_H__
michael@0 20 #define __UCASE_H__
michael@0 21
michael@0 22 #include "unicode/utypes.h"
michael@0 23 #include "unicode/uset.h"
michael@0 24 #include "putilimp.h"
michael@0 25 #include "uset_imp.h"
michael@0 26 #include "udataswp.h"
michael@0 27
michael@0 28 #ifdef __cplusplus
michael@0 29 U_NAMESPACE_BEGIN
michael@0 30
michael@0 31 class UnicodeString;
michael@0 32
michael@0 33 U_NAMESPACE_END
michael@0 34 #endif
michael@0 35
michael@0 36 /* library API -------------------------------------------------------------- */
michael@0 37
michael@0 38 U_CDECL_BEGIN
michael@0 39
michael@0 40 struct UCaseProps;
michael@0 41 typedef struct UCaseProps UCaseProps;
michael@0 42
michael@0 43 U_CDECL_END
michael@0 44
michael@0 45 U_CAPI const UCaseProps * U_EXPORT2
michael@0 46 ucase_getSingleton(void);
michael@0 47
michael@0 48 U_CFUNC void U_EXPORT2
michael@0 49 ucase_addPropertyStarts(const UCaseProps *csp, const USetAdder *sa, UErrorCode *pErrorCode);
michael@0 50
michael@0 51 /**
michael@0 52 * Requires non-NULL locale ID but otherwise does the equivalent of
michael@0 53 * checking for language codes as if uloc_getLanguage() were called:
michael@0 54 * Accepts both 2- and 3-letter codes and accepts case variants.
michael@0 55 */
michael@0 56 U_CFUNC int32_t
michael@0 57 ucase_getCaseLocale(const char *locale, int32_t *locCache);
michael@0 58
michael@0 59 /* Casing locale types for ucase_getCaseLocale */
michael@0 60 enum {
michael@0 61 UCASE_LOC_UNKNOWN,
michael@0 62 UCASE_LOC_ROOT,
michael@0 63 UCASE_LOC_TURKISH,
michael@0 64 UCASE_LOC_LITHUANIAN,
michael@0 65 UCASE_LOC_DUTCH
michael@0 66 };
michael@0 67
michael@0 68 /**
michael@0 69 * Bit mask for getting just the options from a string compare options word
michael@0 70 * that are relevant for case-insensitive string comparison.
michael@0 71 * See uchar.h. Also include _STRNCMP_STYLE and U_COMPARE_CODE_POINT_ORDER.
michael@0 72 * @internal
michael@0 73 */
michael@0 74 #define _STRCASECMP_OPTIONS_MASK 0xffff
michael@0 75
michael@0 76 /**
michael@0 77 * Bit mask for getting just the options from a string compare options word
michael@0 78 * that are relevant for case folding (of a single string or code point).
michael@0 79 * See uchar.h.
michael@0 80 * @internal
michael@0 81 */
michael@0 82 #define _FOLD_CASE_OPTIONS_MASK 0xff
michael@0 83
michael@0 84 /* single-code point functions */
michael@0 85
michael@0 86 U_CAPI UChar32 U_EXPORT2
michael@0 87 ucase_tolower(const UCaseProps *csp, UChar32 c);
michael@0 88
michael@0 89 U_CAPI UChar32 U_EXPORT2
michael@0 90 ucase_toupper(const UCaseProps *csp, UChar32 c);
michael@0 91
michael@0 92 U_CAPI UChar32 U_EXPORT2
michael@0 93 ucase_totitle(const UCaseProps *csp, UChar32 c);
michael@0 94
michael@0 95 U_CAPI UChar32 U_EXPORT2
michael@0 96 ucase_fold(const UCaseProps *csp, UChar32 c, uint32_t options);
michael@0 97
michael@0 98 /**
michael@0 99 * Adds all simple case mappings and the full case folding for c to sa,
michael@0 100 * and also adds special case closure mappings.
michael@0 101 * c itself is not added.
michael@0 102 * For example, the mappings
michael@0 103 * - for s include long s
michael@0 104 * - for sharp s include ss
michael@0 105 * - for k include the Kelvin sign
michael@0 106 */
michael@0 107 U_CFUNC void U_EXPORT2
michael@0 108 ucase_addCaseClosure(const UCaseProps *csp, UChar32 c, const USetAdder *sa);
michael@0 109
michael@0 110 /**
michael@0 111 * Maps the string to single code points and adds the associated case closure
michael@0 112 * mappings.
michael@0 113 * The string is mapped to code points if it is their full case folding string.
michael@0 114 * In other words, this performs a reverse full case folding and then
michael@0 115 * adds the case closure items of the resulting code points.
michael@0 116 * If the string is found and its closure applied, then
michael@0 117 * the string itself is added as well as part of its code points' closure.
michael@0 118 * The length must be >=0.
michael@0 119 *
michael@0 120 * @return TRUE if the string was found
michael@0 121 */
michael@0 122 U_CFUNC UBool U_EXPORT2
michael@0 123 ucase_addStringCaseClosure(const UCaseProps *csp, const UChar *s, int32_t length, const USetAdder *sa);
michael@0 124
michael@0 125 #ifdef __cplusplus
michael@0 126 U_NAMESPACE_BEGIN
michael@0 127
michael@0 128 /**
michael@0 129 * Iterator over characters with more than one code point in the full default Case_Folding.
michael@0 130 */
michael@0 131 class U_COMMON_API FullCaseFoldingIterator {
michael@0 132 public:
michael@0 133 /** Constructor. */
michael@0 134 FullCaseFoldingIterator();
michael@0 135 /**
michael@0 136 * Returns the next (cp, full) pair where "full" is cp's full default Case_Folding.
michael@0 137 * Returns a negative cp value at the end of the iteration.
michael@0 138 */
michael@0 139 UChar32 next(UnicodeString &full);
michael@0 140 private:
michael@0 141 FullCaseFoldingIterator(const FullCaseFoldingIterator &); // no copy (declared private)
michael@0 142 FullCaseFoldingIterator &operator=(const FullCaseFoldingIterator &); // no assignment (declared private)
michael@0 143 // NOTE(review): the unfold* members presumably hold the reverse-folding ("unfold") data pointer and its UCASE_UNFOLD_* header values, and currentRow/rowCpIndex the iteration position -- confirm in the implementation.
michael@0 144 const UChar *unfold;
michael@0 145 int32_t unfoldRows;
michael@0 146 int32_t unfoldRowWidth;
michael@0 147 int32_t unfoldStringWidth;
michael@0 148 int32_t currentRow;
michael@0 149 int32_t rowCpIndex;
michael@0 150 };
michael@0 151
michael@0 152 U_NAMESPACE_END
michael@0 153 #endif
michael@0 154
michael@0 155 /** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */
michael@0 156 U_CAPI int32_t U_EXPORT2
michael@0 157 ucase_getType(const UCaseProps *csp, UChar32 c);
michael@0 158
michael@0 159 /** @return same as ucase_getType(), with the UCASE_IGNORABLE bit (bit 2) also set if c is case-ignorable; see UCASE_GET_TYPE_AND_IGNORABLE */
michael@0 160 U_CAPI int32_t U_EXPORT2
michael@0 161 ucase_getTypeOrIgnorable(const UCaseProps *csp, UChar32 c);
michael@0 162
michael@0 163 U_CAPI UBool U_EXPORT2
michael@0 164 ucase_isSoftDotted(const UCaseProps *csp, UChar32 c);
michael@0 165
michael@0 166 U_CAPI UBool U_EXPORT2
michael@0 167 ucase_isCaseSensitive(const UCaseProps *csp, UChar32 c);
michael@0 168
michael@0 169 /* string case mapping functions */
michael@0 170
michael@0 171 U_CDECL_BEGIN
michael@0 172
michael@0 173 /**
michael@0 174 * Iterator function for string case mappings, which need to look at the
michael@0 175 * context (surrounding text) of a given character for conditional mappings.
michael@0 176 *
michael@0 177 * The iterator only needs to go backward or forward away from the
michael@0 178 * character in question. It does not use any indexes on this interface.
michael@0 179 * It does not support random access or an arbitrary change of
michael@0 180 * iteration direction.
michael@0 181 *
michael@0 182 * The code point being case-mapped itself is never returned by
michael@0 183 * this iterator.
michael@0 184 *
michael@0 185 * @param context A pointer to the iterator's working data.
michael@0 186 * @param dir If <0 then start iterating backward from the character;
michael@0 187 * if >0 then start iterating forward from the character;
michael@0 188 * if 0 then continue iterating in the current direction.
michael@0 189 * @return Next code point, or <0 when the iteration is done.
michael@0 190 */
michael@0 191 typedef UChar32 U_CALLCONV
michael@0 192 UCaseContextIterator(void *context, int8_t dir);
michael@0 193
michael@0 194 /**
michael@0 195 * Sample struct which may be used by some implementations of
michael@0 196 * UCaseContextIterator.
michael@0 197 */
michael@0 198 struct UCaseContext { /* 10 members; count and order must stay in sync with the positional UCASECONTEXT_INITIALIZER */
michael@0 199 void *p; /* NOTE(review): presumably the implementation's text/data pointer -- confirm with the iterator implementations */
michael@0 200 int32_t start, index, limit;
michael@0 201 int32_t cpStart, cpLimit; /* NOTE(review): presumably the bounds of the code point being mapped -- confirm */
michael@0 202 int8_t dir; /* NOTE(review): presumably the current iteration direction, cf. the dir parameter of UCaseContextIterator -- confirm */
michael@0 203 int8_t b1, b2, b3; /* NOTE(review): purpose not visible in this header */
michael@0 204 };
michael@0 205 typedef struct UCaseContext UCaseContext;
michael@0 206
michael@0 207 U_CDECL_END
michael@0 208
michael@0 209 #define UCASECONTEXT_INITIALIZER { NULL, 0, 0, 0, 0, 0, 0, 0, 0, 0 }
michael@0 210
michael@0 211 enum {
michael@0 212 /**
michael@0 213 * For string case mappings, a single character (a code point) is mapped
michael@0 214 * either to itself (in which case in-place mapping functions do nothing),
michael@0 215 * or to another single code point, or to a string.
michael@0 216 * Aside from the string contents, these are indicated with a single int32_t
michael@0 217 * value as follows:
michael@0 218 *
michael@0 219 * Mapping to self: Negative values (~self instead of -self to support U+0000)
michael@0 220 *
michael@0 221 * Mapping to another code point: Positive values >UCASE_MAX_STRING_LENGTH
michael@0 222 *
michael@0 223 * Mapping to a string: The string length (0..UCASE_MAX_STRING_LENGTH) is
michael@0 224 * returned. Note that the string result may indeed have zero length.
michael@0 225 */
michael@0 226 UCASE_MAX_STRING_LENGTH=0x1f
michael@0 227 };
michael@0 228
michael@0 229 /**
michael@0 230 * Get the full lowercase mapping for c.
michael@0 231 *
michael@0 232 * @param csp Case mapping properties.
michael@0 233 * @param c Character to be mapped.
michael@0 234 * @param iter Character iterator, used for context-sensitive mappings.
michael@0 235 * See UCaseContextIterator for details.
michael@0 236 * If iter==NULL then a context-independent result is returned.
michael@0 237 * @param context Pointer to be passed into iter.
michael@0 238 * @param pString If the mapping result is a string, then the pointer is
michael@0 239 * written to *pString.
michael@0 240 * @param locale Locale ID for locale-dependent mappings.
michael@0 241 * @param locCache Initialize to 0; may be used to cache the result of parsing
michael@0 242 * the locale ID for subsequent calls.
michael@0 243 * Can be NULL.
michael@0 244 * @return Output code point or string length, see UCASE_MAX_STRING_LENGTH.
michael@0 245 *
michael@0 246 * @see UCaseContextIterator
michael@0 247 * @see UCASE_MAX_STRING_LENGTH
michael@0 248 * @internal
michael@0 249 */
michael@0 250 U_CAPI int32_t U_EXPORT2
michael@0 251 ucase_toFullLower(const UCaseProps *csp, UChar32 c,
michael@0 252 UCaseContextIterator *iter, void *context,
michael@0 253 const UChar **pString,
michael@0 254 const char *locale, int32_t *locCache);
michael@0 255
michael@0 256 U_CAPI int32_t U_EXPORT2
michael@0 257 ucase_toFullUpper(const UCaseProps *csp, UChar32 c,
michael@0 258 UCaseContextIterator *iter, void *context,
michael@0 259 const UChar **pString,
michael@0 260 const char *locale, int32_t *locCache);
michael@0 261
michael@0 262 U_CAPI int32_t U_EXPORT2
michael@0 263 ucase_toFullTitle(const UCaseProps *csp, UChar32 c,
michael@0 264 UCaseContextIterator *iter, void *context,
michael@0 265 const UChar **pString,
michael@0 266 const char *locale, int32_t *locCache);
michael@0 267
michael@0 268 U_CAPI int32_t U_EXPORT2
michael@0 269 ucase_toFullFolding(const UCaseProps *csp, UChar32 c,
michael@0 270 const UChar **pString,
michael@0 271 uint32_t options);
michael@0 272
michael@0 273 U_CFUNC int32_t U_EXPORT2
michael@0 274 ucase_hasBinaryProperty(UChar32 c, UProperty which);
michael@0 275
michael@0 276
michael@0 277 U_CDECL_BEGIN
michael@0 278
michael@0 279 /**
michael@0 280 * @internal
michael@0 281 */
michael@0 282 typedef int32_t U_CALLCONV
michael@0 283 UCaseMapFull(const UCaseProps *csp, UChar32 c,
michael@0 284 UCaseContextIterator *iter, void *context,
michael@0 285 const UChar **pString,
michael@0 286 const char *locale, int32_t *locCache);
michael@0 287
michael@0 288 U_CDECL_END
michael@0 289
michael@0 290 /* file definitions --------------------------------------------------------- */
michael@0 291
michael@0 292 #define UCASE_DATA_NAME "ucase"
michael@0 293 #define UCASE_DATA_TYPE "icu"
michael@0 294
michael@0 295 /* format "cAsE" */
michael@0 296 #define UCASE_FMT_0 0x63
michael@0 297 #define UCASE_FMT_1 0x41
michael@0 298 #define UCASE_FMT_2 0x53
michael@0 299 #define UCASE_FMT_3 0x45
michael@0 300
michael@0 301 /* indexes into indexes[] */
michael@0 302 enum {
michael@0 303 UCASE_IX_INDEX_TOP, /* 0 */
michael@0 304 UCASE_IX_LENGTH, /* 1 */
michael@0 305 UCASE_IX_TRIE_SIZE, /* 2 */
michael@0 306 UCASE_IX_EXC_LENGTH, /* 3 */
michael@0 307 UCASE_IX_UNFOLD_LENGTH, /* 4 */
michael@0 308 /* indexes 5..14 are not named in this header */
michael@0 309 UCASE_IX_MAX_FULL_LENGTH=15,
michael@0 310 UCASE_IX_TOP=16 /* one past the highest named index; NOTE(review): presumably the length of indexes[] -- confirm */
michael@0 311 };
michael@0 312
michael@0 313 /* definitions for 16-bit case properties word ------------------------------ */
michael@0 314
michael@0 315 /* 2-bit constants for types of cased characters */
michael@0 316 #define UCASE_TYPE_MASK 3
michael@0 317 enum {
michael@0 318 UCASE_NONE,
michael@0 319 UCASE_LOWER,
michael@0 320 UCASE_UPPER,
michael@0 321 UCASE_TITLE
michael@0 322 };
michael@0 323
michael@0 324 #define UCASE_GET_TYPE(props) ((props)&UCASE_TYPE_MASK)
michael@0 325 #define UCASE_GET_TYPE_AND_IGNORABLE(props) ((props)&7)
michael@0 326
michael@0 327 #define UCASE_IGNORABLE 4
michael@0 328 #define UCASE_SENSITIVE 8
michael@0 329 #define UCASE_EXCEPTION 0x10
michael@0 330
michael@0 331 #define UCASE_DOT_MASK 0x60
michael@0 332 enum {
michael@0 333 UCASE_NO_DOT=0, /* normal characters with cc=0 */
michael@0 334 UCASE_SOFT_DOTTED=0x20, /* soft-dotted characters with cc=0 */
michael@0 335 UCASE_ABOVE=0x40, /* "above" accents with cc=230 */
michael@0 336 UCASE_OTHER_ACCENT=0x60 /* other accent character (0<cc!=230) */
michael@0 337 };
michael@0 338
michael@0 339 /* no exception: bits 15..7 are a 9-bit signed case mapping delta */
michael@0 340 #define UCASE_DELTA_SHIFT 7
michael@0 341 #define UCASE_DELTA_MASK 0xff80
michael@0 342 #define UCASE_MAX_DELTA 0xff
michael@0 343 #define UCASE_MIN_DELTA (-UCASE_MAX_DELTA-1)
michael@0 344
michael@0 345 #if U_SIGNED_RIGHT_SHIFT_IS_ARITHMETIC
michael@0 346 # define UCASE_GET_DELTA(props) ((int16_t)(props)>>UCASE_DELTA_SHIFT)
michael@0 347 #else
michael@0 348 # define UCASE_GET_DELTA(props) (int16_t)(((props)&0x8000) ? (((props)>>UCASE_DELTA_SHIFT)|0xfe00) : ((uint16_t)(props)>>UCASE_DELTA_SHIFT))
michael@0 349 #endif
michael@0 350
michael@0 351 /* exception: bits 15..5 are an unsigned 11-bit index into the exceptions array */
michael@0 352 #define UCASE_EXC_SHIFT 5
michael@0 353 #define UCASE_EXC_MASK 0xffe0
michael@0 354 #define UCASE_MAX_EXCEPTIONS ((UCASE_EXC_MASK>>UCASE_EXC_SHIFT)+1)
michael@0 355
michael@0 356 /* definitions for 16-bit main exceptions word ------------------------------ */
michael@0 357
michael@0 358 /* first 8 bits indicate values in optional slots */
michael@0 359 enum {
michael@0 360 UCASE_EXC_LOWER, /* slot 0 */
michael@0 361 UCASE_EXC_FOLD, /* slot 1 */
michael@0 362 UCASE_EXC_UPPER, /* slot 2 */
michael@0 363 UCASE_EXC_TITLE, /* slot 3 */
michael@0 364 UCASE_EXC_4, /* slot 4, reserved */
michael@0 365 UCASE_EXC_5, /* slot 5, reserved */
michael@0 366 UCASE_EXC_CLOSURE, /* slot 6 */
michael@0 367 UCASE_EXC_FULL_MAPPINGS, /* slot 7 */
michael@0 368 UCASE_EXC_ALL_SLOTS /* 8: one past the last slot; slot numbers correspond to the low 8 flag bits of the exceptions word */
michael@0 369 };
michael@0 370
michael@0 371 /* each slot is 2 uint16_t instead of 1 */
michael@0 372 #define UCASE_EXC_DOUBLE_SLOTS 0x100
michael@0 373
michael@0 374 /* reserved: exception bits 11..9 */
michael@0 375
michael@0 376 /* UCASE_EXC_DOT_MASK=UCASE_DOT_MASK<<UCASE_EXC_DOT_SHIFT */
michael@0 377 #define UCASE_EXC_DOT_SHIFT 7
michael@0 378
michael@0 379 /* normally stored in the main word, but pushed out for larger exception indexes */
michael@0 380 #define UCASE_EXC_DOT_MASK 0x3000
michael@0 381 enum { /* each value equals the corresponding main-word dot value (UCASE_NO_DOT etc.) shifted left by UCASE_EXC_DOT_SHIFT */
michael@0 382 UCASE_EXC_NO_DOT=0, /* normal characters with cc=0 */
michael@0 383 UCASE_EXC_SOFT_DOTTED=0x1000, /* soft-dotted characters with cc=0 */
michael@0 384 UCASE_EXC_ABOVE=0x2000, /* "above" accents with cc=230 */
michael@0 385 UCASE_EXC_OTHER_ACCENT=0x3000 /* other accent character (0<cc!=230) */
michael@0 386 };
michael@0 387
michael@0 388 /* complex/conditional mappings */
michael@0 389 #define UCASE_EXC_CONDITIONAL_SPECIAL 0x4000
michael@0 390 #define UCASE_EXC_CONDITIONAL_FOLD 0x8000
michael@0 391
michael@0 392 /* definitions for lengths word for full case mappings */
michael@0 393 #define UCASE_FULL_LOWER 0xf
michael@0 394 #define UCASE_FULL_FOLDING 0xf0
michael@0 395 #define UCASE_FULL_UPPER 0xf00
michael@0 396 #define UCASE_FULL_TITLE 0xf000
michael@0 397
michael@0 398 /* maximum lengths */
michael@0 399 #define UCASE_FULL_MAPPINGS_MAX_LENGTH (4*0xf)
michael@0 400 #define UCASE_CLOSURE_MAX_LENGTH 0xf
michael@0 401
michael@0 402 /* constants for reverse case folding ("unfold") data */
michael@0 403 enum {
michael@0 404 UCASE_UNFOLD_ROWS,
michael@0 405 UCASE_UNFOLD_ROW_WIDTH,
michael@0 406 UCASE_UNFOLD_STRING_WIDTH
michael@0 407 };
michael@0 408
michael@0 409 #endif

mercurial