1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/common/ucase.h Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,409 @@ 1.4 +/* 1.5 +******************************************************************************* 1.6 +* 1.7 +* Copyright (C) 2004-2012, International Business Machines 1.8 +* Corporation and others. All Rights Reserved. 1.9 +* 1.10 +******************************************************************************* 1.11 +* file name: ucase.h 1.12 +* encoding: US-ASCII 1.13 +* tab size: 8 (not used) 1.14 +* indentation:4 1.15 +* 1.16 +* created on: 2004aug30 1.17 +* created by: Markus W. Scherer 1.18 +* 1.19 +* Low-level Unicode character/string case mapping code. 1.20 +*/ 1.21 + 1.22 +#ifndef __UCASE_H__ 1.23 +#define __UCASE_H__ 1.24 + 1.25 +#include "unicode/utypes.h" 1.26 +#include "unicode/uset.h" 1.27 +#include "putilimp.h" 1.28 +#include "uset_imp.h" 1.29 +#include "udataswp.h" 1.30 + 1.31 +#ifdef __cplusplus 1.32 +U_NAMESPACE_BEGIN 1.33 + 1.34 +class UnicodeString; 1.35 + 1.36 +U_NAMESPACE_END 1.37 +#endif 1.38 + 1.39 +/* library API -------------------------------------------------------------- */ 1.40 + 1.41 +U_CDECL_BEGIN 1.42 + 1.43 +struct UCaseProps; 1.44 +typedef struct UCaseProps UCaseProps; 1.45 + 1.46 +U_CDECL_END 1.47 + 1.48 +U_CAPI const UCaseProps * U_EXPORT2 1.49 +ucase_getSingleton(void); 1.50 + 1.51 +U_CFUNC void U_EXPORT2 1.52 +ucase_addPropertyStarts(const UCaseProps *csp, const USetAdder *sa, UErrorCode *pErrorCode); 1.53 + 1.54 +/** 1.55 + * Requires non-NULL locale ID but otherwise does the equivalent of 1.56 + * checking for language codes as if uloc_getLanguage() were called: 1.57 + * Accepts both 2- and 3-letter codes and accepts case variants. 1.58 + */ 1.59 +U_CFUNC int32_t 1.60 +ucase_getCaseLocale(const char *locale, int32_t *locCache); 1.61 + 1.62 +/* Casing locale types for ucase_getCaseLocale */ 1.63 +enum { 1.64 + UCASE_LOC_UNKNOWN, 1.65 + UCASE_LOC_ROOT, 1.66 + UCASE_LOC_TURKISH, 1.67 + UCASE_LOC_LITHUANIAN, 1.68 + UCASE_LOC_DUTCH 1.69 +}; 1.70 + 1.71 +/** 1.72 + * Bit mask for getting just the options from a string compare options word 1.73 + * that are relevant for case-insensitive string comparison. 1.74 + * See uchar.h. Also include _STRNCMP_STYLE and U_COMPARE_CODE_POINT_ORDER. 1.75 + * @internal 1.76 + */ 1.77 +#define _STRCASECMP_OPTIONS_MASK 0xffff 1.78 + 1.79 +/** 1.80 + * Bit mask for getting just the options from a string compare options word 1.81 + * that are relevant for case folding (of a single string or code point). 1.82 + * See uchar.h. 1.83 + * @internal 1.84 + */ 1.85 +#define _FOLD_CASE_OPTIONS_MASK 0xff 1.86 + 1.87 +/* single-code point functions */ 1.88 + 1.89 +U_CAPI UChar32 U_EXPORT2 1.90 +ucase_tolower(const UCaseProps *csp, UChar32 c); 1.91 + 1.92 +U_CAPI UChar32 U_EXPORT2 1.93 +ucase_toupper(const UCaseProps *csp, UChar32 c); 1.94 + 1.95 +U_CAPI UChar32 U_EXPORT2 1.96 +ucase_totitle(const UCaseProps *csp, UChar32 c); 1.97 + 1.98 +U_CAPI UChar32 U_EXPORT2 1.99 +ucase_fold(const UCaseProps *csp, UChar32 c, uint32_t options); 1.100 + 1.101 +/** 1.102 + * Adds all simple case mappings and the full case folding for c to sa, 1.103 + * and also adds special case closure mappings. 1.104 + * c itself is not added. 1.105 + * For example, the mappings 1.106 + * - for s include long s 1.107 + * - for sharp s include ss 1.108 + * - for k include the Kelvin sign 1.109 + */ 1.110 +U_CFUNC void U_EXPORT2 1.111 +ucase_addCaseClosure(const UCaseProps *csp, UChar32 c, const USetAdder *sa); 1.112 + 1.113 +/** 1.114 + * Maps the string to single code points and adds the associated case closure 1.115 + * mappings. 1.116 + * The string is mapped to code points if it is their full case folding string. 1.117 + * In other words, this performs a reverse full case folding and then 1.118 + * adds the case closure items of the resulting code points. 1.119 + * If the string is found and its closure applied, then 1.120 + * the string itself is added as well as part of its code points' closure. 1.121 + * It must be length>=0. 1.122 + * 1.123 + * @return TRUE if the string was found 1.124 + */ 1.125 +U_CFUNC UBool U_EXPORT2 1.126 +ucase_addStringCaseClosure(const UCaseProps *csp, const UChar *s, int32_t length, const USetAdder *sa); 1.127 + 1.128 +#ifdef __cplusplus 1.129 +U_NAMESPACE_BEGIN 1.130 + 1.131 +/** 1.132 + * Iterator over characters with more than one code point in the full default Case_Folding. 1.133 + */ 1.134 +class U_COMMON_API FullCaseFoldingIterator { 1.135 +public: 1.136 + /** Constructor. */ 1.137 + FullCaseFoldingIterator(); 1.138 + /** 1.139 + * Returns the next (cp, full) pair where "full" is cp's full default Case_Folding. 1.140 + * Returns a negative cp value at the end of the iteration. 1.141 + */ 1.142 + UChar32 next(UnicodeString &full); 1.143 +private: 1.144 + FullCaseFoldingIterator(const FullCaseFoldingIterator &); // no copy 1.145 + FullCaseFoldingIterator &operator=(const FullCaseFoldingIterator &); // no assignment 1.146 + 1.147 + const UChar *unfold; 1.148 + int32_t unfoldRows; 1.149 + int32_t unfoldRowWidth; 1.150 + int32_t unfoldStringWidth; 1.151 + int32_t currentRow; 1.152 + int32_t rowCpIndex; 1.153 +}; 1.154 + 1.155 +U_NAMESPACE_END 1.156 +#endif 1.157 + 1.158 +/** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */ 1.159 +U_CAPI int32_t U_EXPORT2 1.160 +ucase_getType(const UCaseProps *csp, UChar32 c); 1.161 + 1.162 +/** @return same as ucase_getType(), or <0 if c is case-ignorable */ 1.163 +U_CAPI int32_t U_EXPORT2 1.164 +ucase_getTypeOrIgnorable(const UCaseProps *csp, UChar32 c); 1.165 + 1.166 +U_CAPI UBool U_EXPORT2 1.167 +ucase_isSoftDotted(const UCaseProps *csp, UChar32 c); 1.168 + 1.169 +U_CAPI UBool U_EXPORT2 1.170 +ucase_isCaseSensitive(const UCaseProps *csp, UChar32 c); 1.171 + 1.172 +/* string case mapping functions */ 1.173 + 1.174 +U_CDECL_BEGIN 1.175 + 1.176 +/** 1.177 + * Iterator function for string case mappings, which need to look at the 1.178 + * context (surrounding text) of a given character for conditional mappings. 1.179 + * 1.180 + * The iterator only needs to go backward or forward away from the 1.181 + * character in question. It does not use any indexes on this interface. 1.182 + * It does not support random access or an arbitrary change of 1.183 + * iteration direction. 1.184 + * 1.185 + * The code point being case-mapped itself is never returned by 1.186 + * this iterator. 1.187 + * 1.188 + * @param context A pointer to the iterator's working data. 1.189 + * @param dir If <0 then start iterating backward from the character; 1.190 + * if >0 then start iterating forward from the character; 1.191 + * if 0 then continue iterating in the current direction. 1.192 + * @return Next code point, or <0 when the iteration is done. 1.193 + */ 1.194 +typedef UChar32 U_CALLCONV 1.195 +UCaseContextIterator(void *context, int8_t dir); 1.196 + 1.197 +/** 1.198 + * Sample struct which may be used by some implementations of 1.199 + * UCaseContextIterator. 1.200 + */ 1.201 +struct UCaseContext { 1.202 + void *p; 1.203 + int32_t start, index, limit; 1.204 + int32_t cpStart, cpLimit; 1.205 + int8_t dir; 1.206 + int8_t b1, b2, b3; 1.207 +}; 1.208 +typedef struct UCaseContext UCaseContext; 1.209 + 1.210 +U_CDECL_END 1.211 + 1.212 +#define UCASECONTEXT_INITIALIZER { NULL, 0, 0, 0, 0, 0, 0, 0, 0, 0 } 1.213 + 1.214 +enum { 1.215 + /** 1.216 + * For string case mappings, a single character (a code point) is mapped 1.217 + * either to itself (in which case in-place mapping functions do nothing), 1.218 + * or to another single code point, or to a string. 1.219 + * Aside from the string contents, these are indicated with a single int32_t 1.220 + * value as follows: 1.221 + * 1.222 + * Mapping to self: Negative values (~self instead of -self to support U+0000) 1.223 + * 1.224 + * Mapping to another code point: Positive values >UCASE_MAX_STRING_LENGTH 1.225 + * 1.226 + * Mapping to a string: The string length (0..UCASE_MAX_STRING_LENGTH) is 1.227 + * returned. Note that the string result may indeed have zero length. 1.228 + */ 1.229 + UCASE_MAX_STRING_LENGTH=0x1f 1.230 +}; 1.231 + 1.232 +/** 1.233 + * Get the full lowercase mapping for c. 1.234 + * 1.235 + * @param csp Case mapping properties. 1.236 + * @param c Character to be mapped. 1.237 + * @param iter Character iterator, used for context-sensitive mappings. 1.238 + * See UCaseContextIterator for details. 1.239 + * If iter==NULL then a context-independent result is returned. 1.240 + * @param context Pointer to be passed into iter. 1.241 + * @param pString If the mapping result is a string, then the pointer is 1.242 + * written to *pString. 1.243 + * @param locale Locale ID for locale-dependent mappings. 1.244 + * @param locCache Initialize to 0; may be used to cache the result of parsing 1.245 + * the locale ID for subsequent calls. 1.246 + * Can be NULL. 1.247 + * @return Output code point or string length, see UCASE_MAX_STRING_LENGTH. 1.248 + * 1.249 + * @see UCaseContextIterator 1.250 + * @see UCASE_MAX_STRING_LENGTH 1.251 + * @internal 1.252 + */ 1.253 +U_CAPI int32_t U_EXPORT2 1.254 +ucase_toFullLower(const UCaseProps *csp, UChar32 c, 1.255 + UCaseContextIterator *iter, void *context, 1.256 + const UChar **pString, 1.257 + const char *locale, int32_t *locCache); 1.258 + 1.259 +U_CAPI int32_t U_EXPORT2 1.260 +ucase_toFullUpper(const UCaseProps *csp, UChar32 c, 1.261 + UCaseContextIterator *iter, void *context, 1.262 + const UChar **pString, 1.263 + const char *locale, int32_t *locCache); 1.264 + 1.265 +U_CAPI int32_t U_EXPORT2 1.266 +ucase_toFullTitle(const UCaseProps *csp, UChar32 c, 1.267 + UCaseContextIterator *iter, void *context, 1.268 + const UChar **pString, 1.269 + const char *locale, int32_t *locCache); 1.270 + 1.271 +U_CAPI int32_t U_EXPORT2 1.272 +ucase_toFullFolding(const UCaseProps *csp, UChar32 c, 1.273 + const UChar **pString, 1.274 + uint32_t options); 1.275 + 1.276 +U_CFUNC int32_t U_EXPORT2 1.277 +ucase_hasBinaryProperty(UChar32 c, UProperty which); 1.278 + 1.279 + 1.280 +U_CDECL_BEGIN 1.281 + 1.282 +/** 1.283 + * @internal 1.284 + */ 1.285 +typedef int32_t U_CALLCONV 1.286 +UCaseMapFull(const UCaseProps *csp, UChar32 c, 1.287 + UCaseContextIterator *iter, void *context, 1.288 + const UChar **pString, 1.289 + const char *locale, int32_t *locCache); 1.290 + 1.291 +U_CDECL_END 1.292 + 1.293 +/* file definitions --------------------------------------------------------- */ 1.294 + 1.295 +#define UCASE_DATA_NAME "ucase" 1.296 +#define UCASE_DATA_TYPE "icu" 1.297 + 1.298 +/* format "cAsE" */ 1.299 +#define UCASE_FMT_0 0x63 1.300 +#define UCASE_FMT_1 0x41 1.301 +#define UCASE_FMT_2 0x53 1.302 +#define UCASE_FMT_3 0x45 1.303 + 1.304 +/* indexes into indexes[] */ 1.305 +enum { 1.306 + UCASE_IX_INDEX_TOP, 1.307 + UCASE_IX_LENGTH, 1.308 + UCASE_IX_TRIE_SIZE, 1.309 + UCASE_IX_EXC_LENGTH, 1.310 + UCASE_IX_UNFOLD_LENGTH, 1.311 + 1.312 + UCASE_IX_MAX_FULL_LENGTH=15, 1.313 + UCASE_IX_TOP=16 1.314 +}; 1.315 + 1.316 +/* definitions for 16-bit case properties word ------------------------------ */ 1.317 + 1.318 +/* 2-bit constants for types of cased characters */ 1.319 +#define UCASE_TYPE_MASK 3 1.320 +enum { 1.321 + UCASE_NONE, 1.322 + UCASE_LOWER, 1.323 + UCASE_UPPER, 1.324 + UCASE_TITLE 1.325 +}; 1.326 + 1.327 +#define UCASE_GET_TYPE(props) ((props)&UCASE_TYPE_MASK) 1.328 +#define UCASE_GET_TYPE_AND_IGNORABLE(props) ((props)&7) 1.329 + 1.330 +#define UCASE_IGNORABLE 4 1.331 +#define UCASE_SENSITIVE 8 1.332 +#define UCASE_EXCEPTION 0x10 1.333 + 1.334 +#define UCASE_DOT_MASK 0x60 1.335 +enum { 1.336 + UCASE_NO_DOT=0, /* normal characters with cc=0 */ 1.337 + UCASE_SOFT_DOTTED=0x20, /* soft-dotted characters with cc=0 */ 1.338 + UCASE_ABOVE=0x40, /* "above" accents with cc=230 */ 1.339 + UCASE_OTHER_ACCENT=0x60 /* other accent character (0<cc!=230) */ 1.340 +}; 1.341 + 1.342 +/* no exception: bits 15..7 are a 9-bit signed case mapping delta */ 1.343 +#define UCASE_DELTA_SHIFT 7 1.344 +#define UCASE_DELTA_MASK 0xff80 1.345 +#define UCASE_MAX_DELTA 0xff 1.346 +#define UCASE_MIN_DELTA (-UCASE_MAX_DELTA-1) 1.347 + 1.348 +#if U_SIGNED_RIGHT_SHIFT_IS_ARITHMETIC 1.349 +# define UCASE_GET_DELTA(props) ((int16_t)(props)>>UCASE_DELTA_SHIFT) 1.350 +#else 1.351 +# define UCASE_GET_DELTA(props) (int16_t)(((props)&0x8000) ? (((props)>>UCASE_DELTA_SHIFT)|0xfe00) : ((uint16_t)(props)>>UCASE_DELTA_SHIFT)) 1.352 +#endif 1.353 + 1.354 +/* exception: bits 15..5 are an unsigned 11-bit index into the exceptions array */ 1.355 +#define UCASE_EXC_SHIFT 5 1.356 +#define UCASE_EXC_MASK 0xffe0 1.357 +#define UCASE_MAX_EXCEPTIONS ((UCASE_EXC_MASK>>UCASE_EXC_SHIFT)+1) 1.358 + 1.359 +/* definitions for 16-bit main exceptions word ------------------------------ */ 1.360 + 1.361 +/* first 8 bits indicate values in optional slots */ 1.362 +enum { 1.363 + UCASE_EXC_LOWER, 1.364 + UCASE_EXC_FOLD, 1.365 + UCASE_EXC_UPPER, 1.366 + UCASE_EXC_TITLE, 1.367 + UCASE_EXC_4, /* reserved */ 1.368 + UCASE_EXC_5, /* reserved */ 1.369 + UCASE_EXC_CLOSURE, 1.370 + UCASE_EXC_FULL_MAPPINGS, 1.371 + UCASE_EXC_ALL_SLOTS /* one past the last slot */ 1.372 +}; 1.373 + 1.374 +/* each slot is 2 uint16_t instead of 1 */ 1.375 +#define UCASE_EXC_DOUBLE_SLOTS 0x100 1.376 + 1.377 +/* reserved: exception bits 11..9 */ 1.378 + 1.379 +/* UCASE_EXC_DOT_MASK=UCASE_DOT_MASK<<UCASE_EXC_DOT_SHIFT */ 1.380 +#define UCASE_EXC_DOT_SHIFT 7 1.381 + 1.382 +/* normally stored in the main word, but pushed out for larger exception indexes */ 1.383 +#define UCASE_EXC_DOT_MASK 0x3000 1.384 +enum { 1.385 + UCASE_EXC_NO_DOT=0, 1.386 + UCASE_EXC_SOFT_DOTTED=0x1000, 1.387 + UCASE_EXC_ABOVE=0x2000, /* "above" accents with cc=230 */ 1.388 + UCASE_EXC_OTHER_ACCENT=0x3000 /* other character (0<cc!=230) */ 1.389 +}; 1.390 + 1.391 +/* complex/conditional mappings */ 1.392 +#define UCASE_EXC_CONDITIONAL_SPECIAL 0x4000 1.393 +#define UCASE_EXC_CONDITIONAL_FOLD 0x8000 1.394 + 1.395 +/* definitions for lengths word for full case mappings */ 1.396 +#define UCASE_FULL_LOWER 0xf 1.397 +#define UCASE_FULL_FOLDING 0xf0 1.398 +#define UCASE_FULL_UPPER 0xf00 1.399 +#define UCASE_FULL_TITLE 0xf000 1.400 + 1.401 +/* maximum lengths */ 1.402 +#define UCASE_FULL_MAPPINGS_MAX_LENGTH (4*0xf) 1.403 +#define UCASE_CLOSURE_MAX_LENGTH 0xf 1.404 + 1.405 +/* constants for reverse case folding ("unfold") data */ 1.406 +enum { 1.407 + UCASE_UNFOLD_ROWS, 1.408 + UCASE_UNFOLD_ROW_WIDTH, 1.409 + UCASE_UNFOLD_STRING_WIDTH 1.410 +}; 1.411 + 1.412 +#endif