intl/icu/source/common/ucase.h

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

     1 /*
     2 *******************************************************************************
     3 *
     4 *   Copyright (C) 2004-2012, International Business Machines
     5 *   Corporation and others.  All Rights Reserved.
     6 *
     7 *******************************************************************************
     8 *   file name:  ucase.h
     9 *   encoding:   US-ASCII
    10 *   tab size:   8 (not used)
    11 *   indentation:4
    12 *
    13 *   created on: 2004aug30
    14 *   created by: Markus W. Scherer
    15 *
    16 *   Low-level Unicode character/string case mapping code.
    17 */
    19 #ifndef __UCASE_H__
    20 #define __UCASE_H__
    22 #include "unicode/utypes.h"
    23 #include "unicode/uset.h"
    24 #include "putilimp.h"
    25 #include "uset_imp.h"
    26 #include "udataswp.h"
    28 #ifdef __cplusplus
    29 U_NAMESPACE_BEGIN
    31 class UnicodeString;
    33 U_NAMESPACE_END
    34 #endif
    36 /* library API -------------------------------------------------------------- */
    38 U_CDECL_BEGIN
    40 struct UCaseProps;
    41 typedef struct UCaseProps UCaseProps;
    43 U_CDECL_END
    45 U_CAPI const UCaseProps * U_EXPORT2
    46 ucase_getSingleton(void);
    48 U_CFUNC void U_EXPORT2
    49 ucase_addPropertyStarts(const UCaseProps *csp, const USetAdder *sa, UErrorCode *pErrorCode);
    51 /**
    52  * Requires non-NULL locale ID but otherwise does the equivalent of
    53  * checking for language codes as if uloc_getLanguage() were called:
    54  * Accepts both 2- and 3-letter codes and accepts case variants.
    55  */
    56 U_CFUNC int32_t
    57 ucase_getCaseLocale(const char *locale, int32_t *locCache);
    59 /* Casing locale types for ucase_getCaseLocale; the values are implicit and consecutive, starting at 0 */
    60 enum {
    61     UCASE_LOC_UNKNOWN,      /* 0; presumably "not determined yet", matching a locCache that was initialized to 0 -- TODO confirm in the implementation */
    62     UCASE_LOC_ROOT,         /* 1; presumably "no language-specific casing rules" -- TODO confirm */
    63     UCASE_LOC_TURKISH,      /* 2 */
    64     UCASE_LOC_LITHUANIAN,   /* 3 */
    65     UCASE_LOC_DUTCH         /* 4 */
    66 };
    68 /**
    69  * Bit mask for getting just the options from a string compare options word
    70  * that are relevant for case-insensitive string comparison.
    71  * See uchar.h. Also includes _STRNCMP_STYLE and U_COMPARE_CODE_POINT_ORDER.
    72  * @internal NOTE(review): names starting with '_' plus an uppercase letter are reserved for the implementation in C and C++.
    73  */
    74 #define _STRCASECMP_OPTIONS_MASK 0xffff /* keeps the low 16 bits of the options word */
    76 /**
    77  * Bit mask for getting just the options from a string compare options word
    78  * that are relevant for case folding (of a single string or code point).
    79  * See uchar.h.
    80  * @internal NOTE(review): same reserved-identifier caveat as _STRCASECMP_OPTIONS_MASK.
    81  */
    82 #define _FOLD_CASE_OPTIONS_MASK 0xff /* keeps the low 8 bits; a subset of the bits kept by _STRCASECMP_OPTIONS_MASK */
    84 /* single-code point functions: each takes the case properties csp and one code point c, and returns one code point (UChar32) */
    86 U_CAPI UChar32 U_EXPORT2
    87 ucase_tolower(const UCaseProps *csp, UChar32 c);
    89 U_CAPI UChar32 U_EXPORT2
    90 ucase_toupper(const UCaseProps *csp, UChar32 c);
    92 U_CAPI UChar32 U_EXPORT2
    93 ucase_totitle(const UCaseProps *csp, UChar32 c);
    95 U_CAPI UChar32 U_EXPORT2 /* options: presumably the case-folding option bits selected by _FOLD_CASE_OPTIONS_MASK -- TODO confirm */
    96 ucase_fold(const UCaseProps *csp, UChar32 c, uint32_t options);
    98 /**
    99  * Adds all simple case mappings and the full case folding for c to sa,
   100  * and also adds special case closure mappings.
   101  * c itself is not added.
   102  * For example, the mappings
   103  * - for s include long s
   104  * - for sharp s include ss
   105  * - for k include the Kelvin sign
   106  */
   107 U_CFUNC void U_EXPORT2
   108 ucase_addCaseClosure(const UCaseProps *csp, UChar32 c, const USetAdder *sa);
   110 /**
   111  * Maps the string to single code points and adds the associated case closure
   112  * mappings.
   113  * The string is mapped to code points if it is their full case folding string.
   114  * In other words, this performs a reverse full case folding and then
   115  * adds the case closure items of the resulting code points.
   116  * If the string is found and its closure applied, then
   117  * the string itself is added as well as part of its code points' closure.
   118  * It must be length>=0.
   119  *
   120  * @return TRUE if the string was found
   121  */
   122 U_CFUNC UBool U_EXPORT2
   123 ucase_addStringCaseClosure(const UCaseProps *csp, const UChar *s, int32_t length, const USetAdder *sa);
   125 #ifdef __cplusplus
   126 U_NAMESPACE_BEGIN
   128 /**
   129  * Iterator over characters with more than one code point in the full default Case_Folding.
   130  */
   131 class U_COMMON_API FullCaseFoldingIterator {
   132 public:
   133     /** Default constructor; takes no arguments. How the iteration state is set up is not visible in this header. */
   134     FullCaseFoldingIterator();
   135     /**
   136      * Returns the next (cp, full) pair where "full" is cp's full default Case_Folding.
   137      * cp is the return value; the folding string is delivered through the non-const reference parameter "full".
   138      * Returns a negative cp value at the end of the iteration.
   139      */
   139     UChar32 next(UnicodeString &full);
   140 private:
   141     FullCaseFoldingIterator(const FullCaseFoldingIterator &);  // no copy: declared private; no definition appears in this header
   142     FullCaseFoldingIterator &operator=(const FullCaseFoldingIterator &);  // no assignment: declared private; no definition appears in this header
   144     const UChar *unfold;            // presumably the reverse case folding ("unfold") data being walked -- TODO confirm
   145     int32_t unfoldRows;             // presumably the value stored at index UCASE_UNFOLD_ROWS of that data -- TODO confirm
   146     int32_t unfoldRowWidth;         // presumably the value stored at index UCASE_UNFOLD_ROW_WIDTH -- TODO confirm
   147     int32_t unfoldStringWidth;      // presumably the value stored at index UCASE_UNFOLD_STRING_WIDTH -- TODO confirm
   148     int32_t currentRow;             // iteration position; presumably the row currently being read -- TODO confirm
   149     int32_t rowCpIndex;             // iteration position; presumably the index of the next code point within that row -- TODO confirm
   150 };
   152 U_NAMESPACE_END
   153 #endif
   155 /** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */
   156 U_CAPI int32_t U_EXPORT2
   157 ucase_getType(const UCaseProps *csp, UChar32 c);
   159 /** @return same as ucase_getType(), or <0 if c is case-ignorable */
   160 U_CAPI int32_t U_EXPORT2
   161 ucase_getTypeOrIgnorable(const UCaseProps *csp, UChar32 c);
   163 U_CAPI UBool U_EXPORT2
   164 ucase_isSoftDotted(const UCaseProps *csp, UChar32 c);
   166 U_CAPI UBool U_EXPORT2
   167 ucase_isCaseSensitive(const UCaseProps *csp, UChar32 c);
   169 /* string case mapping functions */
   171 U_CDECL_BEGIN
   173 /**
   174  * Iterator function for string case mappings, which need to look at the
   175  * context (surrounding text) of a given character for conditional mappings.
   176  *
   177  * The iterator only needs to go backward or forward away from the
   178  * character in question. It does not use any indexes on this interface.
   179  * It does not support random access or an arbitrary change of
   180  * iteration direction.
   181  *
   182  * The code point being case-mapped itself is never returned by
   183  * this iterator.
   184  *
   185  * @param context A pointer to the iterator's working data.
   186  * @param dir If <0 then start iterating backward from the character;
   187  *            if >0 then start iterating forward from the character;
   188  *            if 0 then continue iterating in the current direction.
   189  * @return Next code point, or <0 when the iteration is done.
   190  */
   191 typedef UChar32 U_CALLCONV
   192 UCaseContextIterator(void *context, int8_t dir);
   194 /**
   195  * Sample struct which may be used by some implementations of
   196  * UCaseContextIterator.
   197  */
   198 struct UCaseContext {
   199     void *p;                        /* untyped pointer; presumably the text the iterator walks -- TODO confirm against the iterator implementations */
   200     int32_t start, index, limit;    /* presumably offsets into that text (range start, current position, range end) -- TODO confirm */
   201     int32_t cpStart, cpLimit;       /* presumably the offsets delimiting the code point being case-mapped -- TODO confirm */
   202     int8_t dir;                     /* presumably the current iteration direction, with the sign convention of the iterator's dir argument -- TODO confirm */
   203     int8_t b1, b2, b3;              /* purpose not visible in this header; with dir they make four int8_t members -- possibly spare/padding, confirm */
   204 };
   205 typedef struct UCaseContext UCaseContext;
   207 U_CDECL_END
   209 #define UCASECONTEXT_INITIALIZER { NULL,  0, 0, 0,  0, 0,  0,  0, 0, 0 } /* ten initializers, one per member in declaration order: p; start, index, limit; cpStart, cpLimit; dir; b1, b2, b3 */
   211 enum {
   212     /**
   213      * For string case mappings, a single character (a code point) is mapped
   214      * either to itself (in which case in-place mapping functions do nothing),
   215      * or to another single code point, or to a string.
   216      * Aside from the string contents, these are indicated with a single int32_t
   217      * value as follows:
   218      *
   219      * Mapping to self: Negative values (~self instead of -self to support U+0000; e.g. U+0000 is reported as ~0 == -1)
   220      *
   221      * Mapping to another code point: Positive values >UCASE_MAX_STRING_LENGTH
   222      *
   223      * Mapping to a string: The string length (0..UCASE_MAX_STRING_LENGTH) is
   224      * returned. Note that the string result may indeed have zero length.
   225      */
   226     UCASE_MAX_STRING_LENGTH=0x1f /* 31 */
   227 };
   229 /**
   230  * Get the full lowercase mapping for c.
   231  *
   232  * @param csp Case mapping properties.
   233  * @param c Character to be mapped.
   234  * @param iter Character iterator, used for context-sensitive mappings.
   235  *             See UCaseContextIterator for details.
   236  *             If iter==NULL then a context-independent result is returned.
   237  * @param context Pointer to be passed into iter.
   238  * @param pString If the mapping result is a string, then the pointer is
   239  *                written to *pString.
   240  * @param locale Locale ID for locale-dependent mappings.
   241  * @param locCache Initialize to 0; may be used to cache the result of parsing
   242  *                 the locale ID for subsequent calls.
   243  *                 Can be NULL.
   244  * @return Output code point or string length, see UCASE_MAX_STRING_LENGTH.
   245  *
   246  * @see UCaseContextIterator
   247  * @see UCASE_MAX_STRING_LENGTH
   248  * @internal
   249  */
   250 U_CAPI int32_t U_EXPORT2
   251 ucase_toFullLower(const UCaseProps *csp, UChar32 c,
   252                   UCaseContextIterator *iter, void *context,
   253                   const UChar **pString,
   254                   const char *locale, int32_t *locCache);
   256 U_CAPI int32_t U_EXPORT2 /* same parameter list and return type as ucase_toFullLower(); presumably the full uppercase mapping with the same result encoding -- TODO confirm */
   257 ucase_toFullUpper(const UCaseProps *csp, UChar32 c,
   258                   UCaseContextIterator *iter, void *context,
   259                   const UChar **pString,
   260                   const char *locale, int32_t *locCache);
   262 U_CAPI int32_t U_EXPORT2 /* same parameter list and return type as ucase_toFullLower(); presumably the full titlecase mapping with the same result encoding -- TODO confirm */
   263 ucase_toFullTitle(const UCaseProps *csp, UChar32 c,
   264                   UCaseContextIterator *iter, void *context,
   265                   const UChar **pString,
   266                   const char *locale, int32_t *locCache);
   268 U_CAPI int32_t U_EXPORT2 /* takes no context iterator or locale; takes an options word instead (cf. _FOLD_CASE_OPTIONS_MASK); result encoding presumably as for ucase_toFullLower() -- TODO confirm */
   269 ucase_toFullFolding(const UCaseProps *csp, UChar32 c,
   270                     const UChar **pString,
   271                     uint32_t options);
   273 U_CFUNC int32_t U_EXPORT2 /* NOTE(review): declared to return int32_t although the name suggests a boolean, and takes no UCaseProps argument -- confirm the intended contract with the implementation */
   274 ucase_hasBinaryProperty(UChar32 c, UProperty which);
   277 U_CDECL_BEGIN
   279 /**
   280  * @internal
   281  */
   282 typedef int32_t U_CALLCONV
   283 UCaseMapFull(const UCaseProps *csp, UChar32 c,
   284              UCaseContextIterator *iter, void *context,
   285              const UChar **pString,
   286              const char *locale, int32_t *locCache);
   288 U_CDECL_END
   290 /* file definitions --------------------------------------------------------- */
   292 #define UCASE_DATA_NAME "ucase" /* presumably the name of the data item holding the case properties -- TODO confirm */
   293 #define UCASE_DATA_TYPE "icu"   /* presumably the type (suffix) of that data item -- TODO confirm */
   295 /* format "cAsE": the four bytes below are the ASCII codes of 'c', 'A', 'S', 'E' */
   296 #define UCASE_FMT_0 0x63 /* 'c' */
   297 #define UCASE_FMT_1 0x41 /* 'A' */
   298 #define UCASE_FMT_2 0x53 /* 'S' */
   299 #define UCASE_FMT_3 0x45 /* 'E' */
   301 /* indexes into indexes[]; the first five values are implicit (0..4), indexes 5..14 have no name in this header */
   302 enum {
   303     UCASE_IX_INDEX_TOP,         /* 0 */
   304     UCASE_IX_LENGTH,            /* 1 */
   305     UCASE_IX_TRIE_SIZE,         /* 2 */
   306     UCASE_IX_EXC_LENGTH,        /* 3 */
   307     UCASE_IX_UNFOLD_LENGTH,     /* 4 */
   309     UCASE_IX_MAX_FULL_LENGTH=15,
   310     UCASE_IX_TOP=16             /* one more than the highest index named here; presumably the number of entries in indexes[] -- TODO confirm */
   311 };
   313 /* definitions for 16-bit case properties word ------------------------------ */
   315 /* 2-bit constants for types of cased characters */
   316 #define UCASE_TYPE_MASK     3 /* bits 1..0 */
   317 enum {
   318     UCASE_NONE,     /* 0 */
   319     UCASE_LOWER,    /* 1 */
   320     UCASE_UPPER,    /* 2 */
   321     UCASE_TITLE     /* 3 */
   322 };
   324 #define UCASE_GET_TYPE(props) ((props)&UCASE_TYPE_MASK)
   325 #define UCASE_GET_TYPE_AND_IGNORABLE(props) ((props)&7) /* 7 == UCASE_TYPE_MASK|UCASE_IGNORABLE, i.e. bits 2..0 */
   327 #define UCASE_IGNORABLE         4       /* bit 2 */
   328 #define UCASE_SENSITIVE         8       /* bit 3 */
   329 #define UCASE_EXCEPTION         0x10    /* bit 4; presumably selects between the "no exception" (delta) and "exception" (index) layouts of the upper bits -- TODO confirm */
   331 #define UCASE_DOT_MASK      0x60 /* bits 6..5 */
   332 enum {
   333     UCASE_NO_DOT=0,         /* normal characters with cc=0 */
   334     UCASE_SOFT_DOTTED=0x20, /* soft-dotted characters with cc=0 */
   335     UCASE_ABOVE=0x40,       /* "above" accents with cc=230 */
   336     UCASE_OTHER_ACCENT=0x60 /* other accent character (0<cc!=230) */
   337 };
   339 /* no exception: bits 15..7 are a 9-bit signed case mapping delta */
   340 #define UCASE_DELTA_SHIFT   7
   341 #define UCASE_DELTA_MASK    0xff80
   342 #define UCASE_MAX_DELTA     0xff
   343 #define UCASE_MIN_DELTA     (-UCASE_MAX_DELTA-1)
   345 #if U_SIGNED_RIGHT_SHIFT_IS_ARITHMETIC
   346 #   define UCASE_GET_DELTA(props) ((int16_t)(props)>>UCASE_DELTA_SHIFT)
   347 #else
   348 #   define UCASE_GET_DELTA(props) (int16_t)(((props)&0x8000) ? (((props)>>UCASE_DELTA_SHIFT)|0xfe00) : ((uint16_t)(props)>>UCASE_DELTA_SHIFT))
   349 #endif
   351 /* exception: bits 15..5 are an unsigned 11-bit index into the exceptions array */
   352 #define UCASE_EXC_SHIFT     5
   353 #define UCASE_EXC_MASK      0xffe0
   354 #define UCASE_MAX_EXCEPTIONS ((UCASE_EXC_MASK>>UCASE_EXC_SHIFT)+1) /* (0xffe0>>5)+1 == 0x800 == 2048 distinct index values */
   356 /* definitions for 16-bit main exceptions word ------------------------------ */
   358 /* first 8 bits indicate values in optional slots; the enumerators below number those slots 0..7 (presumably bit n set means slot n is present -- TODO confirm) */
   359 enum {
   360     UCASE_EXC_LOWER,        /* 0 */
   361     UCASE_EXC_FOLD,         /* 1 */
   362     UCASE_EXC_UPPER,        /* 2 */
   363     UCASE_EXC_TITLE,        /* 3 */
   364     UCASE_EXC_4,            /* reserved */
   365     UCASE_EXC_5,            /* reserved */
   366     UCASE_EXC_CLOSURE,      /* 6 */
   367     UCASE_EXC_FULL_MAPPINGS, /* 7 */
   368     UCASE_EXC_ALL_SLOTS     /* one past the last slot (8) */
   369 };
   371 /* each slot is 2 uint16_t instead of 1 */
   372 #define UCASE_EXC_DOUBLE_SLOTS      0x100 /* bit 8 */
   374 /* reserved: exception bits 11..9 */
   376 /* UCASE_EXC_DOT_MASK=UCASE_DOT_MASK<<UCASE_EXC_DOT_SHIFT, i.e. 0x60<<7 == 0x3000 */
   377 #define UCASE_EXC_DOT_SHIFT     7
   379 /* normally stored in the main word, but pushed out for larger exception indexes */
   380 #define UCASE_EXC_DOT_MASK      0x3000 /* bits 13..12 */
   381 enum {
   382     UCASE_EXC_NO_DOT=0,             /* UCASE_NO_DOT<<UCASE_EXC_DOT_SHIFT */
   383     UCASE_EXC_SOFT_DOTTED=0x1000,   /* UCASE_SOFT_DOTTED<<UCASE_EXC_DOT_SHIFT */
   384     UCASE_EXC_ABOVE=0x2000,         /* "above" accents with cc=230 */
   385     UCASE_EXC_OTHER_ACCENT=0x3000   /* other accent character (0<cc!=230) */
   386 };
   388 /* complex/conditional mappings */
   389 #define UCASE_EXC_CONDITIONAL_SPECIAL   0x4000 /* bit 14 */
   390 #define UCASE_EXC_CONDITIONAL_FOLD      0x8000 /* bit 15 */
   392 /* definitions for lengths word for full case mappings: four 4-bit masks, one per mapping kind */
   393 #define UCASE_FULL_LOWER    0xf     /* bits 3..0 */
   394 #define UCASE_FULL_FOLDING  0xf0    /* bits 7..4 */
   395 #define UCASE_FULL_UPPER    0xf00   /* bits 11..8 */
   396 #define UCASE_FULL_TITLE    0xf000  /* bits 15..12 */
   398 /* maximum lengths */
   399 #define UCASE_FULL_MAPPINGS_MAX_LENGTH (4*0xf) /* 60: four lengths of at most 0xf each */
   400 #define UCASE_CLOSURE_MAX_LENGTH 0xf /* 15 */
   402 /* constants for reverse case folding ("unfold") data; values are implicit (0..2), presumably indexes of header fields at the start of that data -- TODO confirm */
   403 enum {
   404     UCASE_UNFOLD_ROWS,          /* 0 */
   405     UCASE_UNFOLD_ROW_WIDTH,     /* 1 */
   406     UCASE_UNFOLD_STRING_WIDTH   /* 2 */
   407 };
   409 #endif

mercurial