intl/icu/source/common/ucase.cpp

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

     1 /*
     2 *******************************************************************************
     3 *
     4 *   Copyright (C) 2004-2012, International Business Machines
     5 *   Corporation and others.  All Rights Reserved.
     6 *
     7 *******************************************************************************
     8 *   file name:  ucase.cpp
     9 *   encoding:   US-ASCII
    10 *   tab size:   8 (not used)
    11 *   indentation:4
    12 *
    13 *   created on: 2004aug30
    14 *   created by: Markus W. Scherer
    15 *
    16 *   Low-level Unicode character/string case mapping code.
    17 *   Much code moved here (and modified) from uchar.c.
    18 */
    20 #include "unicode/utypes.h"
    21 #include "unicode/unistr.h"
    22 #include "unicode/uset.h"
    23 #include "unicode/udata.h" /* UDataInfo */
    24 #include "unicode/utf16.h"
    25 #include "ucmndata.h" /* DataHeader */
    26 #include "udatamem.h"
    27 #include "umutex.h"
    28 #include "uassert.h"
    29 #include "cmemory.h"
    30 #include "utrie2.h"
    31 #include "ucase.h"
    32 #include "ucln_cmn.h"
    34 struct UCaseProps {
       /* Case-mapping property data as used by the functions in this file. */
    35     UDataMemory *mem;
    36     const int32_t *indexes;
       /* exception words; GET_EXCEPTIONS() offsets into this array */
    37     const uint16_t *exceptions;
       /* reverse case folding table; ucase_addStringCaseClosure() treats NULL as "no data" */
    38     const uint16_t *unfold;
       /* maps a code point to its 16-bit properties word (read via UTRIE2_GET16) */
    40     UTrie2 trie;
    41     uint8_t formatVersion[4];
    42 };
    44 /* ucase_props_data.h is machine-generated by gencase --csource */
    45 #define INCLUDED_FROM_UCASE_CPP
    46 #include "ucase_props_data.h"
    48 /* UCaseProps singleton ----------------------------------------------------- */
    50 U_CAPI const UCaseProps * U_EXPORT2
    51 ucase_getSingleton() {
    52     return &ucase_props_singleton;
    53 }
    55 /* set of property starts for UnicodeSet ------------------------------------ */
    57 static UBool U_CALLCONV
    58 _enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) {
    59     /* add the start code point to the USet */
    60     const USetAdder *sa=(const USetAdder *)context;
    61     sa->add(sa->set, start);
    62     return TRUE;
    63 }
    65 U_CFUNC void U_EXPORT2
    66 ucase_addPropertyStarts(const UCaseProps *csp, const USetAdder *sa, UErrorCode *pErrorCode) {
    67     if(U_FAILURE(*pErrorCode)) {
    68         return;
    69     }
    71     /* add the start code point of each same-value range of the trie */
    72     utrie2_enum(&csp->trie, NULL, _enumPropertyStartsRange, sa);
    74     /* add code points with hardcoded properties, plus the ones following them */
    76     /* (none right now, see comment below) */
    78     /*
    79      * Omit code points with hardcoded specialcasing properties
    80      * because we do not build property UnicodeSets for them right now.
    81      */
    82 }
    84 /* data access primitives --------------------------------------------------- */
       /*
        * GET_EXCEPTIONS(csp, props): pointer into csp->exceptions at the offset
        *   stored in the bits of props above UCASE_EXC_SHIFT.
        * PROPS_HAS_EXCEPTION(props): nonzero if the UCASE_EXCEPTION bit is set in props;
        *   callers in this file use GET_EXCEPTIONS() only when this is nonzero.
        */
    86 #define GET_EXCEPTIONS(csp, props) ((csp)->exceptions+((props)>>UCASE_EXC_SHIFT))
    88 #define PROPS_HAS_EXCEPTION(props) ((props)&UCASE_EXCEPTION)
    90 /* flagsOffset[b] is the number of 1-bits in the 8-bit value b (lookup table for SLOT_OFFSET) */
    91 static const uint8_t flagsOffset[256]={
    92     0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
    93     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    94     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    95     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    96     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    97     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    98     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    99     3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
   100     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
   101     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
   102     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
   103     3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
   104     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
   105     3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
   106     3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
   107     4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
   108 };
       /*
        * HAS_SLOT(flags, idx): nonzero if bit idx is set in flags.
        * SLOT_OFFSET(flags, idx): number of set bits in flags below bit idx,
        *   i.e. how many present slots come before slot idx.
        *   The masked value indexes the 256-entry table, so idx must not exceed 8.
        */
   110 #define HAS_SLOT(flags, idx) ((flags)&(1<<(idx)))
   111 #define SLOT_OFFSET(flags, idx) flagsOffset[(flags)&((1<<(idx))-1)]
   113 /*
   114  * Get the value of an optional-value slot where HAS_SLOT(excWord, idx).
        * Slots are one uint16_t each unless UCASE_EXC_DOUBLE_SLOTS is set in excWord,
        * in which case each slot is two uint16_t, combined high half first.
   115  *
        * Usage notes:
        * - The macro expands to a bare if/else statement, so use it only as a
        *   complete statement inside braces (never as the unbraced body of an if).
        * - pExc16 is evaluated and modified several times; pass a plain pointer variable.
        *
   116  * @param excWord (in) initial exceptions word
   117  * @param idx (in) desired slot index
   118  * @param pExc16 (in/out) const uint16_t * after excWord=*pExc16++;
   119  *               moved to the last uint16_t of the value, use +1 for beginning of next slot
   120  * @param value (out) int32_t or uint32_t output if hasSlot, otherwise not modified
   121  */
   122 #define GET_SLOT_VALUE(excWord, idx, pExc16, value) \
   123     if(((excWord)&UCASE_EXC_DOUBLE_SLOTS)==0) { \
   124         (pExc16)+=SLOT_OFFSET(excWord, idx); \
   125         (value)=*pExc16; \
   126     } else { \
   127         (pExc16)+=2*SLOT_OFFSET(excWord, idx); \
   128         (value)=*pExc16++; \
   129         (value)=((value)<<16)|*pExc16; \
   130     }
   132 /* simple case mappings ----------------------------------------------------- */
   134 U_CAPI UChar32 U_EXPORT2
   135 ucase_tolower(const UCaseProps *csp, UChar32 c) {
   136     uint16_t props=UTRIE2_GET16(&csp->trie, c);
   137     if(!PROPS_HAS_EXCEPTION(props)) {
   138         if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
   139             c+=UCASE_GET_DELTA(props);
   140         }
   141     } else {
   142         const uint16_t *pe=GET_EXCEPTIONS(csp, props);
   143         uint16_t excWord=*pe++;
   144         if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
   145             GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe, c);
   146         }
   147     }
   148     return c;
   149 }
   151 U_CAPI UChar32 U_EXPORT2
   152 ucase_toupper(const UCaseProps *csp, UChar32 c) {
   153     uint16_t props=UTRIE2_GET16(&csp->trie, c);
   154     if(!PROPS_HAS_EXCEPTION(props)) {
   155         if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
   156             c+=UCASE_GET_DELTA(props);
   157         }
   158     } else {
   159         const uint16_t *pe=GET_EXCEPTIONS(csp, props);
   160         uint16_t excWord=*pe++;
   161         if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
   162             GET_SLOT_VALUE(excWord, UCASE_EXC_UPPER, pe, c);
   163         }
   164     }
   165     return c;
   166 }
   168 U_CAPI UChar32 U_EXPORT2
   169 ucase_totitle(const UCaseProps *csp, UChar32 c) {
   170     uint16_t props=UTRIE2_GET16(&csp->trie, c);
   171     if(!PROPS_HAS_EXCEPTION(props)) {
   172         if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
   173             c+=UCASE_GET_DELTA(props);
   174         }
   175     } else {
   176         const uint16_t *pe=GET_EXCEPTIONS(csp, props);
   177         uint16_t excWord=*pe++;
   178         int32_t idx;
   179         if(HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
   180             idx=UCASE_EXC_TITLE;
   181         } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
   182             idx=UCASE_EXC_UPPER;
   183         } else {
   184             return c;
   185         }
   186         GET_SLOT_VALUE(excWord, idx, pe, c);
   187     }
   188     return c;
   189 }
   191 static const UChar iDot[2] = { 0x69, 0x307 };
   192 static const UChar jDot[2] = { 0x6a, 0x307 };
   193 static const UChar iOgonekDot[3] = { 0x12f, 0x307 };
   194 static const UChar iDotGrave[3] = { 0x69, 0x307, 0x300 };
   195 static const UChar iDotAcute[3] = { 0x69, 0x307, 0x301 };
   196 static const UChar iDotTilde[3] = { 0x69, 0x307, 0x303 };
   199 U_CFUNC void U_EXPORT2
   200 ucase_addCaseClosure(const UCaseProps *csp, UChar32 c, const USetAdder *sa) {
   201     uint16_t props;
   203     /*
   204      * Hardcode the case closure of i and its relatives and ignore the
   205      * data file data for these characters.
   206      * The Turkic dotless i and dotted I with their case mapping conditions
   207      * and case folding option make the related characters behave specially.
   208      * This code matches their closure behavior to their case folding behavior.
   209      */
   211     switch(c) {
   212     case 0x49:
   213         /* regular i and I are in one equivalence class */
   214         sa->add(sa->set, 0x69);
   215         return;
   216     case 0x69:
   217         sa->add(sa->set, 0x49);
   218         return;
   219     case 0x130:
   220         /* dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>) */
   221         sa->addString(sa->set, iDot, 2);
   222         return;
   223     case 0x131:
   224         /* dotless i is in a class by itself */
   225         return;
   226     default:
   227         /* otherwise use the data file data */
   228         break;
   229     }
   231     props=UTRIE2_GET16(&csp->trie, c);
   232     if(!PROPS_HAS_EXCEPTION(props)) {
   233         if(UCASE_GET_TYPE(props)!=UCASE_NONE) {
   234             /* add the one simple case mapping, no matter what type it is */
   235             int32_t delta=UCASE_GET_DELTA(props);
   236             if(delta!=0) {
   237                 sa->add(sa->set, c+delta);
   238             }
   239         }
   240     } else {
   241         /*
   242          * c has exceptions, so there may be multiple simple and/or
   243          * full case mappings. Add them all.
   244          */
   245         const uint16_t *pe0, *pe=GET_EXCEPTIONS(csp, props);
   246         const UChar *closure;
   247         uint16_t excWord=*pe++;
   248         int32_t idx, closureLength, fullLength, length;
   250         pe0=pe;
   252         /* add all simple case mappings */
   253         for(idx=UCASE_EXC_LOWER; idx<=UCASE_EXC_TITLE; ++idx) {
   254             if(HAS_SLOT(excWord, idx)) {
   255                 pe=pe0;
   256                 GET_SLOT_VALUE(excWord, idx, pe, c);
   257                 sa->add(sa->set, c);
   258             }
   259         }
   261         /* get the closure string pointer & length */
   262         if(HAS_SLOT(excWord, UCASE_EXC_CLOSURE)) {
   263             pe=pe0;
   264             GET_SLOT_VALUE(excWord, UCASE_EXC_CLOSURE, pe, closureLength);
   265             closureLength&=UCASE_CLOSURE_MAX_LENGTH; /* higher bits are reserved */
   266             closure=(const UChar *)pe+1; /* behind this slot, unless there are full case mappings */
   267         } else {
   268             closureLength=0;
   269             closure=NULL;
   270         }
   272         /* add the full case folding */
   273         if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
   274             pe=pe0;
   275             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, fullLength);
   277             /* start of full case mapping strings */
   278             ++pe;
   280             fullLength&=0xffff; /* bits 16 and higher are reserved */
   282             /* skip the lowercase result string */
   283             pe+=fullLength&UCASE_FULL_LOWER;
   284             fullLength>>=4;
   286             /* add the full case folding string */
   287             length=fullLength&0xf;
   288             if(length!=0) {
   289                 sa->addString(sa->set, (const UChar *)pe, length);
   290                 pe+=length;
   291             }
   293             /* skip the uppercase and titlecase strings */
   294             fullLength>>=4;
   295             pe+=fullLength&0xf;
   296             fullLength>>=4;
   297             pe+=fullLength;
   299             closure=(const UChar *)pe; /* behind full case mappings */
   300         }
   302         /* add each code point in the closure string */
   303         for(idx=0; idx<closureLength;) {
   304             U16_NEXT_UNSAFE(closure, idx, c);
   305             sa->add(sa->set, c);
   306         }
   307     }
   308 }
   310 /*
   311  * compare s, which has a length, with t, which has a maximum length or is NUL-terminated
   312  * must be length>0 and max>0 and length<=max
   313  */
   314 static inline int32_t
   315 strcmpMax(const UChar *s, int32_t length, const UChar *t, int32_t max) {
   316     int32_t c1, c2;
   318     max-=length; /* we require length<=max, so no need to decrement max in the loop */
   319     do {
   320         c1=*s++;
   321         c2=*t++;
   322         if(c2==0) {
   323             return 1; /* reached the end of t but not of s */
   324         }
   325         c1-=c2;
   326         if(c1!=0) {
   327             return c1; /* return difference result */
   328         }
   329     } while(--length>0);
   330     /* ends with length==0 */
   332     if(max==0 || *t==0) {
   333         return 0; /* equal to length of both strings */
   334     } else {
   335         return -max; /* return lengh difference */
   336     }
   337 }
   339 U_CFUNC UBool U_EXPORT2
   340 ucase_addStringCaseClosure(const UCaseProps *csp, const UChar *s, int32_t length, const USetAdder *sa) {
   341     int32_t i, start, limit, result, unfoldRows, unfoldRowWidth, unfoldStringWidth;
   343     if(csp->unfold==NULL || s==NULL) {
   344         return FALSE; /* no reverse case folding data, or no string */
   345     }
   346     if(length<=1) {
   347         /* the string is too short to find any match */
   348         /*
   349          * more precise would be:
   350          * if(!u_strHasMoreChar32Than(s, length, 1))
   351          * but this does not make much practical difference because
   352          * a single supplementary code point would just not be found
   353          */
   354         return FALSE;
   355     }
   357     const uint16_t *unfold=csp->unfold;
   358     unfoldRows=unfold[UCASE_UNFOLD_ROWS];
   359     unfoldRowWidth=unfold[UCASE_UNFOLD_ROW_WIDTH];
   360     unfoldStringWidth=unfold[UCASE_UNFOLD_STRING_WIDTH];
   361     unfold+=unfoldRowWidth;
   363     if(length>unfoldStringWidth) {
   364         /* the string is too long to find any match */
   365         return FALSE;
   366     }
   368     /* do a binary search for the string */
   369     start=0;
   370     limit=unfoldRows;
   371     while(start<limit) {
   372         i=(start+limit)/2;
   373         const UChar *p=reinterpret_cast<const UChar *>(unfold+(i*unfoldRowWidth));
   374         result=strcmpMax(s, length, p, unfoldStringWidth);
   376         if(result==0) {
   377             /* found the string: add each code point, and its case closure */
   378             UChar32 c;
   380             for(i=unfoldStringWidth; i<unfoldRowWidth && p[i]!=0;) {
   381                 U16_NEXT_UNSAFE(p, i, c);
   382                 sa->add(sa->set, c);
   383                 ucase_addCaseClosure(csp, c, sa);
   384             }
   385             return TRUE;
   386         } else if(result<0) {
   387             limit=i;
   388         } else /* result>0 */ {
   389             start=i+1;
   390         }
   391     }
   393     return FALSE; /* string not found */
   394 }
   396 U_NAMESPACE_BEGIN
   398 FullCaseFoldingIterator::FullCaseFoldingIterator()
   399         : unfold(reinterpret_cast<const UChar *>(ucase_props_singleton.unfold)),
   400           unfoldRows(unfold[UCASE_UNFOLD_ROWS]),
   401           unfoldRowWidth(unfold[UCASE_UNFOLD_ROW_WIDTH]),
   402           unfoldStringWidth(unfold[UCASE_UNFOLD_STRING_WIDTH]),
   403           currentRow(0),
   404           rowCpIndex(unfoldStringWidth) {
   405     unfold+=unfoldRowWidth;
   406 }
   408 UChar32
   409 FullCaseFoldingIterator::next(UnicodeString &full) {
   410     // Advance past the last-delivered code point.
   411     const UChar *p=unfold+(currentRow*unfoldRowWidth);
   412     if(rowCpIndex>=unfoldRowWidth || p[rowCpIndex]==0) {
   413         ++currentRow;
   414         p+=unfoldRowWidth;
   415         rowCpIndex=unfoldStringWidth;
   416     }
   417     if(currentRow>=unfoldRows) { return U_SENTINEL; }
   418     // Set "full" to the NUL-terminated string in the first unfold column.
   419     int32_t length=unfoldStringWidth;
   420     while(length>0 && p[length-1]==0) { --length; }
   421     full.setTo(FALSE, p, length);
   422     // Return the code point.
   423     UChar32 c;
   424     U16_NEXT_UNSAFE(p, rowCpIndex, c);
   425     return c;
   426 }
   428 U_NAMESPACE_END
   430 /** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */
   431 U_CAPI int32_t U_EXPORT2
   432 ucase_getType(const UCaseProps *csp, UChar32 c) {
   433     uint16_t props=UTRIE2_GET16(&csp->trie, c);
   434     return UCASE_GET_TYPE(props);
   435 }
   437 /** @return same as ucase_getType() and set bit 2 if c is case-ignorable */
   438 U_CAPI int32_t U_EXPORT2
   439 ucase_getTypeOrIgnorable(const UCaseProps *csp, UChar32 c) {
   440     uint16_t props=UTRIE2_GET16(&csp->trie, c);
   441     return UCASE_GET_TYPE_AND_IGNORABLE(props);
   442 }
   444 /** @return UCASE_NO_DOT, UCASE_SOFT_DOTTED, UCASE_ABOVE, UCASE_OTHER_ACCENT */
   445 static inline int32_t
   446 getDotType(const UCaseProps *csp, UChar32 c) {
   447     uint16_t props=UTRIE2_GET16(&csp->trie, c);
   448     if(!PROPS_HAS_EXCEPTION(props)) {
   449         return props&UCASE_DOT_MASK;
   450     } else {
   451         const uint16_t *pe=GET_EXCEPTIONS(csp, props);
   452         return (*pe>>UCASE_EXC_DOT_SHIFT)&UCASE_DOT_MASK;
   453     }
   454 }
   456 U_CAPI UBool U_EXPORT2
   457 ucase_isSoftDotted(const UCaseProps *csp, UChar32 c) {
   458     return (UBool)(getDotType(csp, c)==UCASE_SOFT_DOTTED);
   459 }
   461 U_CAPI UBool U_EXPORT2
   462 ucase_isCaseSensitive(const UCaseProps *csp, UChar32 c) {
   463     uint16_t props=UTRIE2_GET16(&csp->trie, c);
   464     return (UBool)((props&UCASE_SENSITIVE)!=0);
   465 }
   467 /* string casing ------------------------------------------------------------ */
   469 /*
   470  * These internal functions form the core of string case mappings.
   471  * They map single code points to result code points or strings and take
   472  * all necessary conditions (context, locale ID, options) into account.
   473  *
   474  * They do not iterate over the source or write to the destination
   475  * so that the same functions are useful for non-standard string storage,
   476  * such as in a Replaceable (for Transliterator) or UTF-8/32 strings etc.
   477  * For the same reason, the "surrounding text" context is passed in as a
   478  * UCaseContextIterator which does not make any assumptions about
   479  * the underlying storage.
   480  *
   481  * This section contains helper functions that check for conditions
   482  * in the input text surrounding the current code point
   483  * according to SpecialCasing.txt.
   484  *
   485  * Each helper function gets the index
   486  * - after the current code point if it looks at following text
   487  * - before the current code point if it looks at preceding text
   488  *
   489  * Unicode 3.2 UAX 21 "Case Mappings" defines the conditions as follows:
   490  *
   491  * Final_Sigma
   492  *   C is preceded by a sequence consisting of
   493  *     a cased letter and a case-ignorable sequence,
   494  *   and C is not followed by a sequence consisting of
   495  *     an ignorable sequence and then a cased letter.
   496  *
   497  * More_Above
   498  *   C is followed by one or more characters of combining class 230 (ABOVE)
   499  *   in the combining character sequence.
   500  *
   501  * After_Soft_Dotted
   502  *   The last preceding character with combining class of zero before C
   503  *   was Soft_Dotted,
   504  *   and there is no intervening combining character class 230 (ABOVE).
   505  *
   506  * Before_Dot
   507  *   C is followed by combining dot above (U+0307).
   508  *   Any sequence of characters with a combining class that is neither 0 nor 230
   509  *   may intervene between the current character and the combining dot above.
   510  *
   511  * The erratum from 2002-10-31 adds the condition
   512  *
   513  * After_I
   514  *   The last preceding base character was an uppercase I, and there is no
   515  *   intervening combining character class 230 (ABOVE).
   516  *
   517  *   (See Jitterbug 2344 and the comments on After_I below.)
   518  *
   519  * Helper definitions in Unicode 3.2 UAX 21:
   520  *
   521  * D1. A character C is defined to be cased
   522  *     if it meets any of the following criteria:
   523  *
   524  *   - The general category of C is Titlecase Letter (Lt)
   525  *   - In [CoreProps], C has one of the properties Uppercase, or Lowercase
   526  *   - Given D = NFD(C), then it is not the case that:
   527  *     D = UCD_lower(D) = UCD_upper(D) = UCD_title(D)
   528 *     (This third criterion does not add any characters to the list
   529  *      for Unicode 3.2. Ignored.)
   530  *
   531  * D2. A character C is defined to be case-ignorable
   532  *     if it meets either of the following criteria:
   533  *
   534  *   - The general category of C is
   535  *     Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or
   536  *     Letter Modifier (Lm), or Symbol Modifier (Sk)
   537  *   - C is one of the following characters 
   538  *     U+0027 APOSTROPHE
   539  *     U+00AD SOFT HYPHEN (SHY)
   540  *     U+2019 RIGHT SINGLE QUOTATION MARK
   541  *            (the preferred character for apostrophe)
   542  *
   543  * D3. A case-ignorable sequence is a sequence of
   544  *     zero or more case-ignorable characters.
   545  */
   547 #define is_a(c) ((c)=='a' || (c)=='A')
   548 #define is_d(c) ((c)=='d' || (c)=='D')
   549 #define is_e(c) ((c)=='e' || (c)=='E')
   550 #define is_i(c) ((c)=='i' || (c)=='I')
   551 #define is_l(c) ((c)=='l' || (c)=='L')
   552 #define is_n(c) ((c)=='n' || (c)=='N')
   553 #define is_r(c) ((c)=='r' || (c)=='R')
   554 #define is_t(c) ((c)=='t' || (c)=='T')
   555 #define is_u(c) ((c)=='u' || (c)=='U')
   556 #define is_z(c) ((c)=='z' || (c)=='Z')
       /*
        * Each is_X(c) macro above tests c against one letter in either case.
        * The argument is evaluated up to twice, so it must have no side effects.
        */
   558 /* separator: '_', '-' or the terminating NUL; ucase_getCaseLocale() uses it to check that a language code ends here */
   559 #define is_sep(c) ((c)=='_' || (c)=='-' || (c)==0)
   561 /**
   562  * Requires non-NULL locale ID but otherwise does the equivalent of
   563  * checking for language codes as if uloc_getLanguage() were called:
   564  * Accepts both 2- and 3-letter codes and accepts case variants.
   565  */
   566 U_CFUNC int32_t
   567 ucase_getCaseLocale(const char *locale, int32_t *locCache) {
   568     int32_t result;
   569     char c;
   571     if(locCache!=NULL && (result=*locCache)!=UCASE_LOC_UNKNOWN) {
   572         return result;
   573     }
   575     result=UCASE_LOC_ROOT;
   577     /*
   578      * This function used to use uloc_getLanguage(), but the current code
   579      * removes the dependency of this low-level code on uloc implementation code
   580      * and is faster because not the whole locale ID has to be
   581      * examined and copied/transformed.
   582      *
   583      * Because this code does not want to depend on uloc, the caller must
   584      * pass in a non-NULL locale, i.e., may need to call uloc_getDefault().
   585      */
   586     c=*locale++;
   587     if(is_t(c)) {
   588         /* tr or tur? */
   589         c=*locale++;
   590         if(is_u(c)) {
   591             c=*locale++;
   592         }
   593         if(is_r(c)) {
   594             c=*locale;
   595             if(is_sep(c)) {
   596                 result=UCASE_LOC_TURKISH;
   597             }
   598         }
   599     } else if(is_a(c)) {
   600         /* az or aze? */
   601         c=*locale++;
   602         if(is_z(c)) {
   603             c=*locale++;
   604             if(is_e(c)) {
   605                 c=*locale;
   606             }
   607             if(is_sep(c)) {
   608                 result=UCASE_LOC_TURKISH;
   609             }
   610         }
   611     } else if(is_l(c)) {
   612         /* lt or lit? */
   613         c=*locale++;
   614         if(is_i(c)) {
   615             c=*locale++;
   616         }
   617         if(is_t(c)) {
   618             c=*locale;
   619             if(is_sep(c)) {
   620                 result=UCASE_LOC_LITHUANIAN;
   621             }
   622         }
   623     } else if(is_n(c)) {
   624         /* nl or nld? */
   625         c=*locale++;
   626         if(is_l(c)) {
   627             c=*locale++;
   628             if(is_d(c)) {
   629                 c=*locale;
   630             }
   631             if(is_sep(c)) {
   632                 result=UCASE_LOC_DUTCH;
   633             }
   634         }
   635     }
   637     if(locCache!=NULL) {
   638         *locCache=result;
   639     }
   640     return result;
   641 }
   643 /*
   644  * Is followed by
   645  *   {case-ignorable}* cased
   646  * ?
   647  * (dir determines looking forward/backward)
   648  * If a character is case-ignorable, it is skipped regardless of whether
   649  * it is also cased or not.
   650  */
   651 static UBool
   652 isFollowedByCasedLetter(const UCaseProps *csp, UCaseContextIterator *iter, void *context, int8_t dir) {
   653     UChar32 c;
   655     if(iter==NULL) {
   656         return FALSE;
   657     }
   659     for(/* dir!=0 sets direction */; (c=iter(context, dir))>=0; dir=0) {
   660         int32_t type=ucase_getTypeOrIgnorable(csp, c);
   661         if(type&4) {
   662             /* case-ignorable, continue with the loop */
   663         } else if(type!=UCASE_NONE) {
   664             return TRUE; /* followed by cased letter */
   665         } else {
   666             return FALSE; /* uncased and not case-ignorable */
   667         }
   668     }
   670     return FALSE; /* not followed by cased letter */
   671 }
   673 /* Is preceded by Soft_Dotted character with no intervening cc=230 ? */
   674 static UBool
   675 isPrecededBySoftDotted(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
   676     UChar32 c;
   677     int32_t dotType;
   678     int8_t dir;
   680     if(iter==NULL) {
   681         return FALSE;
   682     }
   684     for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
   685         dotType=getDotType(csp, c);
   686         if(dotType==UCASE_SOFT_DOTTED) {
   687             return TRUE; /* preceded by TYPE_i */
   688         } else if(dotType!=UCASE_OTHER_ACCENT) {
   689             return FALSE; /* preceded by different base character (not TYPE_i), or intervening cc==230 */
   690         }
   691     }
   693     return FALSE; /* not preceded by TYPE_i */
   694 }
   696 /*
   697  * See Jitterbug 2344:
   698  * The condition After_I for Turkic-lowercasing of U+0307 combining dot above
   699  * is checked in ICU 2.0, 2.1, 2.6 but was not in 2.2 & 2.4 because
   700  * we made those releases compatible with Unicode 3.2 which had not fixed
   701  * a related bug in SpecialCasing.txt.
   702  *
   703  * From the Jitterbug 2344 text:
   704  * ... this bug is listed as a Unicode erratum
   705  * from 2002-10-31 at http://www.unicode.org/uni2errata/UnicodeErrata.html
   706  * <quote>
   707  * There are two errors in SpecialCasing.txt.
   708  * 1. Missing semicolons on two lines. ... [irrelevant for ICU]
   709  * 2. An incorrect context definition. Correct as follows:
   710  * < 0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE
   711  * < 0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE
   712  * ---
   713  * > 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
   714  * > 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
   715  * where the context After_I is defined as:
   716  * The last preceding base character was an uppercase I, and there is no
   717  * intervening combining character class 230 (ABOVE).
   718  * </quote>
   719  *
   720  * Note that SpecialCasing.txt even in Unicode 3.2 described the condition as:
   721  *
   722  * # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
   723  * # This matches the behavior of the canonically equivalent I-dot_above
   724  *
   725  * See also the description in this place in older versions of uchar.c (revision 1.100).
   726  *
   727  * Markus W. Scherer 2003-feb-15
   728  */
   730 /* Is preceded by base character 'I' with no intervening cc=230 ? */
   731 static UBool
   732 isPrecededBy_I(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
   733     UChar32 c;
   734     int32_t dotType;
   735     int8_t dir;
   737     if(iter==NULL) {
   738         return FALSE;
   739     }
   741     for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
   742         if(c==0x49) {
   743             return TRUE; /* preceded by I */
   744         }
   745         dotType=getDotType(csp, c);
   746         if(dotType!=UCASE_OTHER_ACCENT) {
   747             return FALSE; /* preceded by different base character (not I), or intervening cc==230 */
   748         }
   749     }
   751     return FALSE; /* not preceded by I */
   752 }
   754 /* Is followed by one or more cc==230 ? */
   755 static UBool
   756 isFollowedByMoreAbove(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
   757     UChar32 c;
   758     int32_t dotType;
   759     int8_t dir;
   761     if(iter==NULL) {
   762         return FALSE;
   763     }
   765     for(dir=1; (c=iter(context, dir))>=0; dir=0) {
   766         dotType=getDotType(csp, c);
   767         if(dotType==UCASE_ABOVE) {
   768             return TRUE; /* at least one cc==230 following */
   769         } else if(dotType!=UCASE_OTHER_ACCENT) {
   770             return FALSE; /* next base character, no more cc==230 following */
   771         }
   772     }
   774     return FALSE; /* no more cc==230 following */
   775 }
   777 /* Is followed by a dot above (without cc==230 in between) ? */
   778 static UBool
   779 isFollowedByDotAbove(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
   780     UChar32 c;
   781     int32_t dotType;
   782     int8_t dir;
   784     if(iter==NULL) {
   785         return FALSE;
   786     }
   788     for(dir=1; (c=iter(context, dir))>=0; dir=0) {
   789         if(c==0x307) {
   790             return TRUE;
   791         }
   792         dotType=getDotType(csp, c);
   793         if(dotType!=UCASE_OTHER_ACCENT) {
   794             return FALSE; /* next base character or cc==230 in between */
   795         }
   796     }
   798     return FALSE; /* no dot above following */
   799 }
   801 U_CAPI int32_t U_EXPORT2
   802 ucase_toFullLower(const UCaseProps *csp, UChar32 c,
   803                   UCaseContextIterator *iter, void *context,
   804                   const UChar **pString,
   805                   const char *locale, int32_t *locCache)
   806 {
   807     UChar32 result=c;
   808     uint16_t props=UTRIE2_GET16(&csp->trie, c);
   809     if(!PROPS_HAS_EXCEPTION(props)) {
   810         if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
   811             result=c+UCASE_GET_DELTA(props);
   812         }
   813     } else {
   814         const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2;
   815         uint16_t excWord=*pe++;
   816         int32_t full;
   818         pe2=pe;
   820         if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
   821             /* use hardcoded conditions and mappings */
   822             int32_t loc=ucase_getCaseLocale(locale, locCache);
   824             /*
   825              * Test for conditional mappings first
   826              *   (otherwise the unconditional default mappings are always taken),
   827              * then test for characters that have unconditional mappings in SpecialCasing.txt,
   828              * then get the UnicodeData.txt mappings.
   829              */
   830             if( loc==UCASE_LOC_LITHUANIAN &&
   831                     /* base characters, find accents above */
   832                     (((c==0x49 || c==0x4a || c==0x12e) &&
   833                         isFollowedByMoreAbove(csp, iter, context)) ||
   834                     /* precomposed with accent above, no need to find one */
   835                     (c==0xcc || c==0xcd || c==0x128))
   836             ) {
   837                 /*
   838                     # Lithuanian
   840                     # Lithuanian retains the dot in a lowercase i when followed by accents.
   842                     # Introduce an explicit dot above when lowercasing capital I's and J's
   843                     # whenever there are more accents above.
   844                     # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
   846                     0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
   847                     004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
   848                     012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
   849                     00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
   850                     00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
   851                     0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
   852                  */
   853                 switch(c) {
   854                 case 0x49:  /* LATIN CAPITAL LETTER I */
   855                     *pString=iDot;
   856                     return 2;
   857                 case 0x4a:  /* LATIN CAPITAL LETTER J */
   858                     *pString=jDot;
   859                     return 2;
   860                 case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */
   861                     *pString=iOgonekDot;
   862                     return 2;
   863                 case 0xcc:  /* LATIN CAPITAL LETTER I WITH GRAVE */
   864                     *pString=iDotGrave;
   865                     return 3;
   866                 case 0xcd:  /* LATIN CAPITAL LETTER I WITH ACUTE */
   867                     *pString=iDotAcute;
   868                     return 3;
   869                 case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */
   870                     *pString=iDotTilde;
   871                     return 3;
   872                 default:
   873                     return 0; /* will not occur */
   874                 }
   875             /* # Turkish and Azeri */
   876             } else if(loc==UCASE_LOC_TURKISH && c==0x130) {
   877                 /*
   878                     # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
   879                     # The following rules handle those cases.
   881                     0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE
   882                     0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE
   883                  */
   884                 return 0x69;
   885             } else if(loc==UCASE_LOC_TURKISH && c==0x307 && isPrecededBy_I(csp, iter, context)) {
   886                 /*
   887                     # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
   888                     # This matches the behavior of the canonically equivalent I-dot_above
   890                     0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
   891                     0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
   892                  */
   893                 return 0; /* remove the dot (continue without output) */
   894             } else if(loc==UCASE_LOC_TURKISH && c==0x49 && !isFollowedByDotAbove(csp, iter, context)) {
   895                 /*
   896                     # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
   898                     0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
   899                     0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I
   900                  */
   901                 return 0x131;
   902             } else if(c==0x130) {
   903                 /*
   904                     # Preserve canonical equivalence for I with dot. Turkic is handled below.
   906                     0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
   907                  */
   908                 *pString=iDot;
   909                 return 2;
   910             } else if(  c==0x3a3 &&
   911                         !isFollowedByCasedLetter(csp, iter, context, 1) &&
   912                         isFollowedByCasedLetter(csp, iter, context, -1) /* -1=preceded */
   913             ) {
   914                 /* greek capital sigma maps depending on surrounding cased letters (see SpecialCasing.txt) */
   915                 /*
   916                     # Special case for final form of sigma
   918                     03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
   919                  */
   920                 return 0x3c2; /* greek small final sigma */
   921             } else {
   922                 /* no known conditional special case mapping, use a normal mapping */
   923             }
   924         } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
   925             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
   926             full&=UCASE_FULL_LOWER;
   927             if(full!=0) {
   928                 /* set the output pointer to the lowercase mapping */
   929                 *pString=reinterpret_cast<const UChar *>(pe+1);
   931                 /* return the string length */
   932                 return full;
   933             }
   934         }
   936         if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
   937             GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe2, result);
   938         }
   939     }
   941     return (result==c) ? ~result : result;
   942 }
   944 /* internal */
   945 static int32_t
   946 toUpperOrTitle(const UCaseProps *csp, UChar32 c,
   947                UCaseContextIterator *iter, void *context,
   948                const UChar **pString,
   949                const char *locale, int32_t *locCache,
   950                UBool upperNotTitle) {
   951     UChar32 result=c;
   952     uint16_t props=UTRIE2_GET16(&csp->trie, c);
   953     if(!PROPS_HAS_EXCEPTION(props)) {
   954         if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
   955             result=c+UCASE_GET_DELTA(props);
   956         }
   957     } else {
   958         const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2;
   959         uint16_t excWord=*pe++;
   960         int32_t full, idx;
   962         pe2=pe;
   964         if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
   965             /* use hardcoded conditions and mappings */
   966             int32_t loc=ucase_getCaseLocale(locale, locCache);
   968             if(loc==UCASE_LOC_TURKISH && c==0x69) {
   969                 /*
   970                     # Turkish and Azeri
   972                     # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
   973                     # The following rules handle those cases.
   975                     # When uppercasing, i turns into a dotted capital I
   977                     0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
   978                     0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
   979                 */
   980                 return 0x130;
   981             } else if(loc==UCASE_LOC_LITHUANIAN && c==0x307 && isPrecededBySoftDotted(csp, iter, context)) {
   982                 /*
   983                     # Lithuanian
   985                     # Lithuanian retains the dot in a lowercase i when followed by accents.
   987                     # Remove DOT ABOVE after "i" with upper or titlecase
   989                     0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
   990                  */
   991                 return 0; /* remove the dot (continue without output) */
   992             } else {
   993                 /* no known conditional special case mapping, use a normal mapping */
   994             }
   995         } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
   996             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
   998             /* start of full case mapping strings */
   999             ++pe;
  1001             /* skip the lowercase and case-folding result strings */
  1002             pe+=full&UCASE_FULL_LOWER;
  1003             full>>=4;
  1004             pe+=full&0xf;
  1005             full>>=4;
  1007             if(upperNotTitle) {
  1008                 full&=0xf;
  1009             } else {
  1010                 /* skip the uppercase result string */
  1011                 pe+=full&0xf;
  1012                 full=(full>>4)&0xf;
  1015             if(full!=0) {
  1016                 /* set the output pointer to the result string */
  1017                 *pString=reinterpret_cast<const UChar *>(pe);
  1019                 /* return the string length */
  1020                 return full;
  1024         if(!upperNotTitle && HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
  1025             idx=UCASE_EXC_TITLE;
  1026         } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
  1027             /* here, titlecase is same as uppercase */
  1028             idx=UCASE_EXC_UPPER;
  1029         } else {
  1030             return ~c;
  1032         GET_SLOT_VALUE(excWord, idx, pe2, result);
  1035     return (result==c) ? ~result : result;
  1038 U_CAPI int32_t U_EXPORT2
  1039 ucase_toFullUpper(const UCaseProps *csp, UChar32 c,
  1040                   UCaseContextIterator *iter, void *context,
  1041                   const UChar **pString,
  1042                   const char *locale, int32_t *locCache) {
  1043     return toUpperOrTitle(csp, c, iter, context, pString, locale, locCache, TRUE);
  1046 U_CAPI int32_t U_EXPORT2
  1047 ucase_toFullTitle(const UCaseProps *csp, UChar32 c,
  1048                   UCaseContextIterator *iter, void *context,
  1049                   const UChar **pString,
  1050                   const char *locale, int32_t *locCache) {
  1051     return toUpperOrTitle(csp, c, iter, context, pString, locale, locCache, FALSE);
  1054 /* case folding ------------------------------------------------------------- */
  1056 /*
  1057  * Case folding is similar to lowercasing.
  1058  * The result may be a simple mapping, i.e., a single code point, or
  1059  * a full mapping, i.e., a string.
  1060  * If the case folding for a code point is the same as its simple (1:1) lowercase mapping,
  1061  * then only the lowercase mapping is stored.
  1063  * Some special cases are hardcoded because their conditions cannot be
  1064  * parsed and processed from CaseFolding.txt.
  1066  * Unicode 3.2 CaseFolding.txt specifies for its status field:
  1068 # C: common case folding, common mappings shared by both simple and full mappings.
  1069 # F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces.
  1070 # S: simple case folding, mappings to single characters where different from F.
  1071 # T: special case for uppercase I and dotted uppercase I
  1072 #    - For non-Turkic languages, this mapping is normally not used.
  1073 #    - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters.
  1075 # Usage:
  1076 #  A. To do a simple case folding, use the mappings with status C + S.
  1077 #  B. To do a full case folding, use the mappings with status C + F.
  1079 #    The mappings with status T can be used or omitted depending on the desired case-folding
  1080 #    behavior. (The default option is to exclude them.)
  1082  * Unicode 3.2 has 'T' mappings as follows:
  1084 0049; T; 0131; # LATIN CAPITAL LETTER I
  1085 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE
  1087  * while the default mappings for these code points are:
  1089 0049; C; 0069; # LATIN CAPITAL LETTER I
  1090 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
  1092  * U+0130 has no simple case folding (simple-case-folds to itself).
  1093  */
  1095 /* return the simple case folding mapping for c */
  1096 U_CAPI UChar32 U_EXPORT2
  1097 ucase_fold(const UCaseProps *csp, UChar32 c, uint32_t options) {
  1098     uint16_t props=UTRIE2_GET16(&csp->trie, c);
  1099     if(!PROPS_HAS_EXCEPTION(props)) {
  1100         if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
  1101             c+=UCASE_GET_DELTA(props);
  1103     } else {
  1104         const uint16_t *pe=GET_EXCEPTIONS(csp, props);
  1105         uint16_t excWord=*pe++;
  1106         int32_t idx;
  1107         if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
  1108             /* special case folding mappings, hardcoded */
  1109             if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
  1110                 /* default mappings */
  1111                 if(c==0x49) {
  1112                     /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
  1113                     return 0x69;
  1114                 } else if(c==0x130) {
  1115                     /* no simple case folding for U+0130 */
  1116                     return c;
  1118             } else {
  1119                 /* Turkic mappings */
  1120                 if(c==0x49) {
  1121                     /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
  1122                     return 0x131;
  1123                 } else if(c==0x130) {
  1124                     /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
  1125                     return 0x69;
  1129         if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
  1130             idx=UCASE_EXC_FOLD;
  1131         } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
  1132             idx=UCASE_EXC_LOWER;
  1133         } else {
  1134             return c;
  1136         GET_SLOT_VALUE(excWord, idx, pe, c);
  1138     return c;
  1141 /*
  1142  * Issue for canonical caseless match (UAX #21):
  1143  * Turkic casefolding (using "T" mappings in CaseFolding.txt) does not preserve
  1144  * canonical equivalence, unlike default-option casefolding.
  1145  * For example, I-grave and I + grave fold to strings that are not canonically
  1146  * equivalent.
  1147  * For more details, see the comment in unorm_compare() in unorm.cpp
  1148  * and the intermediate prototype changes for Jitterbug 2021.
  1149  * (For example, revision 1.104 of uchar.c and 1.4 of CaseFolding.txt.)
  1151  * This did not get fixed because it appears that it is not possible to fix
  1152  * it for uppercase and lowercase characters (I-grave vs. i-grave)
  1153  * together in a way that they still fold to common result strings.
  1154  */
  1156 U_CAPI int32_t U_EXPORT2
  1157 ucase_toFullFolding(const UCaseProps *csp, UChar32 c,
  1158                     const UChar **pString,
  1159                     uint32_t options)
  1161     UChar32 result=c;
  1162     uint16_t props=UTRIE2_GET16(&csp->trie, c);
  1163     if(!PROPS_HAS_EXCEPTION(props)) {
  1164         if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
  1165             result=c+UCASE_GET_DELTA(props);
  1167     } else {
  1168         const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2;
  1169         uint16_t excWord=*pe++;
  1170         int32_t full, idx;
  1172         pe2=pe;
  1174         if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
  1175             /* use hardcoded conditions and mappings */
  1176             if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
  1177                 /* default mappings */
  1178                 if(c==0x49) {
  1179                     /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
  1180                     return 0x69;
  1181                 } else if(c==0x130) {
  1182                     /* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
  1183                     *pString=iDot;
  1184                     return 2;
  1186             } else {
  1187                 /* Turkic mappings */
  1188                 if(c==0x49) {
  1189                     /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
  1190                     return 0x131;
  1191                 } else if(c==0x130) {
  1192                     /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
  1193                     return 0x69;
  1196         } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
  1197             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
  1199             /* start of full case mapping strings */
  1200             ++pe;
  1202             /* skip the lowercase result string */
  1203             pe+=full&UCASE_FULL_LOWER;
  1204             full=(full>>4)&0xf;
  1206             if(full!=0) {
  1207                 /* set the output pointer to the result string */
  1208                 *pString=reinterpret_cast<const UChar *>(pe);
  1210                 /* return the string length */
  1211                 return full;
  1215         if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
  1216             idx=UCASE_EXC_FOLD;
  1217         } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
  1218             idx=UCASE_EXC_LOWER;
  1219         } else {
  1220             return ~c;
  1222         GET_SLOT_VALUE(excWord, idx, pe2, result);
  1225     return (result==c) ? ~result : result;
  1228 /* case mapping properties API ---------------------------------------------- */
  1230 #define GET_CASE_PROPS() &ucase_props_singleton
  1232 /* public API (see uchar.h) */
  1234 U_CAPI UBool U_EXPORT2
  1235 u_isULowercase(UChar32 c) {
  1236     return (UBool)(UCASE_LOWER==ucase_getType(GET_CASE_PROPS(), c));
  1239 U_CAPI UBool U_EXPORT2
  1240 u_isUUppercase(UChar32 c) {
  1241     return (UBool)(UCASE_UPPER==ucase_getType(GET_CASE_PROPS(), c));
  1244 /* Transforms the Unicode character to its lower case equivalent.*/
  1245 U_CAPI UChar32 U_EXPORT2
  1246 u_tolower(UChar32 c) {
  1247     return ucase_tolower(GET_CASE_PROPS(), c);
  1250 /* Transforms the Unicode character to its upper case equivalent.*/
  1251 U_CAPI UChar32 U_EXPORT2
  1252 u_toupper(UChar32 c) {
  1253     return ucase_toupper(GET_CASE_PROPS(), c);
  1256 /* Transforms the Unicode character to its title case equivalent.*/
  1257 U_CAPI UChar32 U_EXPORT2
  1258 u_totitle(UChar32 c) {
  1259     return ucase_totitle(GET_CASE_PROPS(), c);
  1262 /* return the simple case folding mapping for c */
  1263 U_CAPI UChar32 U_EXPORT2
  1264 u_foldCase(UChar32 c, uint32_t options) {
  1265     return ucase_fold(GET_CASE_PROPS(), c, options);
  1268 U_CFUNC int32_t U_EXPORT2
  1269 ucase_hasBinaryProperty(UChar32 c, UProperty which) {
  1270     /* case mapping properties */
  1271     const UChar *resultString;
  1272     int32_t locCache;
  1273     const UCaseProps *csp=GET_CASE_PROPS();
  1274     if(csp==NULL) {
  1275         return FALSE;
  1277     switch(which) {
  1278     case UCHAR_LOWERCASE:
  1279         return (UBool)(UCASE_LOWER==ucase_getType(csp, c));
  1280     case UCHAR_UPPERCASE:
  1281         return (UBool)(UCASE_UPPER==ucase_getType(csp, c));
  1282     case UCHAR_SOFT_DOTTED:
  1283         return ucase_isSoftDotted(csp, c);
  1284     case UCHAR_CASE_SENSITIVE:
  1285         return ucase_isCaseSensitive(csp, c);
  1286     case UCHAR_CASED:
  1287         return (UBool)(UCASE_NONE!=ucase_getType(csp, c));
  1288     case UCHAR_CASE_IGNORABLE:
  1289         return (UBool)(ucase_getTypeOrIgnorable(csp, c)>>2);
  1290     /*
  1291      * Note: The following Changes_When_Xyz are defined as testing whether
  1292      * the NFD form of the input changes when Xyz-case-mapped.
  1293      * However, this simpler implementation of these properties,
  1294      * ignoring NFD, passes the tests.
  1295      * The implementation needs to be changed if the tests start failing.
  1296      * When that happens, optimizations should be used to work with the
  1297      * per-single-code point ucase_toFullXyz() functions unless
  1298      * the NFD form has more than one code point,
  1299      * and the property starts set needs to be the union of the
  1300      * start sets for normalization and case mappings.
  1301      */
  1302     case UCHAR_CHANGES_WHEN_LOWERCASED:
  1303         locCache=UCASE_LOC_ROOT;
  1304         return (UBool)(ucase_toFullLower(csp, c, NULL, NULL, &resultString, "", &locCache)>=0);
  1305     case UCHAR_CHANGES_WHEN_UPPERCASED:
  1306         locCache=UCASE_LOC_ROOT;
  1307         return (UBool)(ucase_toFullUpper(csp, c, NULL, NULL, &resultString, "", &locCache)>=0);
  1308     case UCHAR_CHANGES_WHEN_TITLECASED:
  1309         locCache=UCASE_LOC_ROOT;
  1310         return (UBool)(ucase_toFullTitle(csp, c, NULL, NULL, &resultString, "", &locCache)>=0);
  1311     /* case UCHAR_CHANGES_WHEN_CASEFOLDED: -- in uprops.c */
  1312     case UCHAR_CHANGES_WHEN_CASEMAPPED:
  1313         locCache=UCASE_LOC_ROOT;
  1314         return (UBool)(
  1315             ucase_toFullLower(csp, c, NULL, NULL, &resultString, "", &locCache)>=0 ||
  1316             ucase_toFullUpper(csp, c, NULL, NULL, &resultString, "", &locCache)>=0 ||
  1317             ucase_toFullTitle(csp, c, NULL, NULL, &resultString, "", &locCache)>=0);
  1318     default:
  1319         return FALSE;

mercurial