The Tor Browser: intl/icu/source/common/ucase.cpp@b8a032363ba2

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

     1 /*

     2 *******************************************************************************

3 *

     4 *   Copyright (C) 2004-2012, International Business Machines

     5 *   Corporation and others.  All Rights Reserved.

6 *

     7 *******************************************************************************

     8 *   file name:  ucase.cpp

     9 *   encoding:   US-ASCII

    10 *   tab size:   8 (not used)

    11 *   indentation:4

    12 *

    13 *   created on: 2004aug30

    14 *   created by: Markus W. Scherer

    15 *

    16 *   Low-level Unicode character/string case mapping code.

    17 *   Much code moved here (and modified) from uchar.c.

    18 */

    20 #include "unicode/utypes.h"

    21 #include "unicode/unistr.h"

    22 #include "unicode/uset.h"

    23 #include "unicode/udata.h" /* UDataInfo */

    24 #include "unicode/utf16.h"

    25 #include "ucmndata.h" /* DataHeader */

    26 #include "udatamem.h"

    27 #include "umutex.h"

    28 #include "uassert.h"

    29 #include "cmemory.h"

    30 #include "utrie2.h"

    31 #include "ucase.h"

    32 #include "ucln_cmn.h"

    34 struct UCaseProps {

    35     UDataMemory *mem;

    36     const int32_t *indexes;

    37     const uint16_t *exceptions;

    38     const uint16_t *unfold;

    40     UTrie2 trie;

    41     uint8_t formatVersion[4];

    42 };

    44 /* ucase_props_data.h is machine-generated by gencase --csource */

    45 #define INCLUDED_FROM_UCASE_CPP

    46 #include "ucase_props_data.h"

    48 /* UCaseProps singleton ----------------------------------------------------- */

    50 U_CAPI const UCaseProps * U_EXPORT2

    51 ucase_getSingleton() {

    52     return &ucase_props_singleton;

    53 }

    55 /* set of property starts for UnicodeSet ------------------------------------ */

    57 static UBool U_CALLCONV

    58 _enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) {

    59     /* add the start code point to the USet */

    60     const USetAdder *sa=(const USetAdder *)context;

    61     sa->add(sa->set, start);

    62     return TRUE;

    63 }

    65 U_CFUNC void U_EXPORT2

    66 ucase_addPropertyStarts(const UCaseProps *csp, const USetAdder *sa, UErrorCode *pErrorCode) {

    67     if(U_FAILURE(*pErrorCode)) {

    68         return;

    69     }

    71     /* add the start code point of each same-value range of the trie */

    72     utrie2_enum(&csp->trie, NULL, _enumPropertyStartsRange, sa);

    74     /* add code points with hardcoded properties, plus the ones following them */

    76     /* (none right now, see comment below) */

    78     /*

    79      * Omit code points with hardcoded specialcasing properties

    80      * because we do not build property UnicodeSets for them right now.

    81      */

    82 }

    84 /* data access primitives --------------------------------------------------- */

    86 #define GET_EXCEPTIONS(csp, props) ((csp)->exceptions+((props)>>UCASE_EXC_SHIFT))

    88 #define PROPS_HAS_EXCEPTION(props) ((props)&UCASE_EXCEPTION)

    90 /* number of bits in an 8-bit integer value */

    91 static const uint8_t flagsOffset[256]={

    92     0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,

    93     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,

    94     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,

    95     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,

    96     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,

    97     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,

    98     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,

    99     3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,

   100     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,

   101     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,

   102     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,

   103     3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,

   104     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,

   105     3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,

   106     3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,

   107     4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8

   108 };

   110 #define HAS_SLOT(flags, idx) ((flags)&(1<<(idx)))

   111 #define SLOT_OFFSET(flags, idx) flagsOffset[(flags)&((1<<(idx))-1)]

   113 /*

   114  * Get the value of an optional-value slot where HAS_SLOT(excWord, idx).

   115  *

   116  * @param excWord (in) initial exceptions word

   117  * @param idx (in) desired slot index

   118  * @param pExc16 (in/out) const uint16_t * after excWord=*pExc16++;

   119  *               moved to the last uint16_t of the value, use +1 for beginning of next slot

   120  * @param value (out) int32_t or uint32_t output if hasSlot, otherwise not modified

   121  */

   122 #define GET_SLOT_VALUE(excWord, idx, pExc16, value) \

   123     if(((excWord)&UCASE_EXC_DOUBLE_SLOTS)==0) { \

   124         (pExc16)+=SLOT_OFFSET(excWord, idx); \

   125         (value)=*pExc16; \

   126     } else { \

   127         (pExc16)+=2*SLOT_OFFSET(excWord, idx); \

   128         (value)=*pExc16++; \

   129         (value)=((value)<<16)|*pExc16; \

   130     }

   132 /* simple case mappings ----------------------------------------------------- */

   134 U_CAPI UChar32 U_EXPORT2

   135 ucase_tolower(const UCaseProps *csp, UChar32 c) {

   136     uint16_t props=UTRIE2_GET16(&csp->trie, c);

   137     if(!PROPS_HAS_EXCEPTION(props)) {

   138         if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {

   139             c+=UCASE_GET_DELTA(props);

   140         }

   141     } else {

   142         const uint16_t *pe=GET_EXCEPTIONS(csp, props);

   143         uint16_t excWord=*pe++;

   144         if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {

   145             GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe, c);

   146         }

   147     }

   148     return c;

   149 }

   151 U_CAPI UChar32 U_EXPORT2

   152 ucase_toupper(const UCaseProps *csp, UChar32 c) {

   153     uint16_t props=UTRIE2_GET16(&csp->trie, c);

   154     if(!PROPS_HAS_EXCEPTION(props)) {

   155         if(UCASE_GET_TYPE(props)==UCASE_LOWER) {

   156             c+=UCASE_GET_DELTA(props);

   157         }

   158     } else {

   159         const uint16_t *pe=GET_EXCEPTIONS(csp, props);

   160         uint16_t excWord=*pe++;

   161         if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {

   162             GET_SLOT_VALUE(excWord, UCASE_EXC_UPPER, pe, c);

   163         }

   164     }

   165     return c;

   166 }

   168 U_CAPI UChar32 U_EXPORT2

   169 ucase_totitle(const UCaseProps *csp, UChar32 c) {

   170     uint16_t props=UTRIE2_GET16(&csp->trie, c);

   171     if(!PROPS_HAS_EXCEPTION(props)) {

   172         if(UCASE_GET_TYPE(props)==UCASE_LOWER) {

   173             c+=UCASE_GET_DELTA(props);

   174         }

   175     } else {

   176         const uint16_t *pe=GET_EXCEPTIONS(csp, props);

   177         uint16_t excWord=*pe++;

   178         int32_t idx;

   179         if(HAS_SLOT(excWord, UCASE_EXC_TITLE)) {

   180             idx=UCASE_EXC_TITLE;

   181         } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {

   182             idx=UCASE_EXC_UPPER;

   183         } else {

   184             return c;

   185         }

   186         GET_SLOT_VALUE(excWord, idx, pe, c);

   187     }

   188     return c;

   189 }

   191 static const UChar iDot[2] = { 0x69, 0x307 };

   192 static const UChar jDot[2] = { 0x6a, 0x307 };

   193 static const UChar iOgonekDot[3] = { 0x12f, 0x307 };

   194 static const UChar iDotGrave[3] = { 0x69, 0x307, 0x300 };

   195 static const UChar iDotAcute[3] = { 0x69, 0x307, 0x301 };

   196 static const UChar iDotTilde[3] = { 0x69, 0x307, 0x303 };

   199 U_CFUNC void U_EXPORT2

   200 ucase_addCaseClosure(const UCaseProps *csp, UChar32 c, const USetAdder *sa) {

   201     uint16_t props;

   203     /*

   204      * Hardcode the case closure of i and its relatives and ignore the

   205      * data file data for these characters.

   206      * The Turkic dotless i and dotted I with their case mapping conditions

   207      * and case folding option make the related characters behave specially.

   208      * This code matches their closure behavior to their case folding behavior.

   209      */

   211     switch(c) {

   212     case 0x49:

   213         /* regular i and I are in one equivalence class */

   214         sa->add(sa->set, 0x69);

   215         return;

   216     case 0x69:

   217         sa->add(sa->set, 0x49);

   218         return;

   219     case 0x130:

   220         /* dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>) */

   221         sa->addString(sa->set, iDot, 2);

   222         return;

   223     case 0x131:

   224         /* dotless i is in a class by itself */

   225         return;

   226     default:

   227         /* otherwise use the data file data */

   228         break;

   229     }

   231     props=UTRIE2_GET16(&csp->trie, c);

   232     if(!PROPS_HAS_EXCEPTION(props)) {

   233         if(UCASE_GET_TYPE(props)!=UCASE_NONE) {

   234             /* add the one simple case mapping, no matter what type it is */

   235             int32_t delta=UCASE_GET_DELTA(props);

   236             if(delta!=0) {

   237                 sa->add(sa->set, c+delta);

   238             }

   239         }

   240     } else {

   241         /*

   242          * c has exceptions, so there may be multiple simple and/or

   243          * full case mappings. Add them all.

   244          */

   245         const uint16_t *pe0, *pe=GET_EXCEPTIONS(csp, props);

   246         const UChar *closure;

   247         uint16_t excWord=*pe++;

   248         int32_t idx, closureLength, fullLength, length;

   250         pe0=pe;

   252         /* add all simple case mappings */

   253         for(idx=UCASE_EXC_LOWER; idx<=UCASE_EXC_TITLE; ++idx) {

   254             if(HAS_SLOT(excWord, idx)) {

   255                 pe=pe0;

   256                 GET_SLOT_VALUE(excWord, idx, pe, c);

   257                 sa->add(sa->set, c);

   258             }

   259         }

   261         /* get the closure string pointer & length */

   262         if(HAS_SLOT(excWord, UCASE_EXC_CLOSURE)) {

   263             pe=pe0;

   264             GET_SLOT_VALUE(excWord, UCASE_EXC_CLOSURE, pe, closureLength);

   265             closureLength&=UCASE_CLOSURE_MAX_LENGTH; /* higher bits are reserved */

   266             closure=(const UChar *)pe+1; /* behind this slot, unless there are full case mappings */

   267         } else {

   268             closureLength=0;

   269             closure=NULL;

   270         }

   272         /* add the full case folding */

   273         if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {

   274             pe=pe0;

   275             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, fullLength);

   277             /* start of full case mapping strings */

   278             ++pe;

   280             fullLength&=0xffff; /* bits 16 and higher are reserved */

   282             /* skip the lowercase result string */

   283             pe+=fullLength&UCASE_FULL_LOWER;

   284             fullLength>>=4;

   286             /* add the full case folding string */

   287             length=fullLength&0xf;

   288             if(length!=0) {

   289                 sa->addString(sa->set, (const UChar *)pe, length);

   290                 pe+=length;

   291             }

   293             /* skip the uppercase and titlecase strings */

   294             fullLength>>=4;

   295             pe+=fullLength&0xf;

   296             fullLength>>=4;

   297             pe+=fullLength;

   299             closure=(const UChar *)pe; /* behind full case mappings */

   300         }

   302         /* add each code point in the closure string */

   303         for(idx=0; idx<closureLength;) {

   304             U16_NEXT_UNSAFE(closure, idx, c);

   305             sa->add(sa->set, c);

   306         }

   307     }

   308 }

   310 /*

   311  * compare s, which has a length, with t, which has a maximum length or is NUL-terminated

   312  * must be length>0 and max>0 and length<=max

   313  */

   314 static inline int32_t

   315 strcmpMax(const UChar *s, int32_t length, const UChar *t, int32_t max) {

   316     int32_t c1, c2;

   318     max-=length; /* we require length<=max, so no need to decrement max in the loop */

   319     do {

   320         c1=*s++;

   321         c2=*t++;

   322         if(c2==0) {

   323             return 1; /* reached the end of t but not of s */

   324         }

   325         c1-=c2;

   326         if(c1!=0) {

   327             return c1; /* return difference result */

   328         }

   329     } while(--length>0);

   330     /* ends with length==0 */

   332     if(max==0 || *t==0) {

   333         return 0; /* equal to length of both strings */

   334     } else {

   335         return -max; /* return lengh difference */

   336     }

   337 }

   339 U_CFUNC UBool U_EXPORT2

   340 ucase_addStringCaseClosure(const UCaseProps *csp, const UChar *s, int32_t length, const USetAdder *sa) {

   341     int32_t i, start, limit, result, unfoldRows, unfoldRowWidth, unfoldStringWidth;

   343     if(csp->unfold==NULL || s==NULL) {

   344         return FALSE; /* no reverse case folding data, or no string */

   345     }

   346     if(length<=1) {

   347         /* the string is too short to find any match */

   348         /*

   349          * more precise would be:

   350          * if(!u_strHasMoreChar32Than(s, length, 1))

   351          * but this does not make much practical difference because

   352          * a single supplementary code point would just not be found

   353          */

   354         return FALSE;

   355     }

   357     const uint16_t *unfold=csp->unfold;

   358     unfoldRows=unfold[UCASE_UNFOLD_ROWS];

   359     unfoldRowWidth=unfold[UCASE_UNFOLD_ROW_WIDTH];

   360     unfoldStringWidth=unfold[UCASE_UNFOLD_STRING_WIDTH];

   361     unfold+=unfoldRowWidth;

   363     if(length>unfoldStringWidth) {

   364         /* the string is too long to find any match */

   365         return FALSE;

   366     }

   368     /* do a binary search for the string */

   369     start=0;

   370     limit=unfoldRows;

   371     while(start<limit) {

   372         i=(start+limit)/2;

   373         const UChar *p=reinterpret_cast<const UChar *>(unfold+(i*unfoldRowWidth));

   374         result=strcmpMax(s, length, p, unfoldStringWidth);

   376         if(result==0) {

   377             /* found the string: add each code point, and its case closure */

   378             UChar32 c;

   380             for(i=unfoldStringWidth; i<unfoldRowWidth && p[i]!=0;) {

   381                 U16_NEXT_UNSAFE(p, i, c);

   382                 sa->add(sa->set, c);

   383                 ucase_addCaseClosure(csp, c, sa);

   384             }

   385             return TRUE;

   386         } else if(result<0) {

   387             limit=i;

   388         } else /* result>0 */ {

   389             start=i+1;

   390         }

   391     }

   393     return FALSE; /* string not found */

   394 }

   396 U_NAMESPACE_BEGIN

   398 FullCaseFoldingIterator::FullCaseFoldingIterator()

   399         : unfold(reinterpret_cast<const UChar *>(ucase_props_singleton.unfold)),

   400           unfoldRows(unfold[UCASE_UNFOLD_ROWS]),

   401           unfoldRowWidth(unfold[UCASE_UNFOLD_ROW_WIDTH]),

   402           unfoldStringWidth(unfold[UCASE_UNFOLD_STRING_WIDTH]),

   403           currentRow(0),

   404           rowCpIndex(unfoldStringWidth) {

   405     unfold+=unfoldRowWidth;

   406 }

   408 UChar32

   409 FullCaseFoldingIterator::next(UnicodeString &full) {

   410     // Advance past the last-delivered code point.

   411     const UChar *p=unfold+(currentRow*unfoldRowWidth);

   412     if(rowCpIndex>=unfoldRowWidth || p[rowCpIndex]==0) {

   413         ++currentRow;

   414         p+=unfoldRowWidth;

   415         rowCpIndex=unfoldStringWidth;

   416     }

   417     if(currentRow>=unfoldRows) { return U_SENTINEL; }

   418     // Set "full" to the NUL-terminated string in the first unfold column.

   419     int32_t length=unfoldStringWidth;

   420     while(length>0 && p[length-1]==0) { --length; }

   421     full.setTo(FALSE, p, length);

   422     // Return the code point.

   423     UChar32 c;

   424     U16_NEXT_UNSAFE(p, rowCpIndex, c);

   425     return c;

   426 }

   428 U_NAMESPACE_END

   430 /** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */

   431 U_CAPI int32_t U_EXPORT2

   432 ucase_getType(const UCaseProps *csp, UChar32 c) {

   433     uint16_t props=UTRIE2_GET16(&csp->trie, c);

   434     return UCASE_GET_TYPE(props);

   435 }

   437 /** @return same as ucase_getType() and set bit 2 if c is case-ignorable */

   438 U_CAPI int32_t U_EXPORT2

   439 ucase_getTypeOrIgnorable(const UCaseProps *csp, UChar32 c) {

   440     uint16_t props=UTRIE2_GET16(&csp->trie, c);

   441     return UCASE_GET_TYPE_AND_IGNORABLE(props);

   442 }

   444 /** @return UCASE_NO_DOT, UCASE_SOFT_DOTTED, UCASE_ABOVE, UCASE_OTHER_ACCENT */

   445 static inline int32_t

   446 getDotType(const UCaseProps *csp, UChar32 c) {

   447     uint16_t props=UTRIE2_GET16(&csp->trie, c);

   448     if(!PROPS_HAS_EXCEPTION(props)) {

   449         return props&UCASE_DOT_MASK;

   450     } else {

   451         const uint16_t *pe=GET_EXCEPTIONS(csp, props);

   452         return (*pe>>UCASE_EXC_DOT_SHIFT)&UCASE_DOT_MASK;

   453     }

   454 }

   456 U_CAPI UBool U_EXPORT2

   457 ucase_isSoftDotted(const UCaseProps *csp, UChar32 c) {

   458     return (UBool)(getDotType(csp, c)==UCASE_SOFT_DOTTED);

   459 }

   461 U_CAPI UBool U_EXPORT2

   462 ucase_isCaseSensitive(const UCaseProps *csp, UChar32 c) {

   463     uint16_t props=UTRIE2_GET16(&csp->trie, c);

   464     return (UBool)((props&UCASE_SENSITIVE)!=0);

   465 }

   467 /* string casing ------------------------------------------------------------ */

   469 /*

   470  * These internal functions form the core of string case mappings.

   471  * They map single code points to result code points or strings and take

   472  * all necessary conditions (context, locale ID, options) into account.

   473  *

   474  * They do not iterate over the source or write to the destination

   475  * so that the same functions are useful for non-standard string storage,

   476  * such as in a Replaceable (for Transliterator) or UTF-8/32 strings etc.

   477  * For the same reason, the "surrounding text" context is passed in as a

   478  * UCaseContextIterator which does not make any assumptions about

   479  * the underlying storage.

   480  *

   481  * This section contains helper functions that check for conditions

   482  * in the input text surrounding the current code point

   483  * according to SpecialCasing.txt.

   484  *

   485  * Each helper function gets the index

   486  * - after the current code point if it looks at following text

   487  * - before the current code point if it looks at preceding text

   488  *

   489  * Unicode 3.2 UAX 21 "Case Mappings" defines the conditions as follows:

   490  *

   491  * Final_Sigma

   492  *   C is preceded by a sequence consisting of

   493  *     a cased letter and a case-ignorable sequence,

   494  *   and C is not followed by a sequence consisting of

   495  *     an ignorable sequence and then a cased letter.

   496  *

   497  * More_Above

   498  *   C is followed by one or more characters of combining class 230 (ABOVE)

   499  *   in the combining character sequence.

   500  *

   501  * After_Soft_Dotted

   502  *   The last preceding character with combining class of zero before C

   503  *   was Soft_Dotted,

   504  *   and there is no intervening combining character class 230 (ABOVE).

   505  *

   506  * Before_Dot

   507  *   C is followed by combining dot above (U+0307).

   508  *   Any sequence of characters with a combining class that is neither 0 nor 230

   509  *   may intervene between the current character and the combining dot above.

   510  *

   511  * The erratum from 2002-10-31 adds the condition

   512  *

   513  * After_I

   514  *   The last preceding base character was an uppercase I, and there is no

   515  *   intervening combining character class 230 (ABOVE).

   516  *

   517  *   (See Jitterbug 2344 and the comments on After_I below.)

   518  *

   519  * Helper definitions in Unicode 3.2 UAX 21:

   520  *

   521  * D1. A character C is defined to be cased

   522  *     if it meets any of the following criteria:

   523  *

   524  *   - The general category of C is Titlecase Letter (Lt)

   525  *   - In [CoreProps], C has one of the properties Uppercase, or Lowercase

   526  *   - Given D = NFD(C), then it is not the case that:

   527  *     D = UCD_lower(D) = UCD_upper(D) = UCD_title(D)

   528  *     (This third criterion does not add any characters to the list

   529  *      for Unicode 3.2. Ignored.)

   530  *

   531  * D2. A character C is defined to be case-ignorable

   532  *     if it meets either of the following criteria:

   533  *

   534  *   - The general category of C is

   535  *     Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or

   536  *     Letter Modifier (Lm), or Symbol Modifier (Sk)

   537  *   - C is one of the following characters

   538  *     U+0027 APOSTROPHE

   539  *     U+00AD SOFT HYPHEN (SHY)

   540  *     U+2019 RIGHT SINGLE QUOTATION MARK

   541  *            (the preferred character for apostrophe)

   542  *

   543  * D3. A case-ignorable sequence is a sequence of

   544  *     zero or more case-ignorable characters.

   545  */

   547 #define is_a(c) ((c)=='a' || (c)=='A')

   548 #define is_d(c) ((c)=='d' || (c)=='D')

   549 #define is_e(c) ((c)=='e' || (c)=='E')

   550 #define is_i(c) ((c)=='i' || (c)=='I')

   551 #define is_l(c) ((c)=='l' || (c)=='L')

   552 #define is_n(c) ((c)=='n' || (c)=='N')

   553 #define is_r(c) ((c)=='r' || (c)=='R')

   554 #define is_t(c) ((c)=='t' || (c)=='T')

   555 #define is_u(c) ((c)=='u' || (c)=='U')

   556 #define is_z(c) ((c)=='z' || (c)=='Z')

   558 /* separator? */

   559 #define is_sep(c) ((c)=='_' || (c)=='-' || (c)==0)

   561 /**

   562  * Requires non-NULL locale ID but otherwise does the equivalent of

   563  * checking for language codes as if uloc_getLanguage() were called:

   564  * Accepts both 2- and 3-letter codes and accepts case variants.

   565  */

   566 U_CFUNC int32_t

   567 ucase_getCaseLocale(const char *locale, int32_t *locCache) {

   568     int32_t result;

   569     char c;

   571     if(locCache!=NULL && (result=*locCache)!=UCASE_LOC_UNKNOWN) {

   572         return result;

   573     }

   575     result=UCASE_LOC_ROOT;

   577     /*

   578      * This function used to use uloc_getLanguage(), but the current code

   579      * removes the dependency of this low-level code on uloc implementation code

   580      * and is faster because not the whole locale ID has to be

   581      * examined and copied/transformed.

   582      *

   583      * Because this code does not want to depend on uloc, the caller must

   584      * pass in a non-NULL locale, i.e., may need to call uloc_getDefault().

   585      */

   586     c=*locale++;

   587     if(is_t(c)) {

   588         /* tr or tur? */

   589         c=*locale++;

   590         if(is_u(c)) {

   591             c=*locale++;

   592         }

   593         if(is_r(c)) {

   594             c=*locale;

   595             if(is_sep(c)) {

   596                 result=UCASE_LOC_TURKISH;

   597             }

   598         }

   599     } else if(is_a(c)) {

   600         /* az or aze? */

   601         c=*locale++;

   602         if(is_z(c)) {

   603             c=*locale++;

   604             if(is_e(c)) {

   605                 c=*locale;

   606             }

   607             if(is_sep(c)) {

   608                 result=UCASE_LOC_TURKISH;

   609             }

   610         }

   611     } else if(is_l(c)) {

   612         /* lt or lit? */

   613         c=*locale++;

   614         if(is_i(c)) {

   615             c=*locale++;

   616         }

   617         if(is_t(c)) {

   618             c=*locale;

   619             if(is_sep(c)) {

   620                 result=UCASE_LOC_LITHUANIAN;

   621             }

   622         }

   623     } else if(is_n(c)) {

   624         /* nl or nld? */

   625         c=*locale++;

   626         if(is_l(c)) {

   627             c=*locale++;

   628             if(is_d(c)) {

   629                 c=*locale;

   630             }

   631             if(is_sep(c)) {

   632                 result=UCASE_LOC_DUTCH;

   633             }

   634         }

   635     }

   637     if(locCache!=NULL) {

   638         *locCache=result;

   639     }

   640     return result;

   641 }

   643 /*

   644  * Is followed by

   645  *   {case-ignorable}* cased

   646  * ?

   647  * (dir determines looking forward/backward)

   648  * If a character is case-ignorable, it is skipped regardless of whether

   649  * it is also cased or not.

   650  */

   651 static UBool

   652 isFollowedByCasedLetter(const UCaseProps *csp, UCaseContextIterator *iter, void *context, int8_t dir) {

   653     UChar32 c;

   655     if(iter==NULL) {

   656         return FALSE;

   657     }

   659     for(/* dir!=0 sets direction */; (c=iter(context, dir))>=0; dir=0) {

   660         int32_t type=ucase_getTypeOrIgnorable(csp, c);

   661         if(type&4) {

   662             /* case-ignorable, continue with the loop */

   663         } else if(type!=UCASE_NONE) {

   664             return TRUE; /* followed by cased letter */

   665         } else {

   666             return FALSE; /* uncased and not case-ignorable */

   667         }

   668     }

   670     return FALSE; /* not followed by cased letter */

   671 }

   673 /* Is preceded by Soft_Dotted character with no intervening cc=230 ? */

   674 static UBool

   675 isPrecededBySoftDotted(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {

   676     UChar32 c;

   677     int32_t dotType;

   678     int8_t dir;

   680     if(iter==NULL) {

   681         return FALSE;

   682     }

   684     for(dir=-1; (c=iter(context, dir))>=0; dir=0) {

   685         dotType=getDotType(csp, c);

   686         if(dotType==UCASE_SOFT_DOTTED) {

   687             return TRUE; /* preceded by TYPE_i */

   688         } else if(dotType!=UCASE_OTHER_ACCENT) {

   689             return FALSE; /* preceded by different base character (not TYPE_i), or intervening cc==230 */

   690         }

   691     }

   693     return FALSE; /* not preceded by TYPE_i */

   694 }

   696 /*

   697  * See Jitterbug 2344:

   698  * The condition After_I for Turkic-lowercasing of U+0307 combining dot above

   699  * is checked in ICU 2.0, 2.1, 2.6 but was not in 2.2 & 2.4 because

   700  * we made those releases compatible with Unicode 3.2 which had not fixed

   701  * a related bug in SpecialCasing.txt.

   702  *

   703  * From the Jitterbug 2344 text:

   704  * ... this bug is listed as a Unicode erratum

   705  * from 2002-10-31 at http://www.unicode.org/uni2errata/UnicodeErrata.html

   706  * <quote>

   707  * There are two errors in SpecialCasing.txt.

   708  * 1. Missing semicolons on two lines. ... [irrelevant for ICU]

   709  * 2. An incorrect context definition. Correct as follows:

   710  * < 0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE

   711  * < 0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE

   712  * ---

   713  * > 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE

   714  * > 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE

   715  * where the context After_I is defined as:

   716  * The last preceding base character was an uppercase I, and there is no

   717  * intervening combining character class 230 (ABOVE).

   718  * </quote>

   719  *

   720  * Note that SpecialCasing.txt even in Unicode 3.2 described the condition as:

   721  *

   722  * # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.

   723  * # This matches the behavior of the canonically equivalent I-dot_above

   724  *

   725  * See also the description in this place in older versions of uchar.c (revision 1.100).

   726  *

   727  * Markus W. Scherer 2003-feb-15

   728  */

   730 /* Is preceded by base character 'I' with no intervening cc=230 ? */

   731 static UBool

   732 isPrecededBy_I(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {

   733     UChar32 c;

   734     int32_t dotType;

   735     int8_t dir;

   737     if(iter==NULL) {

   738         return FALSE;

   739     }

   741     for(dir=-1; (c=iter(context, dir))>=0; dir=0) {

   742         if(c==0x49) {

   743             return TRUE; /* preceded by I */

   744         }

   745         dotType=getDotType(csp, c);

   746         if(dotType!=UCASE_OTHER_ACCENT) {

   747             return FALSE; /* preceded by different base character (not I), or intervening cc==230 */

   748         }

   749     }

   751     return FALSE; /* not preceded by I */

   752 }

   754 /* Is followed by one or more cc==230 ? */

   755 static UBool

   756 isFollowedByMoreAbove(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {

   757     UChar32 c;

   758     int32_t dotType;

   759     int8_t dir;

   761     if(iter==NULL) {

   762         return FALSE;

   763     }

   765     for(dir=1; (c=iter(context, dir))>=0; dir=0) {

   766         dotType=getDotType(csp, c);

   767         if(dotType==UCASE_ABOVE) {

   768             return TRUE; /* at least one cc==230 following */

   769         } else if(dotType!=UCASE_OTHER_ACCENT) {

   770             return FALSE; /* next base character, no more cc==230 following */

   771         }

   772     }

   774     return FALSE; /* no more cc==230 following */

   775 }

   777 /* Is followed by a dot above (without cc==230 in between) ? */

   778 static UBool

   779 isFollowedByDotAbove(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {

   780     UChar32 c;

   781     int32_t dotType;

   782     int8_t dir;

   784     if(iter==NULL) {

   785         return FALSE;

   786     }

   788     for(dir=1; (c=iter(context, dir))>=0; dir=0) {

   789         if(c==0x307) {

   790             return TRUE;

   791         }

   792         dotType=getDotType(csp, c);

   793         if(dotType!=UCASE_OTHER_ACCENT) {

   794             return FALSE; /* next base character or cc==230 in between */

   795         }

   796     }

   798     return FALSE; /* no dot above following */

   799 }

   801 U_CAPI int32_t U_EXPORT2

   802 ucase_toFullLower(const UCaseProps *csp, UChar32 c,

   803                   UCaseContextIterator *iter, void *context,

   804                   const UChar **pString,

   805                   const char *locale, int32_t *locCache)

   806 {

   807     UChar32 result=c;

   808     uint16_t props=UTRIE2_GET16(&csp->trie, c);

   809     if(!PROPS_HAS_EXCEPTION(props)) {

   810         if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {

   811             result=c+UCASE_GET_DELTA(props);

   812         }

   813     } else {

   814         const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2;

   815         uint16_t excWord=*pe++;

   816         int32_t full;

   818         pe2=pe;

   820         if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {

   821             /* use hardcoded conditions and mappings */

   822             int32_t loc=ucase_getCaseLocale(locale, locCache);

   824             /*

   825              * Test for conditional mappings first

   826              *   (otherwise the unconditional default mappings are always taken),

   827              * then test for characters that have unconditional mappings in SpecialCasing.txt,

   828              * then get the UnicodeData.txt mappings.

   829              */

   830             if( loc==UCASE_LOC_LITHUANIAN &&

   831                     /* base characters, find accents above */

   832                     (((c==0x49 || c==0x4a || c==0x12e) &&

   833                         isFollowedByMoreAbove(csp, iter, context)) ||

   834                     /* precomposed with accent above, no need to find one */

   835                     (c==0xcc || c==0xcd || c==0x128))

   836             ) {

   837                 /*

   838                     # Lithuanian

   840                     # Lithuanian retains the dot in a lowercase i when followed by accents.

   842                     # Introduce an explicit dot above when lowercasing capital I's and J's

   843                     # whenever there are more accents above.

   844                     # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)

   846                     0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I

   847                     004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J

   848                     012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK

   849                     00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE

   850                     00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE

   851                     0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE

   852                  */

   853                 switch(c) {

   854                 case 0x49:  /* LATIN CAPITAL LETTER I */

   855                     *pString=iDot;

   856                     return 2;

   857                 case 0x4a:  /* LATIN CAPITAL LETTER J */

   858                     *pString=jDot;

   859                     return 2;

   860                 case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */

   861                     *pString=iOgonekDot;

   862                     return 2;

   863                 case 0xcc:  /* LATIN CAPITAL LETTER I WITH GRAVE */

   864                     *pString=iDotGrave;

   865                     return 3;

   866                 case 0xcd:  /* LATIN CAPITAL LETTER I WITH ACUTE */

   867                     *pString=iDotAcute;

   868                     return 3;

   869                 case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */

   870                     *pString=iDotTilde;

   871                     return 3;

   872                 default:

   873                     return 0; /* will not occur */

   874                 }

   875             /* # Turkish and Azeri */

   876             } else if(loc==UCASE_LOC_TURKISH && c==0x130) {

   877                 /*

   878                     # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri

   879                     # The following rules handle those cases.

   881                     0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE

   882                     0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE

   883                  */

   884                 return 0x69;

   885             } else if(loc==UCASE_LOC_TURKISH && c==0x307 && isPrecededBy_I(csp, iter, context)) {

   886                 /*

   887                     # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.

   888                     # This matches the behavior of the canonically equivalent I-dot_above

   890                     0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE

   891                     0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE

   892                  */

   893                 return 0; /* remove the dot (continue without output) */

   894             } else if(loc==UCASE_LOC_TURKISH && c==0x49 && !isFollowedByDotAbove(csp, iter, context)) {

   895                 /*

   896                     # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.

   898                     0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I

   899                     0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I

   900                  */

   901                 return 0x131;

   902             } else if(c==0x130) {

   903                 /*

   904                     # Preserve canonical equivalence for I with dot. Turkic is handled below.

   906                     0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE

   907                  */

   908                 *pString=iDot;

   909                 return 2;

   910             } else if(  c==0x3a3 &&

   911                         !isFollowedByCasedLetter(csp, iter, context, 1) &&

   912                         isFollowedByCasedLetter(csp, iter, context, -1) /* -1=preceded */

   913             ) {

   914                 /* greek capital sigma maps depending on surrounding cased letters (see SpecialCasing.txt) */

   915                 /*

   916                     # Special case for final form of sigma

   918                     03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA

   919                  */

   920                 return 0x3c2; /* greek small final sigma */

   921             } else {

   922                 /* no known conditional special case mapping, use a normal mapping */

   923             }

   924         } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {

   925             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);

   926             full&=UCASE_FULL_LOWER;

   927             if(full!=0) {

   928                 /* set the output pointer to the lowercase mapping */

   929                 *pString=reinterpret_cast<const UChar *>(pe+1);

   931                 /* return the string length */

   932                 return full;

   933             }

   934         }

   936         if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {

   937             GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe2, result);

   938         }

   939     }

   941     return (result==c) ? ~result : result;

   942 }

   944 /* internal */

   945 static int32_t

   946 toUpperOrTitle(const UCaseProps *csp, UChar32 c,

   947                UCaseContextIterator *iter, void *context,

   948                const UChar **pString,

   949                const char *locale, int32_t *locCache,

   950                UBool upperNotTitle) {

   951     UChar32 result=c;

   952     uint16_t props=UTRIE2_GET16(&csp->trie, c);

   953     if(!PROPS_HAS_EXCEPTION(props)) {

   954         if(UCASE_GET_TYPE(props)==UCASE_LOWER) {

   955             result=c+UCASE_GET_DELTA(props);

   956         }

   957     } else {

   958         const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2;

   959         uint16_t excWord=*pe++;

   960         int32_t full, idx;

   962         pe2=pe;

   964         if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {

   965             /* use hardcoded conditions and mappings */

   966             int32_t loc=ucase_getCaseLocale(locale, locCache);

   968             if(loc==UCASE_LOC_TURKISH && c==0x69) {

   969                 /*

   970                     # Turkish and Azeri

   972                     # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri

   973                     # The following rules handle those cases.

   975                     # When uppercasing, i turns into a dotted capital I

   977                     0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I

   978                     0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I

   979                 */

   980                 return 0x130;

   981             } else if(loc==UCASE_LOC_LITHUANIAN && c==0x307 && isPrecededBySoftDotted(csp, iter, context)) {

   982                 /*

   983                     # Lithuanian

   985                     # Lithuanian retains the dot in a lowercase i when followed by accents.

   987                     # Remove DOT ABOVE after "i" with upper or titlecase

   989                     0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE

   990                  */

   991                 return 0; /* remove the dot (continue without output) */

   992             } else {

   993                 /* no known conditional special case mapping, use a normal mapping */

   994             }

   995         } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {

   996             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);

   998             /* start of full case mapping strings */

   999             ++pe;

  1001             /* skip the lowercase and case-folding result strings */

  1002             pe+=full&UCASE_FULL_LOWER;

  1003             full>>=4;

  1004             pe+=full&0xf;

  1005             full>>=4;

  1007             if(upperNotTitle) {

  1008                 full&=0xf;

  1009             } else {

  1010                 /* skip the uppercase result string */

  1011                 pe+=full&0xf;

  1012                 full=(full>>4)&0xf;

  1013             }

  1015             if(full!=0) {

  1016                 /* set the output pointer to the result string */

  1017                 *pString=reinterpret_cast<const UChar *>(pe);

  1019                 /* return the string length */

  1020                 return full;

  1021             }

  1022         }

  1024         if(!upperNotTitle && HAS_SLOT(excWord, UCASE_EXC_TITLE)) {

  1025             idx=UCASE_EXC_TITLE;

  1026         } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {

  1027             /* here, titlecase is same as uppercase */

  1028             idx=UCASE_EXC_UPPER;

  1029         } else {

  1030             return ~c;

  1031         }

  1032         GET_SLOT_VALUE(excWord, idx, pe2, result);

  1033     }

  1035     return (result==c) ? ~result : result;

  1036 }

  1038 U_CAPI int32_t U_EXPORT2

  1039 ucase_toFullUpper(const UCaseProps *csp, UChar32 c,

  1040                   UCaseContextIterator *iter, void *context,

  1041                   const UChar **pString,

  1042                   const char *locale, int32_t *locCache) {

  1043     return toUpperOrTitle(csp, c, iter, context, pString, locale, locCache, TRUE);

  1044 }

  1046 U_CAPI int32_t U_EXPORT2

  1047 ucase_toFullTitle(const UCaseProps *csp, UChar32 c,

  1048                   UCaseContextIterator *iter, void *context,

  1049                   const UChar **pString,

  1050                   const char *locale, int32_t *locCache) {

  1051     return toUpperOrTitle(csp, c, iter, context, pString, locale, locCache, FALSE);

  1052 }

  1054 /* case folding ------------------------------------------------------------- */

  1056 /*

  1057  * Case folding is similar to lowercasing.

  1058  * The result may be a simple mapping, i.e., a single code point, or

  1059  * a full mapping, i.e., a string.

  1060  * If the case folding for a code point is the same as its simple (1:1) lowercase mapping,

  1061  * then only the lowercase mapping is stored.

  1062  *

  1063  * Some special cases are hardcoded because their conditions cannot be

  1064  * parsed and processed from CaseFolding.txt.

  1065  *

  1066  * Unicode 3.2 CaseFolding.txt specifies for its status field:

  1068 # C: common case folding, common mappings shared by both simple and full mappings.

  1069 # F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces.

  1070 # S: simple case folding, mappings to single characters where different from F.

  1071 # T: special case for uppercase I and dotted uppercase I

  1072 #    - For non-Turkic languages, this mapping is normally not used.

  1073 #    - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters.

  1074 #

  1075 # Usage:

  1076 #  A. To do a simple case folding, use the mappings with status C + S.

  1077 #  B. To do a full case folding, use the mappings with status C + F.

  1078 #

  1079 #    The mappings with status T can be used or omitted depending on the desired case-folding

  1080 #    behavior. (The default option is to exclude them.)

  1082  * Unicode 3.2 has 'T' mappings as follows:

  1084 0049; T; 0131; # LATIN CAPITAL LETTER I

  1085 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE

  1087  * while the default mappings for these code points are:

  1089 0049; C; 0069; # LATIN CAPITAL LETTER I

  1090 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE

  1092  * U+0130 has no simple case folding (simple-case-folds to itself).

  1093  */

  1095 /* return the simple case folding mapping for c */

  1096 U_CAPI UChar32 U_EXPORT2

  1097 ucase_fold(const UCaseProps *csp, UChar32 c, uint32_t options) {

  1098     uint16_t props=UTRIE2_GET16(&csp->trie, c);

  1099     if(!PROPS_HAS_EXCEPTION(props)) {

  1100         if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {

  1101             c+=UCASE_GET_DELTA(props);

  1102         }

  1103     } else {

  1104         const uint16_t *pe=GET_EXCEPTIONS(csp, props);

  1105         uint16_t excWord=*pe++;

  1106         int32_t idx;

  1107         if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {

  1108             /* special case folding mappings, hardcoded */

  1109             if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {

  1110                 /* default mappings */

  1111                 if(c==0x49) {

  1112                     /* 0049; C; 0069; # LATIN CAPITAL LETTER I */

  1113                     return 0x69;

  1114                 } else if(c==0x130) {

  1115                     /* no simple case folding for U+0130 */

  1116                     return c;

  1117                 }

  1118             } else {

  1119                 /* Turkic mappings */

  1120                 if(c==0x49) {

  1121                     /* 0049; T; 0131; # LATIN CAPITAL LETTER I */

  1122                     return 0x131;

  1123                 } else if(c==0x130) {

  1124                     /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */

  1125                     return 0x69;

  1126                 }

  1127             }

  1128         }

  1129         if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {

  1130             idx=UCASE_EXC_FOLD;

  1131         } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {

  1132             idx=UCASE_EXC_LOWER;

  1133         } else {

  1134             return c;

  1135         }

  1136         GET_SLOT_VALUE(excWord, idx, pe, c);

  1137     }

  1138     return c;

  1139 }

  1141 /*

  1142  * Issue for canonical caseless match (UAX #21):

  1143  * Turkic casefolding (using "T" mappings in CaseFolding.txt) does not preserve

  1144  * canonical equivalence, unlike default-option casefolding.

  1145  * For example, I-grave and I + grave fold to strings that are not canonically

  1146  * equivalent.

  1147  * For more details, see the comment in unorm_compare() in unorm.cpp

  1148  * and the intermediate prototype changes for Jitterbug 2021.

  1149  * (For example, revision 1.104 of uchar.c and 1.4 of CaseFolding.txt.)

  1150  *

  1151  * This did not get fixed because it appears that it is not possible to fix

  1152  * it for uppercase and lowercase characters (I-grave vs. i-grave)

  1153  * together in a way that they still fold to common result strings.

  1154  */

  1156 U_CAPI int32_t U_EXPORT2

  1157 ucase_toFullFolding(const UCaseProps *csp, UChar32 c,

  1158                     const UChar **pString,

  1159                     uint32_t options)

  1160 {

  1161     UChar32 result=c;

  1162     uint16_t props=UTRIE2_GET16(&csp->trie, c);

  1163     if(!PROPS_HAS_EXCEPTION(props)) {

  1164         if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {

  1165             result=c+UCASE_GET_DELTA(props);

  1166         }

  1167     } else {

  1168         const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2;

  1169         uint16_t excWord=*pe++;

  1170         int32_t full, idx;

  1172         pe2=pe;

  1174         if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {

  1175             /* use hardcoded conditions and mappings */

  1176             if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {

  1177                 /* default mappings */

  1178                 if(c==0x49) {

  1179                     /* 0049; C; 0069; # LATIN CAPITAL LETTER I */

  1180                     return 0x69;

  1181                 } else if(c==0x130) {

  1182                     /* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */

  1183                     *pString=iDot;

  1184                     return 2;

  1185                 }

  1186             } else {

  1187                 /* Turkic mappings */

  1188                 if(c==0x49) {

  1189                     /* 0049; T; 0131; # LATIN CAPITAL LETTER I */

  1190                     return 0x131;

  1191                 } else if(c==0x130) {

  1192                     /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */

  1193                     return 0x69;

  1194                 }

  1195             }

  1196         } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {

  1197             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);

  1199             /* start of full case mapping strings */

  1200             ++pe;

  1202             /* skip the lowercase result string */

  1203             pe+=full&UCASE_FULL_LOWER;

  1204             full=(full>>4)&0xf;

  1206             if(full!=0) {

  1207                 /* set the output pointer to the result string */

  1208                 *pString=reinterpret_cast<const UChar *>(pe);

  1210                 /* return the string length */

  1211                 return full;

  1212             }

  1213         }

  1215         if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {

  1216             idx=UCASE_EXC_FOLD;

  1217         } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {

  1218             idx=UCASE_EXC_LOWER;

  1219         } else {

  1220             return ~c;

  1221         }

  1222         GET_SLOT_VALUE(excWord, idx, pe2, result);

  1223     }

  1225     return (result==c) ? ~result : result;

  1226 }

  1228 /* case mapping properties API ---------------------------------------------- */

  1230 #define GET_CASE_PROPS() &ucase_props_singleton

  1232 /* public API (see uchar.h) */

  1234 U_CAPI UBool U_EXPORT2

  1235 u_isULowercase(UChar32 c) {

  1236     return (UBool)(UCASE_LOWER==ucase_getType(GET_CASE_PROPS(), c));

  1237 }

  1239 U_CAPI UBool U_EXPORT2

  1240 u_isUUppercase(UChar32 c) {

  1241     return (UBool)(UCASE_UPPER==ucase_getType(GET_CASE_PROPS(), c));

  1242 }

  1244 /* Transforms the Unicode character to its lower case equivalent.*/

  1245 U_CAPI UChar32 U_EXPORT2

  1246 u_tolower(UChar32 c) {

  1247     return ucase_tolower(GET_CASE_PROPS(), c);

  1248 }

  1250 /* Transforms the Unicode character to its upper case equivalent.*/

  1251 U_CAPI UChar32 U_EXPORT2

  1252 u_toupper(UChar32 c) {

  1253     return ucase_toupper(GET_CASE_PROPS(), c);

  1254 }

  1256 /* Transforms the Unicode character to its title case equivalent.*/

  1257 U_CAPI UChar32 U_EXPORT2

  1258 u_totitle(UChar32 c) {

  1259     return ucase_totitle(GET_CASE_PROPS(), c);

  1260 }

  1262 /* return the simple case folding mapping for c */

  1263 U_CAPI UChar32 U_EXPORT2

  1264 u_foldCase(UChar32 c, uint32_t options) {

  1265     return ucase_fold(GET_CASE_PROPS(), c, options);

  1266 }

  1268 U_CFUNC int32_t U_EXPORT2

  1269 ucase_hasBinaryProperty(UChar32 c, UProperty which) {

  1270     /* case mapping properties */

  1271     const UChar *resultString;

  1272     int32_t locCache;

  1273     const UCaseProps *csp=GET_CASE_PROPS();

  1274     if(csp==NULL) {

  1275         return FALSE;

  1276     }

  1277     switch(which) {

  1278     case UCHAR_LOWERCASE:

  1279         return (UBool)(UCASE_LOWER==ucase_getType(csp, c));

  1280     case UCHAR_UPPERCASE:

  1281         return (UBool)(UCASE_UPPER==ucase_getType(csp, c));

  1282     case UCHAR_SOFT_DOTTED:

  1283         return ucase_isSoftDotted(csp, c);

  1284     case UCHAR_CASE_SENSITIVE:

  1285         return ucase_isCaseSensitive(csp, c);

  1286     case UCHAR_CASED:

  1287         return (UBool)(UCASE_NONE!=ucase_getType(csp, c));

  1288     case UCHAR_CASE_IGNORABLE:

  1289         return (UBool)(ucase_getTypeOrIgnorable(csp, c)>>2);

  1290     /*

  1291      * Note: The following Changes_When_Xyz are defined as testing whether

  1292      * the NFD form of the input changes when Xyz-case-mapped.

  1293      * However, this simpler implementation of these properties,

  1294      * ignoring NFD, passes the tests.

  1295      * The implementation needs to be changed if the tests start failing.

  1296      * When that happens, optimizations should be used to work with the

  1297      * per-single-code point ucase_toFullXyz() functions unless

  1298      * the NFD form has more than one code point,

  1299      * and the property starts set needs to be the union of the

  1300      * start sets for normalization and case mappings.

  1301      */

  1302     case UCHAR_CHANGES_WHEN_LOWERCASED:

  1303         locCache=UCASE_LOC_ROOT;

  1304         return (UBool)(ucase_toFullLower(csp, c, NULL, NULL, &resultString, "", &locCache)>=0);

  1305     case UCHAR_CHANGES_WHEN_UPPERCASED:

  1306         locCache=UCASE_LOC_ROOT;

  1307         return (UBool)(ucase_toFullUpper(csp, c, NULL, NULL, &resultString, "", &locCache)>=0);

  1308     case UCHAR_CHANGES_WHEN_TITLECASED:

  1309         locCache=UCASE_LOC_ROOT;

  1310         return (UBool)(ucase_toFullTitle(csp, c, NULL, NULL, &resultString, "", &locCache)>=0);

  1311     /* case UCHAR_CHANGES_WHEN_CASEFOLDED: -- in uprops.c */

  1312     case UCHAR_CHANGES_WHEN_CASEMAPPED:

  1313         locCache=UCASE_LOC_ROOT;

  1314         return (UBool)(

  1315             ucase_toFullLower(csp, c, NULL, NULL, &resultString, "", &locCache)>=0 ||

  1316             ucase_toFullUpper(csp, c, NULL, NULL, &resultString, "", &locCache)>=0 ||

  1317             ucase_toFullTitle(csp, c, NULL, NULL, &resultString, "", &locCache)>=0);

  1318     default:

  1319         return FALSE;

  1320     }

  1321 }

The Tor Browser / file revision

intl/icu/source/common/ucase.cpp@b8a032363ba2

intl/icu/source/common/ucase.cpp