intl/icu/source/common/ucase.cpp

Sat, 03 Jan 2015 20:18:00 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Sat, 03 Jan 2015 20:18:00 +0100
branch
TOR_BUG_3246
changeset 7
129ffea94266
permissions
-rw-r--r--

Conditionally enable double-key logic depending on
private browsing mode or the privacy.thirdparty.isolate preference, and
implement it in GetCookieStringCommon and FindCookie, where it counts.
Open question: how to convince FindCookie callers to test the
condition and pass a nullptr when double-key logic is disabled.

michael@0 1 /*
michael@0 2 *******************************************************************************
michael@0 3 *
michael@0 4 * Copyright (C) 2004-2012, International Business Machines
michael@0 5 * Corporation and others. All Rights Reserved.
michael@0 6 *
michael@0 7 *******************************************************************************
michael@0 8 * file name: ucase.cpp
michael@0 9 * encoding: US-ASCII
michael@0 10 * tab size: 8 (not used)
michael@0 11 * indentation:4
michael@0 12 *
michael@0 13 * created on: 2004aug30
michael@0 14 * created by: Markus W. Scherer
michael@0 15 *
michael@0 16 * Low-level Unicode character/string case mapping code.
michael@0 17 * Much code moved here (and modified) from uchar.c.
michael@0 18 */
michael@0 19
michael@0 20 #include "unicode/utypes.h"
michael@0 21 #include "unicode/unistr.h"
michael@0 22 #include "unicode/uset.h"
michael@0 23 #include "unicode/udata.h" /* UDataInfo */
michael@0 24 #include "unicode/utf16.h"
michael@0 25 #include "ucmndata.h" /* DataHeader */
michael@0 26 #include "udatamem.h"
michael@0 27 #include "umutex.h"
michael@0 28 #include "uassert.h"
michael@0 29 #include "cmemory.h"
michael@0 30 #include "utrie2.h"
michael@0 31 #include "ucase.h"
michael@0 32 #include "ucln_cmn.h"
michael@0 33
/*
 * Case-properties data as used by the functions in this file.
 * A single instance, ucase_props_singleton, is returned by ucase_getSingleton().
 * NOTE(review): the singleton is initialized by the machine-generated
 * ucase_props_data.h, presumably positionally, so the member order must not change - confirm.
 */
michael@0 34 struct UCaseProps {
michael@0 35 UDataMemory *mem; /* not referenced by the code visible here */
michael@0 36 const int32_t *indexes; /* not referenced by the code visible here */
michael@0 37 const uint16_t *exceptions; /* base of the exception words; GET_EXCEPTIONS() indexes it with props>>UCASE_EXC_SHIFT */
michael@0 38 const uint16_t *unfold; /* reverse case folding table: a header row (row count, row width, string width) followed by the rows; ucase_addStringCaseClosure() accepts NULL here */
michael@0 39
michael@0 40 UTrie2 trie; /* maps a code point to its 16-bit properties word, read with UTRIE2_GET16() */
michael@0 41 uint8_t formatVersion[4]; /* not referenced by the code visible here */
michael@0 42 };
michael@0 43
michael@0 44 /* ucase_props_data.h is machine-generated by gencase --csource */
michael@0 45 #define INCLUDED_FROM_UCASE_CPP
michael@0 46 #include "ucase_props_data.h"
michael@0 47
michael@0 48 /* UCaseProps singleton ----------------------------------------------------- */
michael@0 49
michael@0 50 U_CAPI const UCaseProps * U_EXPORT2
michael@0 51 ucase_getSingleton() {
michael@0 52 return &ucase_props_singleton;
michael@0 53 }
michael@0 54
michael@0 55 /* set of property starts for UnicodeSet ------------------------------------ */
michael@0 56
michael@0 57 static UBool U_CALLCONV
michael@0 58 _enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) {
michael@0 59 /* add the start code point to the USet */
michael@0 60 const USetAdder *sa=(const USetAdder *)context;
michael@0 61 sa->add(sa->set, start);
michael@0 62 return TRUE;
michael@0 63 }
michael@0 64
michael@0 65 U_CFUNC void U_EXPORT2
michael@0 66 ucase_addPropertyStarts(const UCaseProps *csp, const USetAdder *sa, UErrorCode *pErrorCode) {
michael@0 67 if(U_FAILURE(*pErrorCode)) {
michael@0 68 return;
michael@0 69 }
michael@0 70
michael@0 71 /* add the start code point of each same-value range of the trie */
michael@0 72 utrie2_enum(&csp->trie, NULL, _enumPropertyStartsRange, sa);
michael@0 73
michael@0 74 /* add code points with hardcoded properties, plus the ones following them */
michael@0 75
michael@0 76 /* (none right now, see comment below) */
michael@0 77
michael@0 78 /*
michael@0 79 * Omit code points with hardcoded specialcasing properties
michael@0 80 * because we do not build property UnicodeSets for them right now.
michael@0 81 */
michael@0 82 }
michael@0 83
michael@0 84 /* data access primitives --------------------------------------------------- */
michael@0 85
/* Pointer to the first exception word for a properties word that has exception data. */
#define GET_EXCEPTIONS(csp, props) (&(csp)->exceptions[(props)>>UCASE_EXC_SHIFT])

/* Non-zero if the properties word refers to exception data. */
#define PROPS_HAS_EXCEPTION(props) (UCASE_EXCEPTION&(props))
michael@0 89
/*
 * flagsOffset[b] is the number of 1-bits in the 8-bit value b.
 * The table is generated two bits at a time: each helper emits the counts
 * for all values of its low bits, offset by the count n of the higher bits.
 */
#define UCASE_BITS_2(n) (n), (n)+1, (n)+1, (n)+2
#define UCASE_BITS_4(n) UCASE_BITS_2(n), UCASE_BITS_2((n)+1), UCASE_BITS_2((n)+1), UCASE_BITS_2((n)+2)
#define UCASE_BITS_6(n) UCASE_BITS_4(n), UCASE_BITS_4((n)+1), UCASE_BITS_4((n)+1), UCASE_BITS_4((n)+2)

static const uint8_t flagsOffset[256]={
    UCASE_BITS_6(0), UCASE_BITS_6(1), UCASE_BITS_6(1), UCASE_BITS_6(2)
};

#undef UCASE_BITS_6
#undef UCASE_BITS_4
#undef UCASE_BITS_2
michael@0 109
/* Non-zero if the exceptions word has the optional slot with index idx. */
#define HAS_SLOT(flags, idx) ((flags)&(1<<(idx)))

/* Number of optional slots that are present before slot idx. */
#define SLOT_OFFSET(flags, idx) flagsOffset[(flags)&((1<<(idx))-1)]

/*
 * Get the value of an optional-value slot where HAS_SLOT(excWord, idx).
 * Slots are one uint16_t wide, or two (high word first) if the exceptions
 * word has UCASE_EXC_DOUBLE_SLOTS set.
 *
 * Expands to a single statement, so it is safe in an unbraced if/else;
 * the invocation must be followed by a semicolon.
 *
 * @param excWord (in) initial exceptions word
 * @param idx (in) desired slot index
 * @param pExc16 (in/out) const uint16_t * after excWord=*pExc16++;
 *               moved to the last uint16_t of the value, use +1 for beginning of next slot
 * @param value (out) int32_t or uint32_t output, always assigned
 */
#define GET_SLOT_VALUE(excWord, idx, pExc16, value) \
    do { \
        if(((excWord)&UCASE_EXC_DOUBLE_SLOTS)==0) { \
            (pExc16)+=SLOT_OFFSET(excWord, idx); \
            (value)=*(pExc16); \
        } else { \
            (pExc16)+=2*SLOT_OFFSET(excWord, idx); \
            (value)=*(pExc16)++; \
            (value)=((value)<<16)|*(pExc16); \
        } \
    } while(0)
michael@0 131
michael@0 132 /* simple case mappings ----------------------------------------------------- */
michael@0 133
michael@0 134 U_CAPI UChar32 U_EXPORT2
michael@0 135 ucase_tolower(const UCaseProps *csp, UChar32 c) {
michael@0 136 uint16_t props=UTRIE2_GET16(&csp->trie, c);
michael@0 137 if(!PROPS_HAS_EXCEPTION(props)) {
michael@0 138 if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
michael@0 139 c+=UCASE_GET_DELTA(props);
michael@0 140 }
michael@0 141 } else {
michael@0 142 const uint16_t *pe=GET_EXCEPTIONS(csp, props);
michael@0 143 uint16_t excWord=*pe++;
michael@0 144 if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
michael@0 145 GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe, c);
michael@0 146 }
michael@0 147 }
michael@0 148 return c;
michael@0 149 }
michael@0 150
michael@0 151 U_CAPI UChar32 U_EXPORT2
michael@0 152 ucase_toupper(const UCaseProps *csp, UChar32 c) {
michael@0 153 uint16_t props=UTRIE2_GET16(&csp->trie, c);
michael@0 154 if(!PROPS_HAS_EXCEPTION(props)) {
michael@0 155 if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
michael@0 156 c+=UCASE_GET_DELTA(props);
michael@0 157 }
michael@0 158 } else {
michael@0 159 const uint16_t *pe=GET_EXCEPTIONS(csp, props);
michael@0 160 uint16_t excWord=*pe++;
michael@0 161 if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
michael@0 162 GET_SLOT_VALUE(excWord, UCASE_EXC_UPPER, pe, c);
michael@0 163 }
michael@0 164 }
michael@0 165 return c;
michael@0 166 }
michael@0 167
michael@0 168 U_CAPI UChar32 U_EXPORT2
michael@0 169 ucase_totitle(const UCaseProps *csp, UChar32 c) {
michael@0 170 uint16_t props=UTRIE2_GET16(&csp->trie, c);
michael@0 171 if(!PROPS_HAS_EXCEPTION(props)) {
michael@0 172 if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
michael@0 173 c+=UCASE_GET_DELTA(props);
michael@0 174 }
michael@0 175 } else {
michael@0 176 const uint16_t *pe=GET_EXCEPTIONS(csp, props);
michael@0 177 uint16_t excWord=*pe++;
michael@0 178 int32_t idx;
michael@0 179 if(HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
michael@0 180 idx=UCASE_EXC_TITLE;
michael@0 181 } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
michael@0 182 idx=UCASE_EXC_UPPER;
michael@0 183 } else {
michael@0 184 return c;
michael@0 185 }
michael@0 186 GET_SLOT_VALUE(excWord, idx, pe, c);
michael@0 187 }
michael@0 188 return c;
michael@0 189 }
michael@0 190
michael@0 191 static const UChar iDot[2] = { 0x69, 0x307 };
michael@0 192 static const UChar jDot[2] = { 0x6a, 0x307 };
michael@0 193 static const UChar iOgonekDot[3] = { 0x12f, 0x307 };
michael@0 194 static const UChar iDotGrave[3] = { 0x69, 0x307, 0x300 };
michael@0 195 static const UChar iDotAcute[3] = { 0x69, 0x307, 0x301 };
michael@0 196 static const UChar iDotTilde[3] = { 0x69, 0x307, 0x303 };
michael@0 197
michael@0 198
michael@0 199 U_CFUNC void U_EXPORT2
michael@0 200 ucase_addCaseClosure(const UCaseProps *csp, UChar32 c, const USetAdder *sa) {
michael@0 201 uint16_t props;
michael@0 202
michael@0 203 /*
michael@0 204 * Hardcode the case closure of i and its relatives and ignore the
michael@0 205 * data file data for these characters.
michael@0 206 * The Turkic dotless i and dotted I with their case mapping conditions
michael@0 207 * and case folding option make the related characters behave specially.
michael@0 208 * This code matches their closure behavior to their case folding behavior.
michael@0 209 */
michael@0 210
michael@0 211 switch(c) {
michael@0 212 case 0x49:
michael@0 213 /* regular i and I are in one equivalence class */
michael@0 214 sa->add(sa->set, 0x69);
michael@0 215 return;
michael@0 216 case 0x69:
michael@0 217 sa->add(sa->set, 0x49);
michael@0 218 return;
michael@0 219 case 0x130:
michael@0 220 /* dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>) */
michael@0 221 sa->addString(sa->set, iDot, 2);
michael@0 222 return;
michael@0 223 case 0x131:
michael@0 224 /* dotless i is in a class by itself */
michael@0 225 return;
michael@0 226 default:
michael@0 227 /* otherwise use the data file data */
michael@0 228 break;
michael@0 229 }
michael@0 230
michael@0 231 props=UTRIE2_GET16(&csp->trie, c);
michael@0 232 if(!PROPS_HAS_EXCEPTION(props)) {
michael@0 233 if(UCASE_GET_TYPE(props)!=UCASE_NONE) {
michael@0 234 /* add the one simple case mapping, no matter what type it is */
michael@0 235 int32_t delta=UCASE_GET_DELTA(props);
michael@0 236 if(delta!=0) {
michael@0 237 sa->add(sa->set, c+delta);
michael@0 238 }
michael@0 239 }
michael@0 240 } else {
michael@0 241 /*
michael@0 242 * c has exceptions, so there may be multiple simple and/or
michael@0 243 * full case mappings. Add them all.
michael@0 244 */
michael@0 245 const uint16_t *pe0, *pe=GET_EXCEPTIONS(csp, props);
michael@0 246 const UChar *closure;
michael@0 247 uint16_t excWord=*pe++;
michael@0 248 int32_t idx, closureLength, fullLength, length;
michael@0 249
michael@0 250 pe0=pe;
michael@0 251
michael@0 252 /* add all simple case mappings */
michael@0 253 for(idx=UCASE_EXC_LOWER; idx<=UCASE_EXC_TITLE; ++idx) {
michael@0 254 if(HAS_SLOT(excWord, idx)) {
michael@0 255 pe=pe0;
michael@0 256 GET_SLOT_VALUE(excWord, idx, pe, c);
michael@0 257 sa->add(sa->set, c);
michael@0 258 }
michael@0 259 }
michael@0 260
michael@0 261 /* get the closure string pointer & length */
michael@0 262 if(HAS_SLOT(excWord, UCASE_EXC_CLOSURE)) {
michael@0 263 pe=pe0;
michael@0 264 GET_SLOT_VALUE(excWord, UCASE_EXC_CLOSURE, pe, closureLength);
michael@0 265 closureLength&=UCASE_CLOSURE_MAX_LENGTH; /* higher bits are reserved */
michael@0 266 closure=(const UChar *)pe+1; /* behind this slot, unless there are full case mappings */
michael@0 267 } else {
michael@0 268 closureLength=0;
michael@0 269 closure=NULL;
michael@0 270 }
michael@0 271
michael@0 272 /* add the full case folding */
michael@0 273 if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
michael@0 274 pe=pe0;
michael@0 275 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, fullLength);
michael@0 276
michael@0 277 /* start of full case mapping strings */
michael@0 278 ++pe;
michael@0 279
michael@0 280 fullLength&=0xffff; /* bits 16 and higher are reserved */
michael@0 281
michael@0 282 /* skip the lowercase result string */
michael@0 283 pe+=fullLength&UCASE_FULL_LOWER;
michael@0 284 fullLength>>=4;
michael@0 285
michael@0 286 /* add the full case folding string */
michael@0 287 length=fullLength&0xf;
michael@0 288 if(length!=0) {
michael@0 289 sa->addString(sa->set, (const UChar *)pe, length);
michael@0 290 pe+=length;
michael@0 291 }
michael@0 292
michael@0 293 /* skip the uppercase and titlecase strings */
michael@0 294 fullLength>>=4;
michael@0 295 pe+=fullLength&0xf;
michael@0 296 fullLength>>=4;
michael@0 297 pe+=fullLength;
michael@0 298
michael@0 299 closure=(const UChar *)pe; /* behind full case mappings */
michael@0 300 }
michael@0 301
michael@0 302 /* add each code point in the closure string */
michael@0 303 for(idx=0; idx<closureLength;) {
michael@0 304 U16_NEXT_UNSAFE(closure, idx, c);
michael@0 305 sa->add(sa->set, c);
michael@0 306 }
michael@0 307 }
michael@0 308 }
michael@0 309
michael@0 310 /*
michael@0 311 * compare s, which has a length, with t, which has a maximum length or is NUL-terminated
michael@0 312 * must be length>0 and max>0 and length<=max
michael@0 313 */
michael@0 314 static inline int32_t
michael@0 315 strcmpMax(const UChar *s, int32_t length, const UChar *t, int32_t max) {
michael@0 316 int32_t c1, c2;
michael@0 317
michael@0 318 max-=length; /* we require length<=max, so no need to decrement max in the loop */
michael@0 319 do {
michael@0 320 c1=*s++;
michael@0 321 c2=*t++;
michael@0 322 if(c2==0) {
michael@0 323 return 1; /* reached the end of t but not of s */
michael@0 324 }
michael@0 325 c1-=c2;
michael@0 326 if(c1!=0) {
michael@0 327 return c1; /* return difference result */
michael@0 328 }
michael@0 329 } while(--length>0);
michael@0 330 /* ends with length==0 */
michael@0 331
michael@0 332 if(max==0 || *t==0) {
michael@0 333 return 0; /* equal to length of both strings */
michael@0 334 } else {
michael@0 335 return -max; /* return lengh difference */
michael@0 336 }
michael@0 337 }
michael@0 338
michael@0 339 U_CFUNC UBool U_EXPORT2
michael@0 340 ucase_addStringCaseClosure(const UCaseProps *csp, const UChar *s, int32_t length, const USetAdder *sa) {
michael@0 341 int32_t i, start, limit, result, unfoldRows, unfoldRowWidth, unfoldStringWidth;
michael@0 342
michael@0 343 if(csp->unfold==NULL || s==NULL) {
michael@0 344 return FALSE; /* no reverse case folding data, or no string */
michael@0 345 }
michael@0 346 if(length<=1) {
michael@0 347 /* the string is too short to find any match */
michael@0 348 /*
michael@0 349 * more precise would be:
michael@0 350 * if(!u_strHasMoreChar32Than(s, length, 1))
michael@0 351 * but this does not make much practical difference because
michael@0 352 * a single supplementary code point would just not be found
michael@0 353 */
michael@0 354 return FALSE;
michael@0 355 }
michael@0 356
michael@0 357 const uint16_t *unfold=csp->unfold;
michael@0 358 unfoldRows=unfold[UCASE_UNFOLD_ROWS];
michael@0 359 unfoldRowWidth=unfold[UCASE_UNFOLD_ROW_WIDTH];
michael@0 360 unfoldStringWidth=unfold[UCASE_UNFOLD_STRING_WIDTH];
michael@0 361 unfold+=unfoldRowWidth;
michael@0 362
michael@0 363 if(length>unfoldStringWidth) {
michael@0 364 /* the string is too long to find any match */
michael@0 365 return FALSE;
michael@0 366 }
michael@0 367
michael@0 368 /* do a binary search for the string */
michael@0 369 start=0;
michael@0 370 limit=unfoldRows;
michael@0 371 while(start<limit) {
michael@0 372 i=(start+limit)/2;
michael@0 373 const UChar *p=reinterpret_cast<const UChar *>(unfold+(i*unfoldRowWidth));
michael@0 374 result=strcmpMax(s, length, p, unfoldStringWidth);
michael@0 375
michael@0 376 if(result==0) {
michael@0 377 /* found the string: add each code point, and its case closure */
michael@0 378 UChar32 c;
michael@0 379
michael@0 380 for(i=unfoldStringWidth; i<unfoldRowWidth && p[i]!=0;) {
michael@0 381 U16_NEXT_UNSAFE(p, i, c);
michael@0 382 sa->add(sa->set, c);
michael@0 383 ucase_addCaseClosure(csp, c, sa);
michael@0 384 }
michael@0 385 return TRUE;
michael@0 386 } else if(result<0) {
michael@0 387 limit=i;
michael@0 388 } else /* result>0 */ {
michael@0 389 start=i+1;
michael@0 390 }
michael@0 391 }
michael@0 392
michael@0 393 return FALSE; /* string not found */
michael@0 394 }
michael@0 395
michael@0 396 U_NAMESPACE_BEGIN
michael@0 397
// Sets up iteration over the unfold table of the built-in case properties.
// The table dimensions are read from the header row, the iterator starts at
// row 0, and rowCpIndex starts at the first position behind the string column.
// NOTE(review): the initializers read through `unfold`, so `unfold` has to be
// declared before the other members in the class definition - confirm in the header.
// NOTE(review): ucase_props_singleton.unfold is dereferenced without a NULL check.
michael@0 398 FullCaseFoldingIterator::FullCaseFoldingIterator()
michael@0 399 : unfold(reinterpret_cast<const UChar *>(ucase_props_singleton.unfold)),
michael@0 400 unfoldRows(unfold[UCASE_UNFOLD_ROWS]),
michael@0 401 unfoldRowWidth(unfold[UCASE_UNFOLD_ROW_WIDTH]),
michael@0 402 unfoldStringWidth(unfold[UCASE_UNFOLD_STRING_WIDTH]),
michael@0 403 currentRow(0),
michael@0 404 rowCpIndex(unfoldStringWidth) {
michael@0 405 unfold+=unfoldRowWidth; // skip the header row; unfold now points at row 0
michael@0 406 }
michael@0 407
michael@0 408 UChar32
michael@0 409 FullCaseFoldingIterator::next(UnicodeString &full) {
michael@0 410 // Advance past the last-delivered code point.
michael@0 411 const UChar *p=unfold+(currentRow*unfoldRowWidth);
michael@0 412 if(rowCpIndex>=unfoldRowWidth || p[rowCpIndex]==0) {
michael@0 413 ++currentRow;
michael@0 414 p+=unfoldRowWidth;
michael@0 415 rowCpIndex=unfoldStringWidth;
michael@0 416 }
michael@0 417 if(currentRow>=unfoldRows) { return U_SENTINEL; }
michael@0 418 // Set "full" to the NUL-terminated string in the first unfold column.
michael@0 419 int32_t length=unfoldStringWidth;
michael@0 420 while(length>0 && p[length-1]==0) { --length; }
michael@0 421 full.setTo(FALSE, p, length);
michael@0 422 // Return the code point.
michael@0 423 UChar32 c;
michael@0 424 U16_NEXT_UNSAFE(p, rowCpIndex, c);
michael@0 425 return c;
michael@0 426 }
michael@0 427
michael@0 428 U_NAMESPACE_END
michael@0 429
michael@0 430 /** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */
michael@0 431 U_CAPI int32_t U_EXPORT2
michael@0 432 ucase_getType(const UCaseProps *csp, UChar32 c) {
michael@0 433 uint16_t props=UTRIE2_GET16(&csp->trie, c);
michael@0 434 return UCASE_GET_TYPE(props);
michael@0 435 }
michael@0 436
michael@0 437 /** @return same as ucase_getType() and set bit 2 if c is case-ignorable */
michael@0 438 U_CAPI int32_t U_EXPORT2
michael@0 439 ucase_getTypeOrIgnorable(const UCaseProps *csp, UChar32 c) {
michael@0 440 uint16_t props=UTRIE2_GET16(&csp->trie, c);
michael@0 441 return UCASE_GET_TYPE_AND_IGNORABLE(props);
michael@0 442 }
michael@0 443
michael@0 444 /** @return UCASE_NO_DOT, UCASE_SOFT_DOTTED, UCASE_ABOVE, UCASE_OTHER_ACCENT */
michael@0 445 static inline int32_t
michael@0 446 getDotType(const UCaseProps *csp, UChar32 c) {
michael@0 447 uint16_t props=UTRIE2_GET16(&csp->trie, c);
michael@0 448 if(!PROPS_HAS_EXCEPTION(props)) {
michael@0 449 return props&UCASE_DOT_MASK;
michael@0 450 } else {
michael@0 451 const uint16_t *pe=GET_EXCEPTIONS(csp, props);
michael@0 452 return (*pe>>UCASE_EXC_DOT_SHIFT)&UCASE_DOT_MASK;
michael@0 453 }
michael@0 454 }
michael@0 455
michael@0 456 U_CAPI UBool U_EXPORT2
michael@0 457 ucase_isSoftDotted(const UCaseProps *csp, UChar32 c) {
michael@0 458 return (UBool)(getDotType(csp, c)==UCASE_SOFT_DOTTED);
michael@0 459 }
michael@0 460
michael@0 461 U_CAPI UBool U_EXPORT2
michael@0 462 ucase_isCaseSensitive(const UCaseProps *csp, UChar32 c) {
michael@0 463 uint16_t props=UTRIE2_GET16(&csp->trie, c);
michael@0 464 return (UBool)((props&UCASE_SENSITIVE)!=0);
michael@0 465 }
michael@0 466
michael@0 467 /* string casing ------------------------------------------------------------ */
michael@0 468
michael@0 469 /*
michael@0 470 * These internal functions form the core of string case mappings.
michael@0 471 * They map single code points to result code points or strings and take
michael@0 472 * all necessary conditions (context, locale ID, options) into account.
michael@0 473 *
michael@0 474 * They do not iterate over the source or write to the destination
michael@0 475 * so that the same functions are useful for non-standard string storage,
michael@0 476 * such as in a Replaceable (for Transliterator) or UTF-8/32 strings etc.
michael@0 477 * For the same reason, the "surrounding text" context is passed in as a
michael@0 478 * UCaseContextIterator which does not make any assumptions about
michael@0 479 * the underlying storage.
michael@0 480 *
michael@0 481 * This section contains helper functions that check for conditions
michael@0 482 * in the input text surrounding the current code point
michael@0 483 * according to SpecialCasing.txt.
michael@0 484 *
michael@0 485 * Each helper function gets the index
michael@0 486 * - after the current code point if it looks at following text
michael@0 487 * - before the current code point if it looks at preceding text
michael@0 488 *
michael@0 489 * Unicode 3.2 UAX 21 "Case Mappings" defines the conditions as follows:
michael@0 490 *
michael@0 491 * Final_Sigma
michael@0 492 * C is preceded by a sequence consisting of
michael@0 493 * a cased letter and a case-ignorable sequence,
michael@0 494 * and C is not followed by a sequence consisting of
michael@0 495 * an ignorable sequence and then a cased letter.
michael@0 496 *
michael@0 497 * More_Above
michael@0 498 * C is followed by one or more characters of combining class 230 (ABOVE)
michael@0 499 * in the combining character sequence.
michael@0 500 *
michael@0 501 * After_Soft_Dotted
michael@0 502 * The last preceding character with combining class of zero before C
michael@0 503 * was Soft_Dotted,
michael@0 504 * and there is no intervening combining character class 230 (ABOVE).
michael@0 505 *
michael@0 506 * Before_Dot
michael@0 507 * C is followed by combining dot above (U+0307).
michael@0 508 * Any sequence of characters with a combining class that is neither 0 nor 230
michael@0 509 * may intervene between the current character and the combining dot above.
michael@0 510 *
michael@0 511 * The erratum from 2002-10-31 adds the condition
michael@0 512 *
michael@0 513 * After_I
michael@0 514 * The last preceding base character was an uppercase I, and there is no
michael@0 515 * intervening combining character class 230 (ABOVE).
michael@0 516 *
michael@0 517 * (See Jitterbug 2344 and the comments on After_I below.)
michael@0 518 *
michael@0 519 * Helper definitions in Unicode 3.2 UAX 21:
michael@0 520 *
michael@0 521 * D1. A character C is defined to be cased
michael@0 522 * if it meets any of the following criteria:
michael@0 523 *
michael@0 524 * - The general category of C is Titlecase Letter (Lt)
michael@0 525 * - In [CoreProps], C has one of the properties Uppercase, or Lowercase
michael@0 526 * - Given D = NFD(C), then it is not the case that:
michael@0 527 * D = UCD_lower(D) = UCD_upper(D) = UCD_title(D)
michael@0 528 * (This third criterion does not add any characters to the list
michael@0 529 * for Unicode 3.2. Ignored.)
michael@0 530 *
michael@0 531 * D2. A character C is defined to be case-ignorable
michael@0 532 * if it meets either of the following criteria:
michael@0 533 *
michael@0 534 * - The general category of C is
michael@0 535 * Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or
michael@0 536 * Letter Modifier (Lm), or Symbol Modifier (Sk)
michael@0 537 * - C is one of the following characters
michael@0 538 * U+0027 APOSTROPHE
michael@0 539 * U+00AD SOFT HYPHEN (SHY)
michael@0 540 * U+2019 RIGHT SINGLE QUOTATION MARK
michael@0 541 * (the preferred character for apostrophe)
michael@0 542 *
michael@0 543 * D3. A case-ignorable sequence is a sequence of
michael@0 544 * zero or more case-ignorable characters.
michael@0 545 */
michael@0 546
/* Case-insensitive tests for single letters of a locale ID. */
#define is_either(c, lower, upper) ((c)==(lower) || (c)==(upper))

#define is_a(c) is_either(c, 'a', 'A')
#define is_d(c) is_either(c, 'd', 'D')
#define is_e(c) is_either(c, 'e', 'E')
#define is_i(c) is_either(c, 'i', 'I')
#define is_l(c) is_either(c, 'l', 'L')
#define is_n(c) is_either(c, 'n', 'N')
#define is_r(c) is_either(c, 'r', 'R')
#define is_t(c) is_either(c, 't', 'T')
#define is_u(c) is_either(c, 'u', 'U')
#define is_z(c) is_either(c, 'z', 'Z')

/* Does c end the language code? (underscore, hyphen, or end of string) */
#define is_sep(c) ((c)=='_' || (c)=='-' || (c)==0)
michael@0 560
michael@0 561 /**
michael@0 562 * Requires non-NULL locale ID but otherwise does the equivalent of
michael@0 563 * checking for language codes as if uloc_getLanguage() were called:
michael@0 564 * Accepts both 2- and 3-letter codes and accepts case variants.
michael@0 565 */
michael@0 566 U_CFUNC int32_t
michael@0 567 ucase_getCaseLocale(const char *locale, int32_t *locCache) {
michael@0 568 int32_t result;
michael@0 569 char c;
michael@0 570
michael@0 571 if(locCache!=NULL && (result=*locCache)!=UCASE_LOC_UNKNOWN) {
michael@0 572 return result;
michael@0 573 }
michael@0 574
michael@0 575 result=UCASE_LOC_ROOT;
michael@0 576
michael@0 577 /*
michael@0 578 * This function used to use uloc_getLanguage(), but the current code
michael@0 579 * removes the dependency of this low-level code on uloc implementation code
michael@0 580 * and is faster because not the whole locale ID has to be
michael@0 581 * examined and copied/transformed.
michael@0 582 *
michael@0 583 * Because this code does not want to depend on uloc, the caller must
michael@0 584 * pass in a non-NULL locale, i.e., may need to call uloc_getDefault().
michael@0 585 */
michael@0 586 c=*locale++;
michael@0 587 if(is_t(c)) {
michael@0 588 /* tr or tur? */
michael@0 589 c=*locale++;
michael@0 590 if(is_u(c)) {
michael@0 591 c=*locale++;
michael@0 592 }
michael@0 593 if(is_r(c)) {
michael@0 594 c=*locale;
michael@0 595 if(is_sep(c)) {
michael@0 596 result=UCASE_LOC_TURKISH;
michael@0 597 }
michael@0 598 }
michael@0 599 } else if(is_a(c)) {
michael@0 600 /* az or aze? */
michael@0 601 c=*locale++;
michael@0 602 if(is_z(c)) {
michael@0 603 c=*locale++;
michael@0 604 if(is_e(c)) {
michael@0 605 c=*locale;
michael@0 606 }
michael@0 607 if(is_sep(c)) {
michael@0 608 result=UCASE_LOC_TURKISH;
michael@0 609 }
michael@0 610 }
michael@0 611 } else if(is_l(c)) {
michael@0 612 /* lt or lit? */
michael@0 613 c=*locale++;
michael@0 614 if(is_i(c)) {
michael@0 615 c=*locale++;
michael@0 616 }
michael@0 617 if(is_t(c)) {
michael@0 618 c=*locale;
michael@0 619 if(is_sep(c)) {
michael@0 620 result=UCASE_LOC_LITHUANIAN;
michael@0 621 }
michael@0 622 }
michael@0 623 } else if(is_n(c)) {
michael@0 624 /* nl or nld? */
michael@0 625 c=*locale++;
michael@0 626 if(is_l(c)) {
michael@0 627 c=*locale++;
michael@0 628 if(is_d(c)) {
michael@0 629 c=*locale;
michael@0 630 }
michael@0 631 if(is_sep(c)) {
michael@0 632 result=UCASE_LOC_DUTCH;
michael@0 633 }
michael@0 634 }
michael@0 635 }
michael@0 636
michael@0 637 if(locCache!=NULL) {
michael@0 638 *locCache=result;
michael@0 639 }
michael@0 640 return result;
michael@0 641 }
michael@0 642
michael@0 643 /*
michael@0 644 * Is followed by
michael@0 645 * {case-ignorable}* cased
michael@0 646 * ?
michael@0 647 * (dir determines looking forward/backward)
michael@0 648 * If a character is case-ignorable, it is skipped regardless of whether
michael@0 649 * it is also cased or not.
michael@0 650 */
michael@0 651 static UBool
michael@0 652 isFollowedByCasedLetter(const UCaseProps *csp, UCaseContextIterator *iter, void *context, int8_t dir) {
michael@0 653 UChar32 c;
michael@0 654
michael@0 655 if(iter==NULL) {
michael@0 656 return FALSE;
michael@0 657 }
michael@0 658
michael@0 659 for(/* dir!=0 sets direction */; (c=iter(context, dir))>=0; dir=0) {
michael@0 660 int32_t type=ucase_getTypeOrIgnorable(csp, c);
michael@0 661 if(type&4) {
michael@0 662 /* case-ignorable, continue with the loop */
michael@0 663 } else if(type!=UCASE_NONE) {
michael@0 664 return TRUE; /* followed by cased letter */
michael@0 665 } else {
michael@0 666 return FALSE; /* uncased and not case-ignorable */
michael@0 667 }
michael@0 668 }
michael@0 669
michael@0 670 return FALSE; /* not followed by cased letter */
michael@0 671 }
michael@0 672
michael@0 673 /* Is preceded by Soft_Dotted character with no intervening cc=230 ? */
michael@0 674 static UBool
michael@0 675 isPrecededBySoftDotted(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
michael@0 676 UChar32 c;
michael@0 677 int32_t dotType;
michael@0 678 int8_t dir;
michael@0 679
michael@0 680 if(iter==NULL) {
michael@0 681 return FALSE;
michael@0 682 }
michael@0 683
michael@0 684 for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
michael@0 685 dotType=getDotType(csp, c);
michael@0 686 if(dotType==UCASE_SOFT_DOTTED) {
michael@0 687 return TRUE; /* preceded by TYPE_i */
michael@0 688 } else if(dotType!=UCASE_OTHER_ACCENT) {
michael@0 689 return FALSE; /* preceded by different base character (not TYPE_i), or intervening cc==230 */
michael@0 690 }
michael@0 691 }
michael@0 692
michael@0 693 return FALSE; /* not preceded by TYPE_i */
michael@0 694 }
michael@0 695
michael@0 696 /*
michael@0 697 * See Jitterbug 2344:
michael@0 698 * The condition After_I for Turkic-lowercasing of U+0307 combining dot above
michael@0 699 * is checked in ICU 2.0, 2.1, 2.6 but was not in 2.2 & 2.4 because
michael@0 700 * we made those releases compatible with Unicode 3.2 which had not fixed
michael@0 701 * a related bug in SpecialCasing.txt.
michael@0 702 *
michael@0 703 * From the Jitterbug 2344 text:
michael@0 704 * ... this bug is listed as a Unicode erratum
michael@0 705 * from 2002-10-31 at http://www.unicode.org/uni2errata/UnicodeErrata.html
michael@0 706 * <quote>
michael@0 707 * There are two errors in SpecialCasing.txt.
michael@0 708 * 1. Missing semicolons on two lines. ... [irrelevant for ICU]
michael@0 709 * 2. An incorrect context definition. Correct as follows:
michael@0 710 * < 0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE
michael@0 711 * < 0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE
michael@0 712 * ---
michael@0 713 * > 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
michael@0 714 * > 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
michael@0 715 * where the context After_I is defined as:
michael@0 716 * The last preceding base character was an uppercase I, and there is no
michael@0 717 * intervening combining character class 230 (ABOVE).
michael@0 718 * </quote>
michael@0 719 *
michael@0 720 * Note that SpecialCasing.txt even in Unicode 3.2 described the condition as:
michael@0 721 *
michael@0 722 * # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
michael@0 723 * # This matches the behavior of the canonically equivalent I-dot_above
michael@0 724 *
michael@0 725 * See also the description in this place in older versions of uchar.c (revision 1.100).
michael@0 726 *
michael@0 727 * Markus W. Scherer 2003-feb-15
michael@0 728 */
michael@0 729
michael@0 730 /* Is preceded by base character 'I' with no intervening cc=230 ? */
michael@0 731 static UBool
michael@0 732 isPrecededBy_I(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
michael@0 733 UChar32 c;
michael@0 734 int32_t dotType;
michael@0 735 int8_t dir;
michael@0 736
michael@0 737 if(iter==NULL) {
michael@0 738 return FALSE;
michael@0 739 }
michael@0 740
michael@0 741 for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
michael@0 742 if(c==0x49) {
michael@0 743 return TRUE; /* preceded by I */
michael@0 744 }
michael@0 745 dotType=getDotType(csp, c);
michael@0 746 if(dotType!=UCASE_OTHER_ACCENT) {
michael@0 747 return FALSE; /* preceded by different base character (not I), or intervening cc==230 */
michael@0 748 }
michael@0 749 }
michael@0 750
michael@0 751 return FALSE; /* not preceded by I */
michael@0 752 }
michael@0 753
michael@0 754 /* Is followed by one or more cc==230 ? */
michael@0 755 static UBool
michael@0 756 isFollowedByMoreAbove(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
michael@0 757 UChar32 c;
michael@0 758 int32_t dotType;
michael@0 759 int8_t dir;
michael@0 760
michael@0 761 if(iter==NULL) {
michael@0 762 return FALSE;
michael@0 763 }
michael@0 764
michael@0 765 for(dir=1; (c=iter(context, dir))>=0; dir=0) {
michael@0 766 dotType=getDotType(csp, c);
michael@0 767 if(dotType==UCASE_ABOVE) {
michael@0 768 return TRUE; /* at least one cc==230 following */
michael@0 769 } else if(dotType!=UCASE_OTHER_ACCENT) {
michael@0 770 return FALSE; /* next base character, no more cc==230 following */
michael@0 771 }
michael@0 772 }
michael@0 773
michael@0 774 return FALSE; /* no more cc==230 following */
michael@0 775 }
michael@0 776
michael@0 777 /* Is followed by a dot above (without cc==230 in between) ? */
michael@0 778 static UBool
michael@0 779 isFollowedByDotAbove(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
michael@0 780 UChar32 c;
michael@0 781 int32_t dotType;
michael@0 782 int8_t dir;
michael@0 783
michael@0 784 if(iter==NULL) {
michael@0 785 return FALSE;
michael@0 786 }
michael@0 787
michael@0 788 for(dir=1; (c=iter(context, dir))>=0; dir=0) {
michael@0 789 if(c==0x307) {
michael@0 790 return TRUE;
michael@0 791 }
michael@0 792 dotType=getDotType(csp, c);
michael@0 793 if(dotType!=UCASE_OTHER_ACCENT) {
michael@0 794 return FALSE; /* next base character or cc==230 in between */
michael@0 795 }
michael@0 796 }
michael@0 797
michael@0 798 return FALSE; /* no dot above following */
michael@0 799 }
michael@0 800
michael@0 801 U_CAPI int32_t U_EXPORT2
michael@0 802 ucase_toFullLower(const UCaseProps *csp, UChar32 c,
michael@0 803 UCaseContextIterator *iter, void *context,
michael@0 804 const UChar **pString,
michael@0 805 const char *locale, int32_t *locCache)
michael@0 806 {
michael@0 807 UChar32 result=c;
michael@0 808 uint16_t props=UTRIE2_GET16(&csp->trie, c);
michael@0 809 if(!PROPS_HAS_EXCEPTION(props)) {
michael@0 810 if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
michael@0 811 result=c+UCASE_GET_DELTA(props);
michael@0 812 }
michael@0 813 } else {
michael@0 814 const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2;
michael@0 815 uint16_t excWord=*pe++;
michael@0 816 int32_t full;
michael@0 817
michael@0 818 pe2=pe;
michael@0 819
michael@0 820 if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
michael@0 821 /* use hardcoded conditions and mappings */
michael@0 822 int32_t loc=ucase_getCaseLocale(locale, locCache);
michael@0 823
michael@0 824 /*
michael@0 825 * Test for conditional mappings first
michael@0 826 * (otherwise the unconditional default mappings are always taken),
michael@0 827 * then test for characters that have unconditional mappings in SpecialCasing.txt,
michael@0 828 * then get the UnicodeData.txt mappings.
michael@0 829 */
michael@0 830 if( loc==UCASE_LOC_LITHUANIAN &&
michael@0 831 /* base characters, find accents above */
michael@0 832 (((c==0x49 || c==0x4a || c==0x12e) &&
michael@0 833 isFollowedByMoreAbove(csp, iter, context)) ||
michael@0 834 /* precomposed with accent above, no need to find one */
michael@0 835 (c==0xcc || c==0xcd || c==0x128))
michael@0 836 ) {
michael@0 837 /*
michael@0 838 # Lithuanian
michael@0 839
michael@0 840 # Lithuanian retains the dot in a lowercase i when followed by accents.
michael@0 841
michael@0 842 # Introduce an explicit dot above when lowercasing capital I's and J's
michael@0 843 # whenever there are more accents above.
michael@0 844 # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
michael@0 845
michael@0 846 0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
michael@0 847 004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
michael@0 848 012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
michael@0 849 00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
michael@0 850 00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
michael@0 851 0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
michael@0 852 */
michael@0 853 switch(c) {
michael@0 854 case 0x49: /* LATIN CAPITAL LETTER I */
michael@0 855 *pString=iDot;
michael@0 856 return 2;
michael@0 857 case 0x4a: /* LATIN CAPITAL LETTER J */
michael@0 858 *pString=jDot;
michael@0 859 return 2;
michael@0 860 case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */
michael@0 861 *pString=iOgonekDot;
michael@0 862 return 2;
michael@0 863 case 0xcc: /* LATIN CAPITAL LETTER I WITH GRAVE */
michael@0 864 *pString=iDotGrave;
michael@0 865 return 3;
michael@0 866 case 0xcd: /* LATIN CAPITAL LETTER I WITH ACUTE */
michael@0 867 *pString=iDotAcute;
michael@0 868 return 3;
michael@0 869 case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */
michael@0 870 *pString=iDotTilde;
michael@0 871 return 3;
michael@0 872 default:
michael@0 873 return 0; /* will not occur */
michael@0 874 }
michael@0 875 /* # Turkish and Azeri */
michael@0 876 } else if(loc==UCASE_LOC_TURKISH && c==0x130) {
michael@0 877 /*
michael@0 878 # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
michael@0 879 # The following rules handle those cases.
michael@0 880
michael@0 881 0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE
michael@0 882 0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE
michael@0 883 */
michael@0 884 return 0x69;
michael@0 885 } else if(loc==UCASE_LOC_TURKISH && c==0x307 && isPrecededBy_I(csp, iter, context)) {
michael@0 886 /*
michael@0 887 # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
michael@0 888 # This matches the behavior of the canonically equivalent I-dot_above
michael@0 889
michael@0 890 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
michael@0 891 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
michael@0 892 */
michael@0 893 return 0; /* remove the dot (continue without output) */
michael@0 894 } else if(loc==UCASE_LOC_TURKISH && c==0x49 && !isFollowedByDotAbove(csp, iter, context)) {
michael@0 895 /*
michael@0 896 # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
michael@0 897
michael@0 898 0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
michael@0 899 0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I
michael@0 900 */
michael@0 901 return 0x131;
michael@0 902 } else if(c==0x130) {
michael@0 903 /*
michael@0 904 # Preserve canonical equivalence for I with dot. Turkic is handled below.
michael@0 905
michael@0 906 0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
michael@0 907 */
michael@0 908 *pString=iDot;
michael@0 909 return 2;
michael@0 910 } else if( c==0x3a3 &&
michael@0 911 !isFollowedByCasedLetter(csp, iter, context, 1) &&
michael@0 912 isFollowedByCasedLetter(csp, iter, context, -1) /* -1=preceded */
michael@0 913 ) {
michael@0 914 /* greek capital sigma maps depending on surrounding cased letters (see SpecialCasing.txt) */
michael@0 915 /*
michael@0 916 # Special case for final form of sigma
michael@0 917
michael@0 918 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
michael@0 919 */
michael@0 920 return 0x3c2; /* greek small final sigma */
michael@0 921 } else {
michael@0 922 /* no known conditional special case mapping, use a normal mapping */
michael@0 923 }
michael@0 924 } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
michael@0 925 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
michael@0 926 full&=UCASE_FULL_LOWER;
michael@0 927 if(full!=0) {
michael@0 928 /* set the output pointer to the lowercase mapping */
michael@0 929 *pString=reinterpret_cast<const UChar *>(pe+1);
michael@0 930
michael@0 931 /* return the string length */
michael@0 932 return full;
michael@0 933 }
michael@0 934 }
michael@0 935
michael@0 936 if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
michael@0 937 GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe2, result);
michael@0 938 }
michael@0 939 }
michael@0 940
michael@0 941 return (result==c) ? ~result : result;
michael@0 942 }
michael@0 943
michael@0 944 /* internal */
michael@0 945 static int32_t
michael@0 946 toUpperOrTitle(const UCaseProps *csp, UChar32 c,
michael@0 947 UCaseContextIterator *iter, void *context,
michael@0 948 const UChar **pString,
michael@0 949 const char *locale, int32_t *locCache,
michael@0 950 UBool upperNotTitle) {
michael@0 951 UChar32 result=c;
michael@0 952 uint16_t props=UTRIE2_GET16(&csp->trie, c);
michael@0 953 if(!PROPS_HAS_EXCEPTION(props)) {
michael@0 954 if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
michael@0 955 result=c+UCASE_GET_DELTA(props);
michael@0 956 }
michael@0 957 } else {
michael@0 958 const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2;
michael@0 959 uint16_t excWord=*pe++;
michael@0 960 int32_t full, idx;
michael@0 961
michael@0 962 pe2=pe;
michael@0 963
michael@0 964 if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
michael@0 965 /* use hardcoded conditions and mappings */
michael@0 966 int32_t loc=ucase_getCaseLocale(locale, locCache);
michael@0 967
michael@0 968 if(loc==UCASE_LOC_TURKISH && c==0x69) {
michael@0 969 /*
michael@0 970 # Turkish and Azeri
michael@0 971
michael@0 972 # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
michael@0 973 # The following rules handle those cases.
michael@0 974
michael@0 975 # When uppercasing, i turns into a dotted capital I
michael@0 976
michael@0 977 0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
michael@0 978 0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
michael@0 979 */
michael@0 980 return 0x130;
michael@0 981 } else if(loc==UCASE_LOC_LITHUANIAN && c==0x307 && isPrecededBySoftDotted(csp, iter, context)) {
michael@0 982 /*
michael@0 983 # Lithuanian
michael@0 984
michael@0 985 # Lithuanian retains the dot in a lowercase i when followed by accents.
michael@0 986
michael@0 987 # Remove DOT ABOVE after "i" with upper or titlecase
michael@0 988
michael@0 989 0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
michael@0 990 */
michael@0 991 return 0; /* remove the dot (continue without output) */
michael@0 992 } else {
michael@0 993 /* no known conditional special case mapping, use a normal mapping */
michael@0 994 }
michael@0 995 } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
michael@0 996 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
michael@0 997
michael@0 998 /* start of full case mapping strings */
michael@0 999 ++pe;
michael@0 1000
michael@0 1001 /* skip the lowercase and case-folding result strings */
michael@0 1002 pe+=full&UCASE_FULL_LOWER;
michael@0 1003 full>>=4;
michael@0 1004 pe+=full&0xf;
michael@0 1005 full>>=4;
michael@0 1006
michael@0 1007 if(upperNotTitle) {
michael@0 1008 full&=0xf;
michael@0 1009 } else {
michael@0 1010 /* skip the uppercase result string */
michael@0 1011 pe+=full&0xf;
michael@0 1012 full=(full>>4)&0xf;
michael@0 1013 }
michael@0 1014
michael@0 1015 if(full!=0) {
michael@0 1016 /* set the output pointer to the result string */
michael@0 1017 *pString=reinterpret_cast<const UChar *>(pe);
michael@0 1018
michael@0 1019 /* return the string length */
michael@0 1020 return full;
michael@0 1021 }
michael@0 1022 }
michael@0 1023
michael@0 1024 if(!upperNotTitle && HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
michael@0 1025 idx=UCASE_EXC_TITLE;
michael@0 1026 } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
michael@0 1027 /* here, titlecase is same as uppercase */
michael@0 1028 idx=UCASE_EXC_UPPER;
michael@0 1029 } else {
michael@0 1030 return ~c;
michael@0 1031 }
michael@0 1032 GET_SLOT_VALUE(excWord, idx, pe2, result);
michael@0 1033 }
michael@0 1034
michael@0 1035 return (result==c) ? ~result : result;
michael@0 1036 }
michael@0 1037
michael@0 1038 U_CAPI int32_t U_EXPORT2
michael@0 1039 ucase_toFullUpper(const UCaseProps *csp, UChar32 c,
michael@0 1040 UCaseContextIterator *iter, void *context,
michael@0 1041 const UChar **pString,
michael@0 1042 const char *locale, int32_t *locCache) {
michael@0 1043 return toUpperOrTitle(csp, c, iter, context, pString, locale, locCache, TRUE);
michael@0 1044 }
michael@0 1045
michael@0 1046 U_CAPI int32_t U_EXPORT2
michael@0 1047 ucase_toFullTitle(const UCaseProps *csp, UChar32 c,
michael@0 1048 UCaseContextIterator *iter, void *context,
michael@0 1049 const UChar **pString,
michael@0 1050 const char *locale, int32_t *locCache) {
michael@0 1051 return toUpperOrTitle(csp, c, iter, context, pString, locale, locCache, FALSE);
michael@0 1052 }
michael@0 1053
michael@0 1054 /* case folding ------------------------------------------------------------- */
michael@0 1055
michael@0 1056 /*
michael@0 1057 * Case folding is similar to lowercasing.
michael@0 1058 * The result may be a simple mapping, i.e., a single code point, or
michael@0 1059 * a full mapping, i.e., a string.
michael@0 1060 * If the case folding for a code point is the same as its simple (1:1) lowercase mapping,
michael@0 1061 * then only the lowercase mapping is stored.
michael@0 1062 *
michael@0 1063 * Some special cases are hardcoded because their conditions cannot be
michael@0 1064 * parsed and processed from CaseFolding.txt.
michael@0 1065 *
michael@0 1066 * Unicode 3.2 CaseFolding.txt specifies for its status field:
michael@0 1067
michael@0 1068 # C: common case folding, common mappings shared by both simple and full mappings.
michael@0 1069 # F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces.
michael@0 1070 # S: simple case folding, mappings to single characters where different from F.
michael@0 1071 # T: special case for uppercase I and dotted uppercase I
michael@0 1072 # - For non-Turkic languages, this mapping is normally not used.
michael@0 1073 # - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters.
michael@0 1074 #
michael@0 1075 # Usage:
michael@0 1076 # A. To do a simple case folding, use the mappings with status C + S.
michael@0 1077 # B. To do a full case folding, use the mappings with status C + F.
michael@0 1078 #
michael@0 1079 # The mappings with status T can be used or omitted depending on the desired case-folding
michael@0 1080 # behavior. (The default option is to exclude them.)
michael@0 1081
michael@0 1082 * Unicode 3.2 has 'T' mappings as follows:
michael@0 1083
michael@0 1084 0049; T; 0131; # LATIN CAPITAL LETTER I
michael@0 1085 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE
michael@0 1086
michael@0 1087 * while the default mappings for these code points are:
michael@0 1088
michael@0 1089 0049; C; 0069; # LATIN CAPITAL LETTER I
michael@0 1090 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
michael@0 1091
michael@0 1092 * U+0130 has no simple case folding (simple-case-folds to itself).
michael@0 1093 */
michael@0 1094
michael@0 1095 /* return the simple case folding mapping for c */
michael@0 1096 U_CAPI UChar32 U_EXPORT2
michael@0 1097 ucase_fold(const UCaseProps *csp, UChar32 c, uint32_t options) {
michael@0 1098 uint16_t props=UTRIE2_GET16(&csp->trie, c);
michael@0 1099 if(!PROPS_HAS_EXCEPTION(props)) {
michael@0 1100 if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
michael@0 1101 c+=UCASE_GET_DELTA(props);
michael@0 1102 }
michael@0 1103 } else {
michael@0 1104 const uint16_t *pe=GET_EXCEPTIONS(csp, props);
michael@0 1105 uint16_t excWord=*pe++;
michael@0 1106 int32_t idx;
michael@0 1107 if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
michael@0 1108 /* special case folding mappings, hardcoded */
michael@0 1109 if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
michael@0 1110 /* default mappings */
michael@0 1111 if(c==0x49) {
michael@0 1112 /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
michael@0 1113 return 0x69;
michael@0 1114 } else if(c==0x130) {
michael@0 1115 /* no simple case folding for U+0130 */
michael@0 1116 return c;
michael@0 1117 }
michael@0 1118 } else {
michael@0 1119 /* Turkic mappings */
michael@0 1120 if(c==0x49) {
michael@0 1121 /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
michael@0 1122 return 0x131;
michael@0 1123 } else if(c==0x130) {
michael@0 1124 /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
michael@0 1125 return 0x69;
michael@0 1126 }
michael@0 1127 }
michael@0 1128 }
michael@0 1129 if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
michael@0 1130 idx=UCASE_EXC_FOLD;
michael@0 1131 } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
michael@0 1132 idx=UCASE_EXC_LOWER;
michael@0 1133 } else {
michael@0 1134 return c;
michael@0 1135 }
michael@0 1136 GET_SLOT_VALUE(excWord, idx, pe, c);
michael@0 1137 }
michael@0 1138 return c;
michael@0 1139 }
michael@0 1140
michael@0 1141 /*
michael@0 1142 * Issue for canonical caseless match (UAX #21):
michael@0 1143 * Turkic casefolding (using "T" mappings in CaseFolding.txt) does not preserve
michael@0 1144 * canonical equivalence, unlike default-option casefolding.
michael@0 1145 * For example, I-grave and I + grave fold to strings that are not canonically
michael@0 1146 * equivalent.
michael@0 1147 * For more details, see the comment in unorm_compare() in unorm.cpp
michael@0 1148 * and the intermediate prototype changes for Jitterbug 2021.
michael@0 1149 * (For example, revision 1.104 of uchar.c and 1.4 of CaseFolding.txt.)
michael@0 1150 *
michael@0 1151 * This did not get fixed because it appears that it is not possible to fix
michael@0 1152 * it for uppercase and lowercase characters (I-grave vs. i-grave)
michael@0 1153 * together in a way that they still fold to common result strings.
michael@0 1154 */
michael@0 1155
michael@0 1156 U_CAPI int32_t U_EXPORT2
michael@0 1157 ucase_toFullFolding(const UCaseProps *csp, UChar32 c,
michael@0 1158 const UChar **pString,
michael@0 1159 uint32_t options)
michael@0 1160 {
michael@0 1161 UChar32 result=c;
michael@0 1162 uint16_t props=UTRIE2_GET16(&csp->trie, c);
michael@0 1163 if(!PROPS_HAS_EXCEPTION(props)) {
michael@0 1164 if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
michael@0 1165 result=c+UCASE_GET_DELTA(props);
michael@0 1166 }
michael@0 1167 } else {
michael@0 1168 const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2;
michael@0 1169 uint16_t excWord=*pe++;
michael@0 1170 int32_t full, idx;
michael@0 1171
michael@0 1172 pe2=pe;
michael@0 1173
michael@0 1174 if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
michael@0 1175 /* use hardcoded conditions and mappings */
michael@0 1176 if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
michael@0 1177 /* default mappings */
michael@0 1178 if(c==0x49) {
michael@0 1179 /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
michael@0 1180 return 0x69;
michael@0 1181 } else if(c==0x130) {
michael@0 1182 /* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
michael@0 1183 *pString=iDot;
michael@0 1184 return 2;
michael@0 1185 }
michael@0 1186 } else {
michael@0 1187 /* Turkic mappings */
michael@0 1188 if(c==0x49) {
michael@0 1189 /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
michael@0 1190 return 0x131;
michael@0 1191 } else if(c==0x130) {
michael@0 1192 /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
michael@0 1193 return 0x69;
michael@0 1194 }
michael@0 1195 }
michael@0 1196 } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
michael@0 1197 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
michael@0 1198
michael@0 1199 /* start of full case mapping strings */
michael@0 1200 ++pe;
michael@0 1201
michael@0 1202 /* skip the lowercase result string */
michael@0 1203 pe+=full&UCASE_FULL_LOWER;
michael@0 1204 full=(full>>4)&0xf;
michael@0 1205
michael@0 1206 if(full!=0) {
michael@0 1207 /* set the output pointer to the result string */
michael@0 1208 *pString=reinterpret_cast<const UChar *>(pe);
michael@0 1209
michael@0 1210 /* return the string length */
michael@0 1211 return full;
michael@0 1212 }
michael@0 1213 }
michael@0 1214
michael@0 1215 if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
michael@0 1216 idx=UCASE_EXC_FOLD;
michael@0 1217 } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
michael@0 1218 idx=UCASE_EXC_LOWER;
michael@0 1219 } else {
michael@0 1220 return ~c;
michael@0 1221 }
michael@0 1222 GET_SLOT_VALUE(excWord, idx, pe2, result);
michael@0 1223 }
michael@0 1224
michael@0 1225 return (result==c) ? ~result : result;
michael@0 1226 }
michael@0 1227
michael@0 1228 /* case mapping properties API ---------------------------------------------- */
michael@0 1229
/*
 * Yields the address of ucase_props_singleton.
 * The expansion is parenthesized so that it behaves as a single operand in
 * any expression context, e.g. GET_CASE_PROPS()->member or GET_CASE_PROPS()[0].
 */
#define GET_CASE_PROPS() (&ucase_props_singleton)
michael@0 1231
michael@0 1232 /* public API (see uchar.h) */
michael@0 1233
michael@0 1234 U_CAPI UBool U_EXPORT2
michael@0 1235 u_isULowercase(UChar32 c) {
michael@0 1236 return (UBool)(UCASE_LOWER==ucase_getType(GET_CASE_PROPS(), c));
michael@0 1237 }
michael@0 1238
michael@0 1239 U_CAPI UBool U_EXPORT2
michael@0 1240 u_isUUppercase(UChar32 c) {
michael@0 1241 return (UBool)(UCASE_UPPER==ucase_getType(GET_CASE_PROPS(), c));
michael@0 1242 }
michael@0 1243
michael@0 1244 /* Transforms the Unicode character to its lower case equivalent.*/
michael@0 1245 U_CAPI UChar32 U_EXPORT2
michael@0 1246 u_tolower(UChar32 c) {
michael@0 1247 return ucase_tolower(GET_CASE_PROPS(), c);
michael@0 1248 }
michael@0 1249
michael@0 1250 /* Transforms the Unicode character to its upper case equivalent.*/
michael@0 1251 U_CAPI UChar32 U_EXPORT2
michael@0 1252 u_toupper(UChar32 c) {
michael@0 1253 return ucase_toupper(GET_CASE_PROPS(), c);
michael@0 1254 }
michael@0 1255
michael@0 1256 /* Transforms the Unicode character to its title case equivalent.*/
michael@0 1257 U_CAPI UChar32 U_EXPORT2
michael@0 1258 u_totitle(UChar32 c) {
michael@0 1259 return ucase_totitle(GET_CASE_PROPS(), c);
michael@0 1260 }
michael@0 1261
michael@0 1262 /* return the simple case folding mapping for c */
michael@0 1263 U_CAPI UChar32 U_EXPORT2
michael@0 1264 u_foldCase(UChar32 c, uint32_t options) {
michael@0 1265 return ucase_fold(GET_CASE_PROPS(), c, options);
michael@0 1266 }
michael@0 1267
michael@0 1268 U_CFUNC int32_t U_EXPORT2
michael@0 1269 ucase_hasBinaryProperty(UChar32 c, UProperty which) {
michael@0 1270 /* case mapping properties */
michael@0 1271 const UChar *resultString;
michael@0 1272 int32_t locCache;
michael@0 1273 const UCaseProps *csp=GET_CASE_PROPS();
michael@0 1274 if(csp==NULL) {
michael@0 1275 return FALSE;
michael@0 1276 }
michael@0 1277 switch(which) {
michael@0 1278 case UCHAR_LOWERCASE:
michael@0 1279 return (UBool)(UCASE_LOWER==ucase_getType(csp, c));
michael@0 1280 case UCHAR_UPPERCASE:
michael@0 1281 return (UBool)(UCASE_UPPER==ucase_getType(csp, c));
michael@0 1282 case UCHAR_SOFT_DOTTED:
michael@0 1283 return ucase_isSoftDotted(csp, c);
michael@0 1284 case UCHAR_CASE_SENSITIVE:
michael@0 1285 return ucase_isCaseSensitive(csp, c);
michael@0 1286 case UCHAR_CASED:
michael@0 1287 return (UBool)(UCASE_NONE!=ucase_getType(csp, c));
michael@0 1288 case UCHAR_CASE_IGNORABLE:
michael@0 1289 return (UBool)(ucase_getTypeOrIgnorable(csp, c)>>2);
michael@0 1290 /*
michael@0 1291 * Note: The following Changes_When_Xyz are defined as testing whether
michael@0 1292 * the NFD form of the input changes when Xyz-case-mapped.
michael@0 1293 * However, this simpler implementation of these properties,
michael@0 1294 * ignoring NFD, passes the tests.
michael@0 1295 * The implementation needs to be changed if the tests start failing.
michael@0 1296 * When that happens, optimizations should be used to work with the
michael@0 1297 * per-single-code point ucase_toFullXyz() functions unless
michael@0 1298 * the NFD form has more than one code point,
michael@0 1299 * and the property starts set needs to be the union of the
michael@0 1300 * start sets for normalization and case mappings.
michael@0 1301 */
michael@0 1302 case UCHAR_CHANGES_WHEN_LOWERCASED:
michael@0 1303 locCache=UCASE_LOC_ROOT;
michael@0 1304 return (UBool)(ucase_toFullLower(csp, c, NULL, NULL, &resultString, "", &locCache)>=0);
michael@0 1305 case UCHAR_CHANGES_WHEN_UPPERCASED:
michael@0 1306 locCache=UCASE_LOC_ROOT;
michael@0 1307 return (UBool)(ucase_toFullUpper(csp, c, NULL, NULL, &resultString, "", &locCache)>=0);
michael@0 1308 case UCHAR_CHANGES_WHEN_TITLECASED:
michael@0 1309 locCache=UCASE_LOC_ROOT;
michael@0 1310 return (UBool)(ucase_toFullTitle(csp, c, NULL, NULL, &resultString, "", &locCache)>=0);
michael@0 1311 /* case UCHAR_CHANGES_WHEN_CASEFOLDED: -- in uprops.c */
michael@0 1312 case UCHAR_CHANGES_WHEN_CASEMAPPED:
michael@0 1313 locCache=UCASE_LOC_ROOT;
michael@0 1314 return (UBool)(
michael@0 1315 ucase_toFullLower(csp, c, NULL, NULL, &resultString, "", &locCache)>=0 ||
michael@0 1316 ucase_toFullUpper(csp, c, NULL, NULL, &resultString, "", &locCache)>=0 ||
michael@0 1317 ucase_toFullTitle(csp, c, NULL, NULL, &resultString, "", &locCache)>=0);
michael@0 1318 default:
michael@0 1319 return FALSE;
michael@0 1320 }
michael@0 1321 }

mercurial