|
1 /* |
|
2 ******************************************************************************* |
|
3 * |
|
4 * Copyright (C) 2004-2012, International Business Machines |
|
5 * Corporation and others. All Rights Reserved. |
|
6 * |
|
7 ******************************************************************************* |
|
8 * file name: ucase.cpp |
|
9 * encoding: US-ASCII |
|
10 * tab size: 8 (not used) |
|
11 * indentation:4 |
|
12 * |
|
13 * created on: 2004aug30 |
|
14 * created by: Markus W. Scherer |
|
15 * |
|
16 * Low-level Unicode character/string case mapping code. |
|
17 * Much code moved here (and modified) from uchar.c. |
|
18 */ |
|
19 |
|
20 #include "unicode/utypes.h" |
|
21 #include "unicode/unistr.h" |
|
22 #include "unicode/uset.h" |
|
23 #include "unicode/udata.h" /* UDataInfo */ |
|
24 #include "unicode/utf16.h" |
|
25 #include "ucmndata.h" /* DataHeader */ |
|
26 #include "udatamem.h" |
|
27 #include "umutex.h" |
|
28 #include "uassert.h" |
|
29 #include "cmemory.h" |
|
30 #include "utrie2.h" |
|
31 #include "ucase.h" |
|
32 #include "ucln_cmn.h" |
|
33 |
|
/*
 * Case-mapping property data as used by the functions in this file.
 * NOTE(review): the singleton instance presumably comes from the
 * machine-generated ucase_props_data.h, so the field order must not change --
 * confirm against the generator output.
 */
struct UCaseProps {
    UDataMemory *mem;            /* presumably the backing data memory, if any -- TODO confirm */
    const int32_t *indexes;      /* presumably the index array of the data -- TODO confirm */
    const uint16_t *exceptions;  /* exception words and their optional slots, see GET_EXCEPTIONS() */
    const uint16_t *unfold;      /* reverse case folding table (header row + data rows); may be NULL */

    UTrie2 trie;                 /* maps a code point to its 16-bit properties word */
    uint8_t formatVersion[4];
};
|
43 |
|
44 /* ucase_props_data.h is machine-generated by gencase --csource */ |
|
45 #define INCLUDED_FROM_UCASE_CPP |
|
46 #include "ucase_props_data.h" |
|
47 |
|
48 /* UCaseProps singleton ----------------------------------------------------- */ |
|
49 |
|
50 U_CAPI const UCaseProps * U_EXPORT2 |
|
51 ucase_getSingleton() { |
|
52 return &ucase_props_singleton; |
|
53 } |
|
54 |
|
55 /* set of property starts for UnicodeSet ------------------------------------ */ |
|
56 |
|
57 static UBool U_CALLCONV |
|
58 _enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) { |
|
59 /* add the start code point to the USet */ |
|
60 const USetAdder *sa=(const USetAdder *)context; |
|
61 sa->add(sa->set, start); |
|
62 return TRUE; |
|
63 } |
|
64 |
|
65 U_CFUNC void U_EXPORT2 |
|
66 ucase_addPropertyStarts(const UCaseProps *csp, const USetAdder *sa, UErrorCode *pErrorCode) { |
|
67 if(U_FAILURE(*pErrorCode)) { |
|
68 return; |
|
69 } |
|
70 |
|
71 /* add the start code point of each same-value range of the trie */ |
|
72 utrie2_enum(&csp->trie, NULL, _enumPropertyStartsRange, sa); |
|
73 |
|
74 /* add code points with hardcoded properties, plus the ones following them */ |
|
75 |
|
76 /* (none right now, see comment below) */ |
|
77 |
|
78 /* |
|
79 * Omit code points with hardcoded specialcasing properties |
|
80 * because we do not build property UnicodeSets for them right now. |
|
81 */ |
|
82 } |
|
83 |
|
84 /* data access primitives --------------------------------------------------- */ |
|
85 |
|
/* Pointer to the exceptions word for props: csp->exceptions plus the offset stored above UCASE_EXC_SHIFT. */
#define GET_EXCEPTIONS(csp, props) ((csp)->exceptions+((props)>>UCASE_EXC_SHIFT))

/* Nonzero if the properties word has the UCASE_EXCEPTION bit set, i.e. its data is read via GET_EXCEPTIONS(). */
#define PROPS_HAS_EXCEPTION(props) ((props)&UCASE_EXCEPTION)
|
89 |
|
/*
 * flagsOffset[b] is the number of 1-bits in the 8-bit value b.
 *
 * The table is produced by the preprocessor instead of being spelled out:
 * UCASE_BITS_2(n) lists the bit counts for the four values of a 2-bit field
 * when n bits are already set elsewhere; the wider helpers nest it.
 */
#define UCASE_BITS_2(n) (n), (n)+1, (n)+1, (n)+2
#define UCASE_BITS_4(n) UCASE_BITS_2(n), UCASE_BITS_2((n)+1), UCASE_BITS_2((n)+1), UCASE_BITS_2((n)+2)
#define UCASE_BITS_6(n) UCASE_BITS_4(n), UCASE_BITS_4((n)+1), UCASE_BITS_4((n)+1), UCASE_BITS_4((n)+2)

static const uint8_t flagsOffset[256]={
    UCASE_BITS_6(0), UCASE_BITS_6(1), UCASE_BITS_6(1), UCASE_BITS_6(2)
};

#undef UCASE_BITS_6
#undef UCASE_BITS_4
#undef UCASE_BITS_2
|
109 |
|
/* Nonzero if bit idx is set in flags, i.e. the optional slot idx is present. */
#define HAS_SLOT(flags, idx) ((flags)&(1<<(idx)))
/* Number of present slots before slot idx: the count of set flag bits below bit idx. */
#define SLOT_OFFSET(flags, idx) (flagsOffset[(flags)&((1<<(idx))-1)])

/*
 * Get the value of an optional-value slot where HAS_SLOT(excWord, idx).
 *
 * Slots are one uint16_t wide, or two (high half first) when
 * UCASE_EXC_DOUBLE_SLOTS is set in excWord.
 *
 * The macro expands to a single statement (do ... while(0)) and must be
 * followed by a semicolon, so it is safe inside unbraced if/else.
 *
 * @param excWord (in) initial exceptions word
 * @param idx (in) desired slot index
 * @param pExc16 (in/out) const uint16_t * after excWord=*pExc16++;
 *               moved to the last uint16_t of the value, use +1 for beginning of next slot
 * @param value (out) int32_t or uint32_t output; always written by this macro,
 *              so only use it where HAS_SLOT(excWord, idx)
 */
#define GET_SLOT_VALUE(excWord, idx, pExc16, value) \
    do { \
        if(((excWord)&UCASE_EXC_DOUBLE_SLOTS)==0) { \
            (pExc16)+=SLOT_OFFSET(excWord, idx); \
            (value)=*(pExc16); \
        } else { \
            (pExc16)+=2*SLOT_OFFSET(excWord, idx); \
            (value)=*(pExc16)++; \
            (value)=((value)<<16)|*(pExc16); \
        } \
    } while(0)
|
131 |
|
132 /* simple case mappings ----------------------------------------------------- */ |
|
133 |
|
134 U_CAPI UChar32 U_EXPORT2 |
|
135 ucase_tolower(const UCaseProps *csp, UChar32 c) { |
|
136 uint16_t props=UTRIE2_GET16(&csp->trie, c); |
|
137 if(!PROPS_HAS_EXCEPTION(props)) { |
|
138 if(UCASE_GET_TYPE(props)>=UCASE_UPPER) { |
|
139 c+=UCASE_GET_DELTA(props); |
|
140 } |
|
141 } else { |
|
142 const uint16_t *pe=GET_EXCEPTIONS(csp, props); |
|
143 uint16_t excWord=*pe++; |
|
144 if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) { |
|
145 GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe, c); |
|
146 } |
|
147 } |
|
148 return c; |
|
149 } |
|
150 |
|
151 U_CAPI UChar32 U_EXPORT2 |
|
152 ucase_toupper(const UCaseProps *csp, UChar32 c) { |
|
153 uint16_t props=UTRIE2_GET16(&csp->trie, c); |
|
154 if(!PROPS_HAS_EXCEPTION(props)) { |
|
155 if(UCASE_GET_TYPE(props)==UCASE_LOWER) { |
|
156 c+=UCASE_GET_DELTA(props); |
|
157 } |
|
158 } else { |
|
159 const uint16_t *pe=GET_EXCEPTIONS(csp, props); |
|
160 uint16_t excWord=*pe++; |
|
161 if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) { |
|
162 GET_SLOT_VALUE(excWord, UCASE_EXC_UPPER, pe, c); |
|
163 } |
|
164 } |
|
165 return c; |
|
166 } |
|
167 |
|
168 U_CAPI UChar32 U_EXPORT2 |
|
169 ucase_totitle(const UCaseProps *csp, UChar32 c) { |
|
170 uint16_t props=UTRIE2_GET16(&csp->trie, c); |
|
171 if(!PROPS_HAS_EXCEPTION(props)) { |
|
172 if(UCASE_GET_TYPE(props)==UCASE_LOWER) { |
|
173 c+=UCASE_GET_DELTA(props); |
|
174 } |
|
175 } else { |
|
176 const uint16_t *pe=GET_EXCEPTIONS(csp, props); |
|
177 uint16_t excWord=*pe++; |
|
178 int32_t idx; |
|
179 if(HAS_SLOT(excWord, UCASE_EXC_TITLE)) { |
|
180 idx=UCASE_EXC_TITLE; |
|
181 } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) { |
|
182 idx=UCASE_EXC_UPPER; |
|
183 } else { |
|
184 return c; |
|
185 } |
|
186 GET_SLOT_VALUE(excWord, idx, pe, c); |
|
187 } |
|
188 return c; |
|
189 } |
|
190 |
|
191 static const UChar iDot[2] = { 0x69, 0x307 }; |
|
192 static const UChar jDot[2] = { 0x6a, 0x307 }; |
|
193 static const UChar iOgonekDot[3] = { 0x12f, 0x307 }; |
|
194 static const UChar iDotGrave[3] = { 0x69, 0x307, 0x300 }; |
|
195 static const UChar iDotAcute[3] = { 0x69, 0x307, 0x301 }; |
|
196 static const UChar iDotTilde[3] = { 0x69, 0x307, 0x303 }; |
|
197 |
|
198 |
|
199 U_CFUNC void U_EXPORT2 |
|
200 ucase_addCaseClosure(const UCaseProps *csp, UChar32 c, const USetAdder *sa) { |
|
201 uint16_t props; |
|
202 |
|
203 /* |
|
204 * Hardcode the case closure of i and its relatives and ignore the |
|
205 * data file data for these characters. |
|
206 * The Turkic dotless i and dotted I with their case mapping conditions |
|
207 * and case folding option make the related characters behave specially. |
|
208 * This code matches their closure behavior to their case folding behavior. |
|
209 */ |
|
210 |
|
211 switch(c) { |
|
212 case 0x49: |
|
213 /* regular i and I are in one equivalence class */ |
|
214 sa->add(sa->set, 0x69); |
|
215 return; |
|
216 case 0x69: |
|
217 sa->add(sa->set, 0x49); |
|
218 return; |
|
219 case 0x130: |
|
220 /* dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>) */ |
|
221 sa->addString(sa->set, iDot, 2); |
|
222 return; |
|
223 case 0x131: |
|
224 /* dotless i is in a class by itself */ |
|
225 return; |
|
226 default: |
|
227 /* otherwise use the data file data */ |
|
228 break; |
|
229 } |
|
230 |
|
231 props=UTRIE2_GET16(&csp->trie, c); |
|
232 if(!PROPS_HAS_EXCEPTION(props)) { |
|
233 if(UCASE_GET_TYPE(props)!=UCASE_NONE) { |
|
234 /* add the one simple case mapping, no matter what type it is */ |
|
235 int32_t delta=UCASE_GET_DELTA(props); |
|
236 if(delta!=0) { |
|
237 sa->add(sa->set, c+delta); |
|
238 } |
|
239 } |
|
240 } else { |
|
241 /* |
|
242 * c has exceptions, so there may be multiple simple and/or |
|
243 * full case mappings. Add them all. |
|
244 */ |
|
245 const uint16_t *pe0, *pe=GET_EXCEPTIONS(csp, props); |
|
246 const UChar *closure; |
|
247 uint16_t excWord=*pe++; |
|
248 int32_t idx, closureLength, fullLength, length; |
|
249 |
|
250 pe0=pe; |
|
251 |
|
252 /* add all simple case mappings */ |
|
253 for(idx=UCASE_EXC_LOWER; idx<=UCASE_EXC_TITLE; ++idx) { |
|
254 if(HAS_SLOT(excWord, idx)) { |
|
255 pe=pe0; |
|
256 GET_SLOT_VALUE(excWord, idx, pe, c); |
|
257 sa->add(sa->set, c); |
|
258 } |
|
259 } |
|
260 |
|
261 /* get the closure string pointer & length */ |
|
262 if(HAS_SLOT(excWord, UCASE_EXC_CLOSURE)) { |
|
263 pe=pe0; |
|
264 GET_SLOT_VALUE(excWord, UCASE_EXC_CLOSURE, pe, closureLength); |
|
265 closureLength&=UCASE_CLOSURE_MAX_LENGTH; /* higher bits are reserved */ |
|
266 closure=(const UChar *)pe+1; /* behind this slot, unless there are full case mappings */ |
|
267 } else { |
|
268 closureLength=0; |
|
269 closure=NULL; |
|
270 } |
|
271 |
|
272 /* add the full case folding */ |
|
273 if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) { |
|
274 pe=pe0; |
|
275 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, fullLength); |
|
276 |
|
277 /* start of full case mapping strings */ |
|
278 ++pe; |
|
279 |
|
280 fullLength&=0xffff; /* bits 16 and higher are reserved */ |
|
281 |
|
282 /* skip the lowercase result string */ |
|
283 pe+=fullLength&UCASE_FULL_LOWER; |
|
284 fullLength>>=4; |
|
285 |
|
286 /* add the full case folding string */ |
|
287 length=fullLength&0xf; |
|
288 if(length!=0) { |
|
289 sa->addString(sa->set, (const UChar *)pe, length); |
|
290 pe+=length; |
|
291 } |
|
292 |
|
293 /* skip the uppercase and titlecase strings */ |
|
294 fullLength>>=4; |
|
295 pe+=fullLength&0xf; |
|
296 fullLength>>=4; |
|
297 pe+=fullLength; |
|
298 |
|
299 closure=(const UChar *)pe; /* behind full case mappings */ |
|
300 } |
|
301 |
|
302 /* add each code point in the closure string */ |
|
303 for(idx=0; idx<closureLength;) { |
|
304 U16_NEXT_UNSAFE(closure, idx, c); |
|
305 sa->add(sa->set, c); |
|
306 } |
|
307 } |
|
308 } |
|
309 |
|
310 /* |
|
311 * compare s, which has a length, with t, which has a maximum length or is NUL-terminated |
|
312 * must be length>0 and max>0 and length<=max |
|
313 */ |
|
314 static inline int32_t |
|
315 strcmpMax(const UChar *s, int32_t length, const UChar *t, int32_t max) { |
|
316 int32_t c1, c2; |
|
317 |
|
318 max-=length; /* we require length<=max, so no need to decrement max in the loop */ |
|
319 do { |
|
320 c1=*s++; |
|
321 c2=*t++; |
|
322 if(c2==0) { |
|
323 return 1; /* reached the end of t but not of s */ |
|
324 } |
|
325 c1-=c2; |
|
326 if(c1!=0) { |
|
327 return c1; /* return difference result */ |
|
328 } |
|
329 } while(--length>0); |
|
330 /* ends with length==0 */ |
|
331 |
|
332 if(max==0 || *t==0) { |
|
333 return 0; /* equal to length of both strings */ |
|
334 } else { |
|
335 return -max; /* return lengh difference */ |
|
336 } |
|
337 } |
|
338 |
|
339 U_CFUNC UBool U_EXPORT2 |
|
340 ucase_addStringCaseClosure(const UCaseProps *csp, const UChar *s, int32_t length, const USetAdder *sa) { |
|
341 int32_t i, start, limit, result, unfoldRows, unfoldRowWidth, unfoldStringWidth; |
|
342 |
|
343 if(csp->unfold==NULL || s==NULL) { |
|
344 return FALSE; /* no reverse case folding data, or no string */ |
|
345 } |
|
346 if(length<=1) { |
|
347 /* the string is too short to find any match */ |
|
348 /* |
|
349 * more precise would be: |
|
350 * if(!u_strHasMoreChar32Than(s, length, 1)) |
|
351 * but this does not make much practical difference because |
|
352 * a single supplementary code point would just not be found |
|
353 */ |
|
354 return FALSE; |
|
355 } |
|
356 |
|
357 const uint16_t *unfold=csp->unfold; |
|
358 unfoldRows=unfold[UCASE_UNFOLD_ROWS]; |
|
359 unfoldRowWidth=unfold[UCASE_UNFOLD_ROW_WIDTH]; |
|
360 unfoldStringWidth=unfold[UCASE_UNFOLD_STRING_WIDTH]; |
|
361 unfold+=unfoldRowWidth; |
|
362 |
|
363 if(length>unfoldStringWidth) { |
|
364 /* the string is too long to find any match */ |
|
365 return FALSE; |
|
366 } |
|
367 |
|
368 /* do a binary search for the string */ |
|
369 start=0; |
|
370 limit=unfoldRows; |
|
371 while(start<limit) { |
|
372 i=(start+limit)/2; |
|
373 const UChar *p=reinterpret_cast<const UChar *>(unfold+(i*unfoldRowWidth)); |
|
374 result=strcmpMax(s, length, p, unfoldStringWidth); |
|
375 |
|
376 if(result==0) { |
|
377 /* found the string: add each code point, and its case closure */ |
|
378 UChar32 c; |
|
379 |
|
380 for(i=unfoldStringWidth; i<unfoldRowWidth && p[i]!=0;) { |
|
381 U16_NEXT_UNSAFE(p, i, c); |
|
382 sa->add(sa->set, c); |
|
383 ucase_addCaseClosure(csp, c, sa); |
|
384 } |
|
385 return TRUE; |
|
386 } else if(result<0) { |
|
387 limit=i; |
|
388 } else /* result>0 */ { |
|
389 start=i+1; |
|
390 } |
|
391 } |
|
392 |
|
393 return FALSE; /* string not found */ |
|
394 } |
|
395 |
|
396 U_NAMESPACE_BEGIN |
|
397 |
|
/*
 * Sets up iteration over the unfold table of ucase_props_singleton.
 * The table starts with a header row from which the row count, row width and
 * string width are read; afterwards unfold is advanced by one row width so
 * that row 0 is the first row after the header.
 */
FullCaseFoldingIterator::FullCaseFoldingIterator()
        : unfold(reinterpret_cast<const UChar *>(ucase_props_singleton.unfold)),
          /*
           * NOTE(review): the next three initializers read through unfold, so
           * unfold presumably is declared before them in the class -- confirm in ucase.h.
           */
          unfoldRows(unfold[UCASE_UNFOLD_ROWS]),
          unfoldRowWidth(unfold[UCASE_UNFOLD_ROW_WIDTH]),
          unfoldStringWidth(unfold[UCASE_UNFOLD_STRING_WIDTH]),
          currentRow(0),
          /* code points of a row start right behind its string column */
          rowCpIndex(unfoldStringWidth) {
    unfold+=unfoldRowWidth;
}
|
407 |
|
408 UChar32 |
|
409 FullCaseFoldingIterator::next(UnicodeString &full) { |
|
410 // Advance past the last-delivered code point. |
|
411 const UChar *p=unfold+(currentRow*unfoldRowWidth); |
|
412 if(rowCpIndex>=unfoldRowWidth || p[rowCpIndex]==0) { |
|
413 ++currentRow; |
|
414 p+=unfoldRowWidth; |
|
415 rowCpIndex=unfoldStringWidth; |
|
416 } |
|
417 if(currentRow>=unfoldRows) { return U_SENTINEL; } |
|
418 // Set "full" to the NUL-terminated string in the first unfold column. |
|
419 int32_t length=unfoldStringWidth; |
|
420 while(length>0 && p[length-1]==0) { --length; } |
|
421 full.setTo(FALSE, p, length); |
|
422 // Return the code point. |
|
423 UChar32 c; |
|
424 U16_NEXT_UNSAFE(p, rowCpIndex, c); |
|
425 return c; |
|
426 } |
|
427 |
|
428 U_NAMESPACE_END |
|
429 |
|
430 /** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */ |
|
431 U_CAPI int32_t U_EXPORT2 |
|
432 ucase_getType(const UCaseProps *csp, UChar32 c) { |
|
433 uint16_t props=UTRIE2_GET16(&csp->trie, c); |
|
434 return UCASE_GET_TYPE(props); |
|
435 } |
|
436 |
|
437 /** @return same as ucase_getType() and set bit 2 if c is case-ignorable */ |
|
438 U_CAPI int32_t U_EXPORT2 |
|
439 ucase_getTypeOrIgnorable(const UCaseProps *csp, UChar32 c) { |
|
440 uint16_t props=UTRIE2_GET16(&csp->trie, c); |
|
441 return UCASE_GET_TYPE_AND_IGNORABLE(props); |
|
442 } |
|
443 |
|
444 /** @return UCASE_NO_DOT, UCASE_SOFT_DOTTED, UCASE_ABOVE, UCASE_OTHER_ACCENT */ |
|
445 static inline int32_t |
|
446 getDotType(const UCaseProps *csp, UChar32 c) { |
|
447 uint16_t props=UTRIE2_GET16(&csp->trie, c); |
|
448 if(!PROPS_HAS_EXCEPTION(props)) { |
|
449 return props&UCASE_DOT_MASK; |
|
450 } else { |
|
451 const uint16_t *pe=GET_EXCEPTIONS(csp, props); |
|
452 return (*pe>>UCASE_EXC_DOT_SHIFT)&UCASE_DOT_MASK; |
|
453 } |
|
454 } |
|
455 |
|
456 U_CAPI UBool U_EXPORT2 |
|
457 ucase_isSoftDotted(const UCaseProps *csp, UChar32 c) { |
|
458 return (UBool)(getDotType(csp, c)==UCASE_SOFT_DOTTED); |
|
459 } |
|
460 |
|
461 U_CAPI UBool U_EXPORT2 |
|
462 ucase_isCaseSensitive(const UCaseProps *csp, UChar32 c) { |
|
463 uint16_t props=UTRIE2_GET16(&csp->trie, c); |
|
464 return (UBool)((props&UCASE_SENSITIVE)!=0); |
|
465 } |
|
466 |
|
467 /* string casing ------------------------------------------------------------ */ |
|
468 |
|
469 /* |
|
470 * These internal functions form the core of string case mappings. |
|
471 * They map single code points to result code points or strings and take |
|
472 * all necessary conditions (context, locale ID, options) into account. |
|
473 * |
|
474 * They do not iterate over the source or write to the destination |
|
475 * so that the same functions are useful for non-standard string storage, |
|
476 * such as in a Replaceable (for Transliterator) or UTF-8/32 strings etc. |
|
477 * For the same reason, the "surrounding text" context is passed in as a |
|
478 * UCaseContextIterator which does not make any assumptions about |
|
479 * the underlying storage. |
|
480 * |
|
481 * This section contains helper functions that check for conditions |
|
482 * in the input text surrounding the current code point |
|
483 * according to SpecialCasing.txt. |
|
484 * |
|
485 * Each helper function gets the index |
|
486 * - after the current code point if it looks at following text |
|
487 * - before the current code point if it looks at preceding text |
|
488 * |
|
489 * Unicode 3.2 UAX 21 "Case Mappings" defines the conditions as follows: |
|
490 * |
|
491 * Final_Sigma |
|
492 * C is preceded by a sequence consisting of |
|
493 * a cased letter and a case-ignorable sequence, |
|
494 * and C is not followed by a sequence consisting of |
|
495 * an ignorable sequence and then a cased letter. |
|
496 * |
|
497 * More_Above |
|
498 * C is followed by one or more characters of combining class 230 (ABOVE) |
|
499 * in the combining character sequence. |
|
500 * |
|
501 * After_Soft_Dotted |
|
502 * The last preceding character with combining class of zero before C |
|
503 * was Soft_Dotted, |
|
504 * and there is no intervening combining character class 230 (ABOVE). |
|
505 * |
|
506 * Before_Dot |
|
507 * C is followed by combining dot above (U+0307). |
|
508 * Any sequence of characters with a combining class that is neither 0 nor 230 |
|
509 * may intervene between the current character and the combining dot above. |
|
510 * |
|
511 * The erratum from 2002-10-31 adds the condition |
|
512 * |
|
513 * After_I |
|
514 * The last preceding base character was an uppercase I, and there is no |
|
515 * intervening combining character class 230 (ABOVE). |
|
516 * |
|
517 * (See Jitterbug 2344 and the comments on After_I below.) |
|
518 * |
|
519 * Helper definitions in Unicode 3.2 UAX 21: |
|
520 * |
|
521 * D1. A character C is defined to be cased |
|
522 * if it meets any of the following criteria: |
|
523 * |
|
524 * - The general category of C is Titlecase Letter (Lt) |
|
525 * - In [CoreProps], C has one of the properties Uppercase, or Lowercase |
|
526 * - Given D = NFD(C), then it is not the case that: |
|
527 * D = UCD_lower(D) = UCD_upper(D) = UCD_title(D) |
|
528 * (This third criterion does not add any characters to the list |
|
529 * for Unicode 3.2. Ignored.) |
|
530 * |
|
531 * D2. A character C is defined to be case-ignorable |
|
532 * if it meets either of the following criteria: |
|
533 * |
|
534 * - The general category of C is |
|
535 * Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or |
|
536 * Letter Modifier (Lm), or Symbol Modifier (Sk) |
|
537 * - C is one of the following characters |
|
538 * U+0027 APOSTROPHE |
|
539 * U+00AD SOFT HYPHEN (SHY) |
|
540 * U+2019 RIGHT SINGLE QUOTATION MARK |
|
541 * (the preferred character for apostrophe) |
|
542 * |
|
543 * D3. A case-ignorable sequence is a sequence of |
|
544 * zero or more case-ignorable characters. |
|
545 */ |
|
546 |
|
/*
 * Case-insensitive tests for single ASCII letters of a locale ID.
 * These are inline functions rather than macros so that the argument is
 * evaluated exactly once (e.g. is_t(*p++) advances p once).
 * Each returns nonzero for a match and 0 otherwise.
 */
static inline int is_a(char c) { return c=='a' || c=='A'; }
static inline int is_d(char c) { return c=='d' || c=='D'; }
static inline int is_e(char c) { return c=='e' || c=='E'; }
static inline int is_i(char c) { return c=='i' || c=='I'; }
static inline int is_l(char c) { return c=='l' || c=='L'; }
static inline int is_n(char c) { return c=='n' || c=='N'; }
static inline int is_r(char c) { return c=='r' || c=='R'; }
static inline int is_t(char c) { return c=='t' || c=='T'; }
static inline int is_u(char c) { return c=='u' || c=='U'; }
static inline int is_z(char c) { return c=='z' || c=='Z'; }

/* separator? ('_', '-', or the terminating NUL of the locale ID) */
static inline int is_sep(char c) { return c=='_' || c=='-' || c==0; }
|
560 |
|
561 /** |
|
562 * Requires non-NULL locale ID but otherwise does the equivalent of |
|
563 * checking for language codes as if uloc_getLanguage() were called: |
|
564 * Accepts both 2- and 3-letter codes and accepts case variants. |
|
565 */ |
|
566 U_CFUNC int32_t |
|
567 ucase_getCaseLocale(const char *locale, int32_t *locCache) { |
|
568 int32_t result; |
|
569 char c; |
|
570 |
|
571 if(locCache!=NULL && (result=*locCache)!=UCASE_LOC_UNKNOWN) { |
|
572 return result; |
|
573 } |
|
574 |
|
575 result=UCASE_LOC_ROOT; |
|
576 |
|
577 /* |
|
578 * This function used to use uloc_getLanguage(), but the current code |
|
579 * removes the dependency of this low-level code on uloc implementation code |
|
580 * and is faster because not the whole locale ID has to be |
|
581 * examined and copied/transformed. |
|
582 * |
|
583 * Because this code does not want to depend on uloc, the caller must |
|
584 * pass in a non-NULL locale, i.e., may need to call uloc_getDefault(). |
|
585 */ |
|
586 c=*locale++; |
|
587 if(is_t(c)) { |
|
588 /* tr or tur? */ |
|
589 c=*locale++; |
|
590 if(is_u(c)) { |
|
591 c=*locale++; |
|
592 } |
|
593 if(is_r(c)) { |
|
594 c=*locale; |
|
595 if(is_sep(c)) { |
|
596 result=UCASE_LOC_TURKISH; |
|
597 } |
|
598 } |
|
599 } else if(is_a(c)) { |
|
600 /* az or aze? */ |
|
601 c=*locale++; |
|
602 if(is_z(c)) { |
|
603 c=*locale++; |
|
604 if(is_e(c)) { |
|
605 c=*locale; |
|
606 } |
|
607 if(is_sep(c)) { |
|
608 result=UCASE_LOC_TURKISH; |
|
609 } |
|
610 } |
|
611 } else if(is_l(c)) { |
|
612 /* lt or lit? */ |
|
613 c=*locale++; |
|
614 if(is_i(c)) { |
|
615 c=*locale++; |
|
616 } |
|
617 if(is_t(c)) { |
|
618 c=*locale; |
|
619 if(is_sep(c)) { |
|
620 result=UCASE_LOC_LITHUANIAN; |
|
621 } |
|
622 } |
|
623 } else if(is_n(c)) { |
|
624 /* nl or nld? */ |
|
625 c=*locale++; |
|
626 if(is_l(c)) { |
|
627 c=*locale++; |
|
628 if(is_d(c)) { |
|
629 c=*locale; |
|
630 } |
|
631 if(is_sep(c)) { |
|
632 result=UCASE_LOC_DUTCH; |
|
633 } |
|
634 } |
|
635 } |
|
636 |
|
637 if(locCache!=NULL) { |
|
638 *locCache=result; |
|
639 } |
|
640 return result; |
|
641 } |
|
642 |
|
643 /* |
|
644 * Is followed by |
|
645 * {case-ignorable}* cased |
|
646 * ? |
|
647 * (dir determines looking forward/backward) |
|
648 * If a character is case-ignorable, it is skipped regardless of whether |
|
649 * it is also cased or not. |
|
650 */ |
|
651 static UBool |
|
652 isFollowedByCasedLetter(const UCaseProps *csp, UCaseContextIterator *iter, void *context, int8_t dir) { |
|
653 UChar32 c; |
|
654 |
|
655 if(iter==NULL) { |
|
656 return FALSE; |
|
657 } |
|
658 |
|
659 for(/* dir!=0 sets direction */; (c=iter(context, dir))>=0; dir=0) { |
|
660 int32_t type=ucase_getTypeOrIgnorable(csp, c); |
|
661 if(type&4) { |
|
662 /* case-ignorable, continue with the loop */ |
|
663 } else if(type!=UCASE_NONE) { |
|
664 return TRUE; /* followed by cased letter */ |
|
665 } else { |
|
666 return FALSE; /* uncased and not case-ignorable */ |
|
667 } |
|
668 } |
|
669 |
|
670 return FALSE; /* not followed by cased letter */ |
|
671 } |
|
672 |
|
673 /* Is preceded by Soft_Dotted character with no intervening cc=230 ? */ |
|
674 static UBool |
|
675 isPrecededBySoftDotted(const UCaseProps *csp, UCaseContextIterator *iter, void *context) { |
|
676 UChar32 c; |
|
677 int32_t dotType; |
|
678 int8_t dir; |
|
679 |
|
680 if(iter==NULL) { |
|
681 return FALSE; |
|
682 } |
|
683 |
|
684 for(dir=-1; (c=iter(context, dir))>=0; dir=0) { |
|
685 dotType=getDotType(csp, c); |
|
686 if(dotType==UCASE_SOFT_DOTTED) { |
|
687 return TRUE; /* preceded by TYPE_i */ |
|
688 } else if(dotType!=UCASE_OTHER_ACCENT) { |
|
689 return FALSE; /* preceded by different base character (not TYPE_i), or intervening cc==230 */ |
|
690 } |
|
691 } |
|
692 |
|
693 return FALSE; /* not preceded by TYPE_i */ |
|
694 } |
|
695 |
|
696 /* |
|
697 * See Jitterbug 2344: |
|
698 * The condition After_I for Turkic-lowercasing of U+0307 combining dot above |
|
699 * is checked in ICU 2.0, 2.1, 2.6 but was not in 2.2 & 2.4 because |
|
700 * we made those releases compatible with Unicode 3.2 which had not fixed |
|
701 * a related bug in SpecialCasing.txt. |
|
702 * |
|
703 * From the Jitterbug 2344 text: |
|
704 * ... this bug is listed as a Unicode erratum |
|
705 * from 2002-10-31 at http://www.unicode.org/uni2errata/UnicodeErrata.html |
|
706 * <quote> |
|
707 * There are two errors in SpecialCasing.txt. |
|
708 * 1. Missing semicolons on two lines. ... [irrelevant for ICU] |
|
709 * 2. An incorrect context definition. Correct as follows: |
|
710 * < 0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE |
|
711 * < 0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE |
|
712 * --- |
|
713 * > 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE |
|
714 * > 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE |
|
715 * where the context After_I is defined as: |
|
716 * The last preceding base character was an uppercase I, and there is no |
|
717 * intervening combining character class 230 (ABOVE). |
|
718 * </quote> |
|
719 * |
|
720 * Note that SpecialCasing.txt even in Unicode 3.2 described the condition as: |
|
721 * |
|
722 * # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i. |
|
723 * # This matches the behavior of the canonically equivalent I-dot_above |
|
724 * |
|
725 * See also the description in this place in older versions of uchar.c (revision 1.100). |
|
726 * |
|
727 * Markus W. Scherer 2003-feb-15 |
|
728 */ |
|
729 |
|
730 /* Is preceded by base character 'I' with no intervening cc=230 ? */ |
|
731 static UBool |
|
732 isPrecededBy_I(const UCaseProps *csp, UCaseContextIterator *iter, void *context) { |
|
733 UChar32 c; |
|
734 int32_t dotType; |
|
735 int8_t dir; |
|
736 |
|
737 if(iter==NULL) { |
|
738 return FALSE; |
|
739 } |
|
740 |
|
741 for(dir=-1; (c=iter(context, dir))>=0; dir=0) { |
|
742 if(c==0x49) { |
|
743 return TRUE; /* preceded by I */ |
|
744 } |
|
745 dotType=getDotType(csp, c); |
|
746 if(dotType!=UCASE_OTHER_ACCENT) { |
|
747 return FALSE; /* preceded by different base character (not I), or intervening cc==230 */ |
|
748 } |
|
749 } |
|
750 |
|
751 return FALSE; /* not preceded by I */ |
|
752 } |
|
753 |
|
754 /* Is followed by one or more cc==230 ? */ |
|
755 static UBool |
|
756 isFollowedByMoreAbove(const UCaseProps *csp, UCaseContextIterator *iter, void *context) { |
|
757 UChar32 c; |
|
758 int32_t dotType; |
|
759 int8_t dir; |
|
760 |
|
761 if(iter==NULL) { |
|
762 return FALSE; |
|
763 } |
|
764 |
|
765 for(dir=1; (c=iter(context, dir))>=0; dir=0) { |
|
766 dotType=getDotType(csp, c); |
|
767 if(dotType==UCASE_ABOVE) { |
|
768 return TRUE; /* at least one cc==230 following */ |
|
769 } else if(dotType!=UCASE_OTHER_ACCENT) { |
|
770 return FALSE; /* next base character, no more cc==230 following */ |
|
771 } |
|
772 } |
|
773 |
|
774 return FALSE; /* no more cc==230 following */ |
|
775 } |
|
776 |
|
777 /* Is followed by a dot above (without cc==230 in between) ? */ |
|
778 static UBool |
|
779 isFollowedByDotAbove(const UCaseProps *csp, UCaseContextIterator *iter, void *context) { |
|
780 UChar32 c; |
|
781 int32_t dotType; |
|
782 int8_t dir; |
|
783 |
|
784 if(iter==NULL) { |
|
785 return FALSE; |
|
786 } |
|
787 |
|
788 for(dir=1; (c=iter(context, dir))>=0; dir=0) { |
|
789 if(c==0x307) { |
|
790 return TRUE; |
|
791 } |
|
792 dotType=getDotType(csp, c); |
|
793 if(dotType!=UCASE_OTHER_ACCENT) { |
|
794 return FALSE; /* next base character or cc==230 in between */ |
|
795 } |
|
796 } |
|
797 |
|
798 return FALSE; /* no dot above following */ |
|
799 } |
|
800 |
|
801 U_CAPI int32_t U_EXPORT2 |
|
802 ucase_toFullLower(const UCaseProps *csp, UChar32 c, |
|
803 UCaseContextIterator *iter, void *context, |
|
804 const UChar **pString, |
|
805 const char *locale, int32_t *locCache) |
|
806 { |
|
807 UChar32 result=c; |
|
808 uint16_t props=UTRIE2_GET16(&csp->trie, c); |
|
809 if(!PROPS_HAS_EXCEPTION(props)) { |
|
810 if(UCASE_GET_TYPE(props)>=UCASE_UPPER) { |
|
811 result=c+UCASE_GET_DELTA(props); |
|
812 } |
|
813 } else { |
|
814 const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2; |
|
815 uint16_t excWord=*pe++; |
|
816 int32_t full; |
|
817 |
|
818 pe2=pe; |
|
819 |
|
820 if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) { |
|
821 /* use hardcoded conditions and mappings */ |
|
822 int32_t loc=ucase_getCaseLocale(locale, locCache); |
|
823 |
|
824 /* |
|
825 * Test for conditional mappings first |
|
826 * (otherwise the unconditional default mappings are always taken), |
|
827 * then test for characters that have unconditional mappings in SpecialCasing.txt, |
|
828 * then get the UnicodeData.txt mappings. |
|
829 */ |
|
830 if( loc==UCASE_LOC_LITHUANIAN && |
|
831 /* base characters, find accents above */ |
|
832 (((c==0x49 || c==0x4a || c==0x12e) && |
|
833 isFollowedByMoreAbove(csp, iter, context)) || |
|
834 /* precomposed with accent above, no need to find one */ |
|
835 (c==0xcc || c==0xcd || c==0x128)) |
|
836 ) { |
|
837 /* |
|
838 # Lithuanian |
|
839 |
|
840 # Lithuanian retains the dot in a lowercase i when followed by accents. |
|
841 |
|
842 # Introduce an explicit dot above when lowercasing capital I's and J's |
|
843 # whenever there are more accents above. |
|
844 # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek) |
|
845 |
|
846 0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I |
|
847 004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J |
|
848 012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK |
|
849 00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE |
|
850 00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE |
|
851 0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE |
|
852 */ |
|
853 switch(c) { |
|
854 case 0x49: /* LATIN CAPITAL LETTER I */ |
|
855 *pString=iDot; |
|
856 return 2; |
|
857 case 0x4a: /* LATIN CAPITAL LETTER J */ |
|
858 *pString=jDot; |
|
859 return 2; |
|
860 case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */ |
|
861 *pString=iOgonekDot; |
|
862 return 2; |
|
863 case 0xcc: /* LATIN CAPITAL LETTER I WITH GRAVE */ |
|
864 *pString=iDotGrave; |
|
865 return 3; |
|
866 case 0xcd: /* LATIN CAPITAL LETTER I WITH ACUTE */ |
|
867 *pString=iDotAcute; |
|
868 return 3; |
|
869 case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */ |
|
870 *pString=iDotTilde; |
|
871 return 3; |
|
872 default: |
|
873 return 0; /* will not occur */ |
|
874 } |
|
875 /* # Turkish and Azeri */ |
|
876 } else if(loc==UCASE_LOC_TURKISH && c==0x130) { |
|
877 /* |
|
878 # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri |
|
879 # The following rules handle those cases. |
|
880 |
|
881 0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE |
|
882 0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE |
|
883 */ |
|
884 return 0x69; |
|
885 } else if(loc==UCASE_LOC_TURKISH && c==0x307 && isPrecededBy_I(csp, iter, context)) { |
|
886 /* |
|
887 # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i. |
|
888 # This matches the behavior of the canonically equivalent I-dot_above |
|
889 |
|
890 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE |
|
891 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE |
|
892 */ |
|
893 return 0; /* remove the dot (continue without output) */ |
|
894 } else if(loc==UCASE_LOC_TURKISH && c==0x49 && !isFollowedByDotAbove(csp, iter, context)) { |
|
895 /* |
|
896 # When lowercasing, unless an I is before a dot_above, it turns into a dotless i. |
|
897 |
|
898 0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I |
|
899 0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I |
|
900 */ |
|
901 return 0x131; |
|
902 } else if(c==0x130) { |
|
903 /* |
|
904 # Preserve canonical equivalence for I with dot. Turkic is handled below. |
|
905 |
|
906 0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE |
|
907 */ |
|
908 *pString=iDot; |
|
909 return 2; |
|
910 } else if( c==0x3a3 && |
|
911 !isFollowedByCasedLetter(csp, iter, context, 1) && |
|
912 isFollowedByCasedLetter(csp, iter, context, -1) /* -1=preceded */ |
|
913 ) { |
|
914 /* greek capital sigma maps depending on surrounding cased letters (see SpecialCasing.txt) */ |
|
915 /* |
|
916 # Special case for final form of sigma |
|
917 |
|
918 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA |
|
919 */ |
|
920 return 0x3c2; /* greek small final sigma */ |
|
921 } else { |
|
922 /* no known conditional special case mapping, use a normal mapping */ |
|
923 } |
|
924 } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) { |
|
925 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full); |
|
926 full&=UCASE_FULL_LOWER; |
|
927 if(full!=0) { |
|
928 /* set the output pointer to the lowercase mapping */ |
|
929 *pString=reinterpret_cast<const UChar *>(pe+1); |
|
930 |
|
931 /* return the string length */ |
|
932 return full; |
|
933 } |
|
934 } |
|
935 |
|
936 if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) { |
|
937 GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe2, result); |
|
938 } |
|
939 } |
|
940 |
|
941 return (result==c) ? ~result : result; |
|
942 } |
|
943 |
|
944 /* internal */ |
|
945 static int32_t |
|
946 toUpperOrTitle(const UCaseProps *csp, UChar32 c, |
|
947 UCaseContextIterator *iter, void *context, |
|
948 const UChar **pString, |
|
949 const char *locale, int32_t *locCache, |
|
950 UBool upperNotTitle) { |
|
951 UChar32 result=c; |
|
952 uint16_t props=UTRIE2_GET16(&csp->trie, c); |
|
953 if(!PROPS_HAS_EXCEPTION(props)) { |
|
954 if(UCASE_GET_TYPE(props)==UCASE_LOWER) { |
|
955 result=c+UCASE_GET_DELTA(props); |
|
956 } |
|
957 } else { |
|
958 const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2; |
|
959 uint16_t excWord=*pe++; |
|
960 int32_t full, idx; |
|
961 |
|
962 pe2=pe; |
|
963 |
|
964 if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) { |
|
965 /* use hardcoded conditions and mappings */ |
|
966 int32_t loc=ucase_getCaseLocale(locale, locCache); |
|
967 |
|
968 if(loc==UCASE_LOC_TURKISH && c==0x69) { |
|
969 /* |
|
970 # Turkish and Azeri |
|
971 |
|
972 # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri |
|
973 # The following rules handle those cases. |
|
974 |
|
975 # When uppercasing, i turns into a dotted capital I |
|
976 |
|
977 0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I |
|
978 0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I |
|
979 */ |
|
980 return 0x130; |
|
981 } else if(loc==UCASE_LOC_LITHUANIAN && c==0x307 && isPrecededBySoftDotted(csp, iter, context)) { |
|
982 /* |
|
983 # Lithuanian |
|
984 |
|
985 # Lithuanian retains the dot in a lowercase i when followed by accents. |
|
986 |
|
987 # Remove DOT ABOVE after "i" with upper or titlecase |
|
988 |
|
989 0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE |
|
990 */ |
|
991 return 0; /* remove the dot (continue without output) */ |
|
992 } else { |
|
993 /* no known conditional special case mapping, use a normal mapping */ |
|
994 } |
|
995 } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) { |
|
996 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full); |
|
997 |
|
998 /* start of full case mapping strings */ |
|
999 ++pe; |
|
1000 |
|
1001 /* skip the lowercase and case-folding result strings */ |
|
1002 pe+=full&UCASE_FULL_LOWER; |
|
1003 full>>=4; |
|
1004 pe+=full&0xf; |
|
1005 full>>=4; |
|
1006 |
|
1007 if(upperNotTitle) { |
|
1008 full&=0xf; |
|
1009 } else { |
|
1010 /* skip the uppercase result string */ |
|
1011 pe+=full&0xf; |
|
1012 full=(full>>4)&0xf; |
|
1013 } |
|
1014 |
|
1015 if(full!=0) { |
|
1016 /* set the output pointer to the result string */ |
|
1017 *pString=reinterpret_cast<const UChar *>(pe); |
|
1018 |
|
1019 /* return the string length */ |
|
1020 return full; |
|
1021 } |
|
1022 } |
|
1023 |
|
1024 if(!upperNotTitle && HAS_SLOT(excWord, UCASE_EXC_TITLE)) { |
|
1025 idx=UCASE_EXC_TITLE; |
|
1026 } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) { |
|
1027 /* here, titlecase is same as uppercase */ |
|
1028 idx=UCASE_EXC_UPPER; |
|
1029 } else { |
|
1030 return ~c; |
|
1031 } |
|
1032 GET_SLOT_VALUE(excWord, idx, pe2, result); |
|
1033 } |
|
1034 |
|
1035 return (result==c) ? ~result : result; |
|
1036 } |
|
1037 |
|
1038 U_CAPI int32_t U_EXPORT2 |
|
1039 ucase_toFullUpper(const UCaseProps *csp, UChar32 c, |
|
1040 UCaseContextIterator *iter, void *context, |
|
1041 const UChar **pString, |
|
1042 const char *locale, int32_t *locCache) { |
|
1043 return toUpperOrTitle(csp, c, iter, context, pString, locale, locCache, TRUE); |
|
1044 } |
|
1045 |
|
1046 U_CAPI int32_t U_EXPORT2 |
|
1047 ucase_toFullTitle(const UCaseProps *csp, UChar32 c, |
|
1048 UCaseContextIterator *iter, void *context, |
|
1049 const UChar **pString, |
|
1050 const char *locale, int32_t *locCache) { |
|
1051 return toUpperOrTitle(csp, c, iter, context, pString, locale, locCache, FALSE); |
|
1052 } |
|
1053 |
|
1054 /* case folding ------------------------------------------------------------- */ |
|
1055 |
|
1056 /* |
|
1057 * Case folding is similar to lowercasing. |
|
1058 * The result may be a simple mapping, i.e., a single code point, or |
|
1059 * a full mapping, i.e., a string. |
|
1060 * If the case folding for a code point is the same as its simple (1:1) lowercase mapping, |
|
1061 * then only the lowercase mapping is stored. |
|
1062 * |
|
1063 * Some special cases are hardcoded because their conditions cannot be |
|
1064 * parsed and processed from CaseFolding.txt. |
|
1065 * |
|
1066 * Unicode 3.2 CaseFolding.txt specifies for its status field: |
|
1067 |
|
1068 # C: common case folding, common mappings shared by both simple and full mappings. |
|
1069 # F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces. |
|
1070 # S: simple case folding, mappings to single characters where different from F. |
|
1071 # T: special case for uppercase I and dotted uppercase I |
|
1072 # - For non-Turkic languages, this mapping is normally not used. |
|
1073 # - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters. |
|
1074 # |
|
1075 # Usage: |
|
1076 # A. To do a simple case folding, use the mappings with status C + S. |
|
1077 # B. To do a full case folding, use the mappings with status C + F. |
|
1078 # |
|
1079 # The mappings with status T can be used or omitted depending on the desired case-folding |
|
1080 # behavior. (The default option is to exclude them.) |
|
1081 |
|
1082 * Unicode 3.2 has 'T' mappings as follows: |
|
1083 |
|
1084 0049; T; 0131; # LATIN CAPITAL LETTER I |
|
1085 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE |
|
1086 |
|
1087 * while the default mappings for these code points are: |
|
1088 |
|
1089 0049; C; 0069; # LATIN CAPITAL LETTER I |
|
1090 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE |
|
1091 |
|
1092 * U+0130 has no simple case folding (simple-case-folds to itself). |
|
1093 */ |
|
1094 |
|
1095 /* return the simple case folding mapping for c */ |
|
1096 U_CAPI UChar32 U_EXPORT2 |
|
1097 ucase_fold(const UCaseProps *csp, UChar32 c, uint32_t options) { |
|
1098 uint16_t props=UTRIE2_GET16(&csp->trie, c); |
|
1099 if(!PROPS_HAS_EXCEPTION(props)) { |
|
1100 if(UCASE_GET_TYPE(props)>=UCASE_UPPER) { |
|
1101 c+=UCASE_GET_DELTA(props); |
|
1102 } |
|
1103 } else { |
|
1104 const uint16_t *pe=GET_EXCEPTIONS(csp, props); |
|
1105 uint16_t excWord=*pe++; |
|
1106 int32_t idx; |
|
1107 if(excWord&UCASE_EXC_CONDITIONAL_FOLD) { |
|
1108 /* special case folding mappings, hardcoded */ |
|
1109 if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) { |
|
1110 /* default mappings */ |
|
1111 if(c==0x49) { |
|
1112 /* 0049; C; 0069; # LATIN CAPITAL LETTER I */ |
|
1113 return 0x69; |
|
1114 } else if(c==0x130) { |
|
1115 /* no simple case folding for U+0130 */ |
|
1116 return c; |
|
1117 } |
|
1118 } else { |
|
1119 /* Turkic mappings */ |
|
1120 if(c==0x49) { |
|
1121 /* 0049; T; 0131; # LATIN CAPITAL LETTER I */ |
|
1122 return 0x131; |
|
1123 } else if(c==0x130) { |
|
1124 /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */ |
|
1125 return 0x69; |
|
1126 } |
|
1127 } |
|
1128 } |
|
1129 if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) { |
|
1130 idx=UCASE_EXC_FOLD; |
|
1131 } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) { |
|
1132 idx=UCASE_EXC_LOWER; |
|
1133 } else { |
|
1134 return c; |
|
1135 } |
|
1136 GET_SLOT_VALUE(excWord, idx, pe, c); |
|
1137 } |
|
1138 return c; |
|
1139 } |
|
1140 |
|
1141 /* |
|
1142 * Issue for canonical caseless match (UAX #21): |
|
1143 * Turkic casefolding (using "T" mappings in CaseFolding.txt) does not preserve |
|
1144 * canonical equivalence, unlike default-option casefolding. |
|
1145 * For example, I-grave and I + grave fold to strings that are not canonically |
|
1146 * equivalent. |
|
1147 * For more details, see the comment in unorm_compare() in unorm.cpp |
|
1148 * and the intermediate prototype changes for Jitterbug 2021. |
|
1149 * (For example, revision 1.104 of uchar.c and 1.4 of CaseFolding.txt.) |
|
1150 * |
|
1151 * This did not get fixed because it appears that it is not possible to fix |
|
1152 * it for uppercase and lowercase characters (I-grave vs. i-grave) |
|
1153 * together in a way that they still fold to common result strings. |
|
1154 */ |
|
1155 |
|
1156 U_CAPI int32_t U_EXPORT2 |
|
1157 ucase_toFullFolding(const UCaseProps *csp, UChar32 c, |
|
1158 const UChar **pString, |
|
1159 uint32_t options) |
|
1160 { |
|
1161 UChar32 result=c; |
|
1162 uint16_t props=UTRIE2_GET16(&csp->trie, c); |
|
1163 if(!PROPS_HAS_EXCEPTION(props)) { |
|
1164 if(UCASE_GET_TYPE(props)>=UCASE_UPPER) { |
|
1165 result=c+UCASE_GET_DELTA(props); |
|
1166 } |
|
1167 } else { |
|
1168 const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2; |
|
1169 uint16_t excWord=*pe++; |
|
1170 int32_t full, idx; |
|
1171 |
|
1172 pe2=pe; |
|
1173 |
|
1174 if(excWord&UCASE_EXC_CONDITIONAL_FOLD) { |
|
1175 /* use hardcoded conditions and mappings */ |
|
1176 if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) { |
|
1177 /* default mappings */ |
|
1178 if(c==0x49) { |
|
1179 /* 0049; C; 0069; # LATIN CAPITAL LETTER I */ |
|
1180 return 0x69; |
|
1181 } else if(c==0x130) { |
|
1182 /* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */ |
|
1183 *pString=iDot; |
|
1184 return 2; |
|
1185 } |
|
1186 } else { |
|
1187 /* Turkic mappings */ |
|
1188 if(c==0x49) { |
|
1189 /* 0049; T; 0131; # LATIN CAPITAL LETTER I */ |
|
1190 return 0x131; |
|
1191 } else if(c==0x130) { |
|
1192 /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */ |
|
1193 return 0x69; |
|
1194 } |
|
1195 } |
|
1196 } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) { |
|
1197 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full); |
|
1198 |
|
1199 /* start of full case mapping strings */ |
|
1200 ++pe; |
|
1201 |
|
1202 /* skip the lowercase result string */ |
|
1203 pe+=full&UCASE_FULL_LOWER; |
|
1204 full=(full>>4)&0xf; |
|
1205 |
|
1206 if(full!=0) { |
|
1207 /* set the output pointer to the result string */ |
|
1208 *pString=reinterpret_cast<const UChar *>(pe); |
|
1209 |
|
1210 /* return the string length */ |
|
1211 return full; |
|
1212 } |
|
1213 } |
|
1214 |
|
1215 if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) { |
|
1216 idx=UCASE_EXC_FOLD; |
|
1217 } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) { |
|
1218 idx=UCASE_EXC_LOWER; |
|
1219 } else { |
|
1220 return ~c; |
|
1221 } |
|
1222 GET_SLOT_VALUE(excWord, idx, pe2, result); |
|
1223 } |
|
1224 |
|
1225 return (result==c) ? ~result : result; |
|
1226 } |
|
1227 |
|
1228 /* case mapping properties API ---------------------------------------------- */ |
|
1229 |
|
1230 #define GET_CASE_PROPS() &ucase_props_singleton |
|
1231 |
|
1232 /* public API (see uchar.h) */ |
|
1233 |
|
1234 U_CAPI UBool U_EXPORT2 |
|
1235 u_isULowercase(UChar32 c) { |
|
1236 return (UBool)(UCASE_LOWER==ucase_getType(GET_CASE_PROPS(), c)); |
|
1237 } |
|
1238 |
|
1239 U_CAPI UBool U_EXPORT2 |
|
1240 u_isUUppercase(UChar32 c) { |
|
1241 return (UBool)(UCASE_UPPER==ucase_getType(GET_CASE_PROPS(), c)); |
|
1242 } |
|
1243 |
|
1244 /* Transforms the Unicode character to its lower case equivalent.*/ |
|
1245 U_CAPI UChar32 U_EXPORT2 |
|
1246 u_tolower(UChar32 c) { |
|
1247 return ucase_tolower(GET_CASE_PROPS(), c); |
|
1248 } |
|
1249 |
|
1250 /* Transforms the Unicode character to its upper case equivalent.*/ |
|
1251 U_CAPI UChar32 U_EXPORT2 |
|
1252 u_toupper(UChar32 c) { |
|
1253 return ucase_toupper(GET_CASE_PROPS(), c); |
|
1254 } |
|
1255 |
|
1256 /* Transforms the Unicode character to its title case equivalent.*/ |
|
1257 U_CAPI UChar32 U_EXPORT2 |
|
1258 u_totitle(UChar32 c) { |
|
1259 return ucase_totitle(GET_CASE_PROPS(), c); |
|
1260 } |
|
1261 |
|
1262 /* return the simple case folding mapping for c */ |
|
1263 U_CAPI UChar32 U_EXPORT2 |
|
1264 u_foldCase(UChar32 c, uint32_t options) { |
|
1265 return ucase_fold(GET_CASE_PROPS(), c, options); |
|
1266 } |
|
1267 |
|
1268 U_CFUNC int32_t U_EXPORT2 |
|
1269 ucase_hasBinaryProperty(UChar32 c, UProperty which) { |
|
1270 /* case mapping properties */ |
|
1271 const UChar *resultString; |
|
1272 int32_t locCache; |
|
1273 const UCaseProps *csp=GET_CASE_PROPS(); |
|
1274 if(csp==NULL) { |
|
1275 return FALSE; |
|
1276 } |
|
1277 switch(which) { |
|
1278 case UCHAR_LOWERCASE: |
|
1279 return (UBool)(UCASE_LOWER==ucase_getType(csp, c)); |
|
1280 case UCHAR_UPPERCASE: |
|
1281 return (UBool)(UCASE_UPPER==ucase_getType(csp, c)); |
|
1282 case UCHAR_SOFT_DOTTED: |
|
1283 return ucase_isSoftDotted(csp, c); |
|
1284 case UCHAR_CASE_SENSITIVE: |
|
1285 return ucase_isCaseSensitive(csp, c); |
|
1286 case UCHAR_CASED: |
|
1287 return (UBool)(UCASE_NONE!=ucase_getType(csp, c)); |
|
1288 case UCHAR_CASE_IGNORABLE: |
|
1289 return (UBool)(ucase_getTypeOrIgnorable(csp, c)>>2); |
|
1290 /* |
|
1291 * Note: The following Changes_When_Xyz are defined as testing whether |
|
1292 * the NFD form of the input changes when Xyz-case-mapped. |
|
1293 * However, this simpler implementation of these properties, |
|
1294 * ignoring NFD, passes the tests. |
|
1295 * The implementation needs to be changed if the tests start failing. |
|
1296 * When that happens, optimizations should be used to work with the |
|
1297 * per-single-code point ucase_toFullXyz() functions unless |
|
1298 * the NFD form has more than one code point, |
|
1299 * and the property starts set needs to be the union of the |
|
1300 * start sets for normalization and case mappings. |
|
1301 */ |
|
1302 case UCHAR_CHANGES_WHEN_LOWERCASED: |
|
1303 locCache=UCASE_LOC_ROOT; |
|
1304 return (UBool)(ucase_toFullLower(csp, c, NULL, NULL, &resultString, "", &locCache)>=0); |
|
1305 case UCHAR_CHANGES_WHEN_UPPERCASED: |
|
1306 locCache=UCASE_LOC_ROOT; |
|
1307 return (UBool)(ucase_toFullUpper(csp, c, NULL, NULL, &resultString, "", &locCache)>=0); |
|
1308 case UCHAR_CHANGES_WHEN_TITLECASED: |
|
1309 locCache=UCASE_LOC_ROOT; |
|
1310 return (UBool)(ucase_toFullTitle(csp, c, NULL, NULL, &resultString, "", &locCache)>=0); |
|
1311 /* case UCHAR_CHANGES_WHEN_CASEFOLDED: -- in uprops.c */ |
|
1312 case UCHAR_CHANGES_WHEN_CASEMAPPED: |
|
1313 locCache=UCASE_LOC_ROOT; |
|
1314 return (UBool)( |
|
1315 ucase_toFullLower(csp, c, NULL, NULL, &resultString, "", &locCache)>=0 || |
|
1316 ucase_toFullUpper(csp, c, NULL, NULL, &resultString, "", &locCache)>=0 || |
|
1317 ucase_toFullTitle(csp, c, NULL, NULL, &resultString, "", &locCache)>=0); |
|
1318 default: |
|
1319 return FALSE; |
|
1320 } |
|
1321 } |