|
1 /* |
|
2 ******************************************************************************* |
|
3 * |
|
4 * Copyright (C) 2001-2011, International Business Machines |
|
5 * Corporation and others. All Rights Reserved. |
|
6 * |
|
7 ******************************************************************************* |
|
8 * file name: ustrcase.cpp |
|
9 * encoding: US-ASCII |
|
10 * tab size: 8 (not used) |
|
11 * indentation:4 |
|
12 * |
|
13 * created on: 2002feb20 |
|
14 * created by: Markus W. Scherer |
|
15 * |
|
16 * Implementation file for string casing C API functions. |
|
17 * Uses functions from uchar.c for basic functionality that requires access |
|
18 * to the Unicode Character Database (uprops.dat). |
|
19 */ |
|
20 |
|
21 #include "unicode/utypes.h" |
|
22 #include "unicode/brkiter.h" |
|
23 #include "unicode/ustring.h" |
|
24 #include "unicode/ucasemap.h" |
|
25 #include "unicode/ubrk.h" |
|
26 #include "unicode/utf.h" |
|
27 #include "unicode/utf16.h" |
|
28 #include "cmemory.h" |
|
29 #include "ucase.h" |
|
30 #include "ustr_imp.h" |
|
31 |
|
/*
 * Number of elements of a true array (not a pointer), as an int32_t.
 * The whole expansion is parenthesized so that the leading cast cannot
 * combine with operators written next to the macro at the use site.
 */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
|
33 |
|
34 U_NAMESPACE_USE |
|
35 |
|
36 /* string casing ------------------------------------------------------------ */ |
|
37 |
|
38 /* Appends a full case mapping result, see UCASE_MAX_STRING_LENGTH. */ |
|
39 static inline int32_t |
|
40 appendResult(UChar *dest, int32_t destIndex, int32_t destCapacity, |
|
41 int32_t result, const UChar *s) { |
|
42 UChar32 c; |
|
43 int32_t length; |
|
44 |
|
45 /* decode the result */ |
|
46 if(result<0) { |
|
47 /* (not) original code point */ |
|
48 c=~result; |
|
49 length=-1; |
|
50 } else if(result<=UCASE_MAX_STRING_LENGTH) { |
|
51 c=U_SENTINEL; |
|
52 length=result; |
|
53 } else { |
|
54 c=result; |
|
55 length=-1; |
|
56 } |
|
57 |
|
58 if(destIndex<destCapacity) { |
|
59 /* append the result */ |
|
60 if(length<0) { |
|
61 /* code point */ |
|
62 UBool isError=FALSE; |
|
63 U16_APPEND(dest, destIndex, destCapacity, c, isError); |
|
64 if(isError) { |
|
65 /* overflow, nothing written */ |
|
66 destIndex+=U16_LENGTH(c); |
|
67 } |
|
68 } else { |
|
69 /* string */ |
|
70 if((destIndex+length)<=destCapacity) { |
|
71 while(length>0) { |
|
72 dest[destIndex++]=*s++; |
|
73 --length; |
|
74 } |
|
75 } else { |
|
76 /* overflow */ |
|
77 destIndex+=length; |
|
78 } |
|
79 } |
|
80 } else { |
|
81 /* preflight */ |
|
82 if(length<0) { |
|
83 destIndex+=U16_LENGTH(c); |
|
84 } else { |
|
85 destIndex+=length; |
|
86 } |
|
87 } |
|
88 return destIndex; |
|
89 } |
|
90 |
|
91 static UChar32 U_CALLCONV |
|
92 utf16_caseContextIterator(void *context, int8_t dir) { |
|
93 UCaseContext *csc=(UCaseContext *)context; |
|
94 UChar32 c; |
|
95 |
|
96 if(dir<0) { |
|
97 /* reset for backward iteration */ |
|
98 csc->index=csc->cpStart; |
|
99 csc->dir=dir; |
|
100 } else if(dir>0) { |
|
101 /* reset for forward iteration */ |
|
102 csc->index=csc->cpLimit; |
|
103 csc->dir=dir; |
|
104 } else { |
|
105 /* continue current iteration direction */ |
|
106 dir=csc->dir; |
|
107 } |
|
108 |
|
109 if(dir<0) { |
|
110 if(csc->start<csc->index) { |
|
111 U16_PREV((const UChar *)csc->p, csc->start, csc->index, c); |
|
112 return c; |
|
113 } |
|
114 } else { |
|
115 if(csc->index<csc->limit) { |
|
116 U16_NEXT((const UChar *)csc->p, csc->index, csc->limit, c); |
|
117 return c; |
|
118 } |
|
119 } |
|
120 return U_SENTINEL; |
|
121 } |
|
122 |
|
123 /* |
|
124 * Case-maps [srcStart..srcLimit[ but takes |
|
125 * context [0..srcLength[ into account. |
|
126 */ |
|
127 static int32_t |
|
128 _caseMap(const UCaseMap *csm, UCaseMapFull *map, |
|
129 UChar *dest, int32_t destCapacity, |
|
130 const UChar *src, UCaseContext *csc, |
|
131 int32_t srcStart, int32_t srcLimit, |
|
132 UErrorCode *pErrorCode) { |
|
133 const UChar *s; |
|
134 UChar32 c, c2 = 0; |
|
135 int32_t srcIndex, destIndex; |
|
136 int32_t locCache; |
|
137 |
|
138 locCache=csm->locCache; |
|
139 |
|
140 /* case mapping loop */ |
|
141 srcIndex=srcStart; |
|
142 destIndex=0; |
|
143 while(srcIndex<srcLimit) { |
|
144 csc->cpStart=srcIndex; |
|
145 U16_NEXT(src, srcIndex, srcLimit, c); |
|
146 csc->cpLimit=srcIndex; |
|
147 c=map(csm->csp, c, utf16_caseContextIterator, csc, &s, csm->locale, &locCache); |
|
148 if((destIndex<destCapacity) && (c<0 ? (c2=~c)<=0xffff : UCASE_MAX_STRING_LENGTH<c && (c2=c)<=0xffff)) { |
|
149 /* fast path version of appendResult() for BMP results */ |
|
150 dest[destIndex++]=(UChar)c2; |
|
151 } else { |
|
152 destIndex=appendResult(dest, destIndex, destCapacity, c, s); |
|
153 } |
|
154 } |
|
155 |
|
156 if(destIndex>destCapacity) { |
|
157 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
|
158 } |
|
159 return destIndex; |
|
160 } |
|
161 |
|
162 #if !UCONFIG_NO_BREAK_ITERATION |
|
163 |
|
164 U_CFUNC int32_t U_CALLCONV |
|
165 ustrcase_internalToTitle(const UCaseMap *csm, |
|
166 UChar *dest, int32_t destCapacity, |
|
167 const UChar *src, int32_t srcLength, |
|
168 UErrorCode *pErrorCode) { |
|
169 const UChar *s; |
|
170 UChar32 c; |
|
171 int32_t prev, titleStart, titleLimit, idx, destIndex, length; |
|
172 UBool isFirstIndex; |
|
173 |
|
174 if(U_FAILURE(*pErrorCode)) { |
|
175 return 0; |
|
176 } |
|
177 |
|
178 // Use the C++ abstract base class to minimize dependencies. |
|
179 // TODO: Change UCaseMap.iter to store a BreakIterator directly. |
|
180 BreakIterator *bi=reinterpret_cast<BreakIterator *>(csm->iter); |
|
181 |
|
182 /* set up local variables */ |
|
183 int32_t locCache=csm->locCache; |
|
184 UCaseContext csc=UCASECONTEXT_INITIALIZER; |
|
185 csc.p=(void *)src; |
|
186 csc.limit=srcLength; |
|
187 destIndex=0; |
|
188 prev=0; |
|
189 isFirstIndex=TRUE; |
|
190 |
|
191 /* titlecasing loop */ |
|
192 while(prev<srcLength) { |
|
193 /* find next index where to titlecase */ |
|
194 if(isFirstIndex) { |
|
195 isFirstIndex=FALSE; |
|
196 idx=bi->first(); |
|
197 } else { |
|
198 idx=bi->next(); |
|
199 } |
|
200 if(idx==UBRK_DONE || idx>srcLength) { |
|
201 idx=srcLength; |
|
202 } |
|
203 |
|
204 /* |
|
205 * Unicode 4 & 5 section 3.13 Default Case Operations: |
|
206 * |
|
207 * R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex |
|
208 * #29, "Text Boundaries." Between each pair of word boundaries, find the first |
|
209 * cased character F. If F exists, map F to default_title(F); then map each |
|
210 * subsequent character C to default_lower(C). |
|
211 * |
|
212 * In this implementation, segment [prev..index[ into 3 parts: |
|
213 * a) uncased characters (copy as-is) [prev..titleStart[ |
|
214 * b) first case letter (titlecase) [titleStart..titleLimit[ |
|
215 * c) subsequent characters (lowercase) [titleLimit..index[ |
|
216 */ |
|
217 if(prev<idx) { |
|
218 /* find and copy uncased characters [prev..titleStart[ */ |
|
219 titleStart=titleLimit=prev; |
|
220 U16_NEXT(src, titleLimit, idx, c); |
|
221 if((csm->options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0 && UCASE_NONE==ucase_getType(csm->csp, c)) { |
|
222 /* Adjust the titlecasing index (titleStart) to the next cased character. */ |
|
223 for(;;) { |
|
224 titleStart=titleLimit; |
|
225 if(titleLimit==idx) { |
|
226 /* |
|
227 * only uncased characters in [prev..index[ |
|
228 * stop with titleStart==titleLimit==index |
|
229 */ |
|
230 break; |
|
231 } |
|
232 U16_NEXT(src, titleLimit, idx, c); |
|
233 if(UCASE_NONE!=ucase_getType(csm->csp, c)) { |
|
234 break; /* cased letter at [titleStart..titleLimit[ */ |
|
235 } |
|
236 } |
|
237 length=titleStart-prev; |
|
238 if(length>0) { |
|
239 if((destIndex+length)<=destCapacity) { |
|
240 uprv_memcpy(dest+destIndex, src+prev, length*U_SIZEOF_UCHAR); |
|
241 } |
|
242 destIndex+=length; |
|
243 } |
|
244 } |
|
245 |
|
246 if(titleStart<titleLimit) { |
|
247 /* titlecase c which is from [titleStart..titleLimit[ */ |
|
248 csc.cpStart=titleStart; |
|
249 csc.cpLimit=titleLimit; |
|
250 c=ucase_toFullTitle(csm->csp, c, utf16_caseContextIterator, &csc, &s, csm->locale, &locCache); |
|
251 destIndex=appendResult(dest, destIndex, destCapacity, c, s); |
|
252 |
|
253 /* Special case Dutch IJ titlecasing */ |
|
254 if ( titleStart+1 < idx && |
|
255 ucase_getCaseLocale(csm->locale,&locCache) == UCASE_LOC_DUTCH && |
|
256 ( src[titleStart] == (UChar32) 0x0049 || src[titleStart] == (UChar32) 0x0069 ) && |
|
257 ( src[titleStart+1] == (UChar32) 0x004A || src[titleStart+1] == (UChar32) 0x006A )) { |
|
258 c=(UChar32) 0x004A; |
|
259 destIndex=appendResult(dest, destIndex, destCapacity, c, s); |
|
260 titleLimit++; |
|
261 } |
|
262 |
|
263 /* lowercase [titleLimit..index[ */ |
|
264 if(titleLimit<idx) { |
|
265 if((csm->options&U_TITLECASE_NO_LOWERCASE)==0) { |
|
266 /* Normal operation: Lowercase the rest of the word. */ |
|
267 destIndex+= |
|
268 _caseMap( |
|
269 csm, ucase_toFullLower, |
|
270 dest+destIndex, destCapacity-destIndex, |
|
271 src, &csc, |
|
272 titleLimit, idx, |
|
273 pErrorCode); |
|
274 } else { |
|
275 /* Optionally just copy the rest of the word unchanged. */ |
|
276 length=idx-titleLimit; |
|
277 if((destIndex+length)<=destCapacity) { |
|
278 uprv_memcpy(dest+destIndex, src+titleLimit, length*U_SIZEOF_UCHAR); |
|
279 } |
|
280 destIndex+=length; |
|
281 } |
|
282 } |
|
283 } |
|
284 } |
|
285 |
|
286 prev=idx; |
|
287 } |
|
288 |
|
289 if(destIndex>destCapacity) { |
|
290 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
|
291 } |
|
292 return destIndex; |
|
293 } |
|
294 |
|
295 #endif // !UCONFIG_NO_BREAK_ITERATION |
|
296 |
|
297 /* functions available in the common library (for unistr_case.cpp) */ |
|
298 |
|
299 U_CFUNC int32_t U_CALLCONV |
|
300 ustrcase_internalToLower(const UCaseMap *csm, |
|
301 UChar *dest, int32_t destCapacity, |
|
302 const UChar *src, int32_t srcLength, |
|
303 UErrorCode *pErrorCode) { |
|
304 UCaseContext csc=UCASECONTEXT_INITIALIZER; |
|
305 csc.p=(void *)src; |
|
306 csc.limit=srcLength; |
|
307 return _caseMap( |
|
308 csm, ucase_toFullLower, |
|
309 dest, destCapacity, |
|
310 src, &csc, 0, srcLength, |
|
311 pErrorCode); |
|
312 } |
|
313 |
|
314 U_CFUNC int32_t U_CALLCONV |
|
315 ustrcase_internalToUpper(const UCaseMap *csm, |
|
316 UChar *dest, int32_t destCapacity, |
|
317 const UChar *src, int32_t srcLength, |
|
318 UErrorCode *pErrorCode) { |
|
319 UCaseContext csc=UCASECONTEXT_INITIALIZER; |
|
320 csc.p=(void *)src; |
|
321 csc.limit=srcLength; |
|
322 return _caseMap( |
|
323 csm, ucase_toFullUpper, |
|
324 dest, destCapacity, |
|
325 src, &csc, 0, srcLength, |
|
326 pErrorCode); |
|
327 } |
|
328 |
|
329 static int32_t |
|
330 ustr_foldCase(const UCaseProps *csp, |
|
331 UChar *dest, int32_t destCapacity, |
|
332 const UChar *src, int32_t srcLength, |
|
333 uint32_t options, |
|
334 UErrorCode *pErrorCode) { |
|
335 int32_t srcIndex, destIndex; |
|
336 |
|
337 const UChar *s; |
|
338 UChar32 c, c2 = 0; |
|
339 |
|
340 /* case mapping loop */ |
|
341 srcIndex=destIndex=0; |
|
342 while(srcIndex<srcLength) { |
|
343 U16_NEXT(src, srcIndex, srcLength, c); |
|
344 c=ucase_toFullFolding(csp, c, &s, options); |
|
345 if((destIndex<destCapacity) && (c<0 ? (c2=~c)<=0xffff : UCASE_MAX_STRING_LENGTH<c && (c2=c)<=0xffff)) { |
|
346 /* fast path version of appendResult() for BMP results */ |
|
347 dest[destIndex++]=(UChar)c2; |
|
348 } else { |
|
349 destIndex=appendResult(dest, destIndex, destCapacity, c, s); |
|
350 } |
|
351 } |
|
352 |
|
353 if(destIndex>destCapacity) { |
|
354 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
|
355 } |
|
356 return destIndex; |
|
357 } |
|
358 |
|
359 U_CFUNC int32_t U_CALLCONV |
|
360 ustrcase_internalFold(const UCaseMap *csm, |
|
361 UChar *dest, int32_t destCapacity, |
|
362 const UChar *src, int32_t srcLength, |
|
363 UErrorCode *pErrorCode) { |
|
364 return ustr_foldCase(csm->csp, dest, destCapacity, src, srcLength, csm->options, pErrorCode); |
|
365 } |
|
366 |
|
367 U_CFUNC int32_t |
|
368 ustrcase_map(const UCaseMap *csm, |
|
369 UChar *dest, int32_t destCapacity, |
|
370 const UChar *src, int32_t srcLength, |
|
371 UStringCaseMapper *stringCaseMapper, |
|
372 UErrorCode *pErrorCode) { |
|
373 UChar buffer[300]; |
|
374 UChar *temp; |
|
375 |
|
376 int32_t destLength; |
|
377 |
|
378 /* check argument values */ |
|
379 if(U_FAILURE(*pErrorCode)) { |
|
380 return 0; |
|
381 } |
|
382 if( destCapacity<0 || |
|
383 (dest==NULL && destCapacity>0) || |
|
384 src==NULL || |
|
385 srcLength<-1 |
|
386 ) { |
|
387 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
|
388 return 0; |
|
389 } |
|
390 |
|
391 /* get the string length */ |
|
392 if(srcLength==-1) { |
|
393 srcLength=u_strlen(src); |
|
394 } |
|
395 |
|
396 /* check for overlapping source and destination */ |
|
397 if( dest!=NULL && |
|
398 ((src>=dest && src<(dest+destCapacity)) || |
|
399 (dest>=src && dest<(src+srcLength))) |
|
400 ) { |
|
401 /* overlap: provide a temporary destination buffer and later copy the result */ |
|
402 if(destCapacity<=LENGTHOF(buffer)) { |
|
403 /* the stack buffer is large enough */ |
|
404 temp=buffer; |
|
405 } else { |
|
406 /* allocate a buffer */ |
|
407 temp=(UChar *)uprv_malloc(destCapacity*U_SIZEOF_UCHAR); |
|
408 if(temp==NULL) { |
|
409 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; |
|
410 return 0; |
|
411 } |
|
412 } |
|
413 } else { |
|
414 temp=dest; |
|
415 } |
|
416 |
|
417 destLength=stringCaseMapper(csm, temp, destCapacity, src, srcLength, pErrorCode); |
|
418 if(temp!=dest) { |
|
419 /* copy the result string to the destination buffer */ |
|
420 if(destLength>0) { |
|
421 int32_t copyLength= destLength<=destCapacity ? destLength : destCapacity; |
|
422 if(copyLength>0) { |
|
423 uprv_memmove(dest, temp, copyLength*U_SIZEOF_UCHAR); |
|
424 } |
|
425 } |
|
426 if(temp!=buffer) { |
|
427 uprv_free(temp); |
|
428 } |
|
429 } |
|
430 |
|
431 return u_terminateUChars(dest, destCapacity, destLength, pErrorCode); |
|
432 } |
|
433 |
|
434 /* public API functions */ |
|
435 |
|
436 U_CAPI int32_t U_EXPORT2 |
|
437 u_strFoldCase(UChar *dest, int32_t destCapacity, |
|
438 const UChar *src, int32_t srcLength, |
|
439 uint32_t options, |
|
440 UErrorCode *pErrorCode) { |
|
441 UCaseMap csm=UCASEMAP_INITIALIZER; |
|
442 csm.csp=ucase_getSingleton(); |
|
443 csm.options=options; |
|
444 return ustrcase_map( |
|
445 &csm, |
|
446 dest, destCapacity, |
|
447 src, srcLength, |
|
448 ustrcase_internalFold, pErrorCode); |
|
449 } |
|
450 |
|
451 /* case-insensitive string comparisons -------------------------------------- */ |
|
452 |
|
453 /* |
|
454 * This function is a copy of unorm_cmpEquivFold() minus the parts for |
|
455 * canonical equivalence. |
|
456 * Keep the functions in sync, and see there for how this works. |
|
457 * The duplication is for modularization: |
|
458 * It makes caseless (but not canonical caseless) matches independent of |
|
459 * the normalization code. |
|
460 */ |
|
461 |
|
462 /* stack element for previous-level source/decomposition pointers */ |
|
463 struct CmpEquivLevel { |
|
464 const UChar *start, *s, *limit; |
|
465 }; |
|
466 typedef struct CmpEquivLevel CmpEquivLevel; |
|
467 |
|
468 /* internal function */ |
|
469 U_CFUNC int32_t |
|
470 u_strcmpFold(const UChar *s1, int32_t length1, |
|
471 const UChar *s2, int32_t length2, |
|
472 uint32_t options, |
|
473 UErrorCode *pErrorCode) { |
|
474 const UCaseProps *csp; |
|
475 |
|
476 /* current-level start/limit - s1/s2 as current */ |
|
477 const UChar *start1, *start2, *limit1, *limit2; |
|
478 |
|
479 /* case folding variables */ |
|
480 const UChar *p; |
|
481 int32_t length; |
|
482 |
|
483 /* stacks of previous-level start/current/limit */ |
|
484 CmpEquivLevel stack1[2], stack2[2]; |
|
485 |
|
486 /* case folding buffers, only use current-level start/limit */ |
|
487 UChar fold1[UCASE_MAX_STRING_LENGTH+1], fold2[UCASE_MAX_STRING_LENGTH+1]; |
|
488 |
|
489 /* track which is the current level per string */ |
|
490 int32_t level1, level2; |
|
491 |
|
492 /* current code units, and code points for lookups */ |
|
493 UChar32 c1, c2, cp1, cp2; |
|
494 |
|
495 /* no argument error checking because this itself is not an API */ |
|
496 |
|
497 /* |
|
498 * assume that at least the option U_COMPARE_IGNORE_CASE is set |
|
499 * otherwise this function would have to behave exactly as uprv_strCompare() |
|
500 */ |
|
501 csp=ucase_getSingleton(); |
|
502 if(U_FAILURE(*pErrorCode)) { |
|
503 return 0; |
|
504 } |
|
505 |
|
506 /* initialize */ |
|
507 start1=s1; |
|
508 if(length1==-1) { |
|
509 limit1=NULL; |
|
510 } else { |
|
511 limit1=s1+length1; |
|
512 } |
|
513 |
|
514 start2=s2; |
|
515 if(length2==-1) { |
|
516 limit2=NULL; |
|
517 } else { |
|
518 limit2=s2+length2; |
|
519 } |
|
520 |
|
521 level1=level2=0; |
|
522 c1=c2=-1; |
|
523 |
|
524 /* comparison loop */ |
|
525 for(;;) { |
|
526 /* |
|
527 * here a code unit value of -1 means "get another code unit" |
|
528 * below it will mean "this source is finished" |
|
529 */ |
|
530 |
|
531 if(c1<0) { |
|
532 /* get next code unit from string 1, post-increment */ |
|
533 for(;;) { |
|
534 if(s1==limit1 || ((c1=*s1)==0 && (limit1==NULL || (options&_STRNCMP_STYLE)))) { |
|
535 if(level1==0) { |
|
536 c1=-1; |
|
537 break; |
|
538 } |
|
539 } else { |
|
540 ++s1; |
|
541 break; |
|
542 } |
|
543 |
|
544 /* reached end of level buffer, pop one level */ |
|
545 do { |
|
546 --level1; |
|
547 start1=stack1[level1].start; /*Not uninitialized*/ |
|
548 } while(start1==NULL); |
|
549 s1=stack1[level1].s; /*Not uninitialized*/ |
|
550 limit1=stack1[level1].limit; /*Not uninitialized*/ |
|
551 } |
|
552 } |
|
553 |
|
554 if(c2<0) { |
|
555 /* get next code unit from string 2, post-increment */ |
|
556 for(;;) { |
|
557 if(s2==limit2 || ((c2=*s2)==0 && (limit2==NULL || (options&_STRNCMP_STYLE)))) { |
|
558 if(level2==0) { |
|
559 c2=-1; |
|
560 break; |
|
561 } |
|
562 } else { |
|
563 ++s2; |
|
564 break; |
|
565 } |
|
566 |
|
567 /* reached end of level buffer, pop one level */ |
|
568 do { |
|
569 --level2; |
|
570 start2=stack2[level2].start; /*Not uninitialized*/ |
|
571 } while(start2==NULL); |
|
572 s2=stack2[level2].s; /*Not uninitialized*/ |
|
573 limit2=stack2[level2].limit; /*Not uninitialized*/ |
|
574 } |
|
575 } |
|
576 |
|
577 /* |
|
578 * compare c1 and c2 |
|
579 * either variable c1, c2 is -1 only if the corresponding string is finished |
|
580 */ |
|
581 if(c1==c2) { |
|
582 if(c1<0) { |
|
583 return 0; /* c1==c2==-1 indicating end of strings */ |
|
584 } |
|
585 c1=c2=-1; /* make us fetch new code units */ |
|
586 continue; |
|
587 } else if(c1<0) { |
|
588 return -1; /* string 1 ends before string 2 */ |
|
589 } else if(c2<0) { |
|
590 return 1; /* string 2 ends before string 1 */ |
|
591 } |
|
592 /* c1!=c2 && c1>=0 && c2>=0 */ |
|
593 |
|
594 /* get complete code points for c1, c2 for lookups if either is a surrogate */ |
|
595 cp1=c1; |
|
596 if(U_IS_SURROGATE(c1)) { |
|
597 UChar c; |
|
598 |
|
599 if(U_IS_SURROGATE_LEAD(c1)) { |
|
600 if(s1!=limit1 && U16_IS_TRAIL(c=*s1)) { |
|
601 /* advance ++s1; only below if cp1 decomposes/case-folds */ |
|
602 cp1=U16_GET_SUPPLEMENTARY(c1, c); |
|
603 } |
|
604 } else /* isTrail(c1) */ { |
|
605 if(start1<=(s1-2) && U16_IS_LEAD(c=*(s1-2))) { |
|
606 cp1=U16_GET_SUPPLEMENTARY(c, c1); |
|
607 } |
|
608 } |
|
609 } |
|
610 |
|
611 cp2=c2; |
|
612 if(U_IS_SURROGATE(c2)) { |
|
613 UChar c; |
|
614 |
|
615 if(U_IS_SURROGATE_LEAD(c2)) { |
|
616 if(s2!=limit2 && U16_IS_TRAIL(c=*s2)) { |
|
617 /* advance ++s2; only below if cp2 decomposes/case-folds */ |
|
618 cp2=U16_GET_SUPPLEMENTARY(c2, c); |
|
619 } |
|
620 } else /* isTrail(c2) */ { |
|
621 if(start2<=(s2-2) && U16_IS_LEAD(c=*(s2-2))) { |
|
622 cp2=U16_GET_SUPPLEMENTARY(c, c2); |
|
623 } |
|
624 } |
|
625 } |
|
626 |
|
627 /* |
|
628 * go down one level for each string |
|
629 * continue with the main loop as soon as there is a real change |
|
630 */ |
|
631 |
|
632 if( level1==0 && |
|
633 (length=ucase_toFullFolding(csp, (UChar32)cp1, &p, options))>=0 |
|
634 ) { |
|
635 /* cp1 case-folds to the code point "length" or to p[length] */ |
|
636 if(U_IS_SURROGATE(c1)) { |
|
637 if(U_IS_SURROGATE_LEAD(c1)) { |
|
638 /* advance beyond source surrogate pair if it case-folds */ |
|
639 ++s1; |
|
640 } else /* isTrail(c1) */ { |
|
641 /* |
|
642 * we got a supplementary code point when hitting its trail surrogate, |
|
643 * therefore the lead surrogate must have been the same as in the other string; |
|
644 * compare this decomposition with the lead surrogate in the other string |
|
645 * remember that this simulates bulk text replacement: |
|
646 * the decomposition would replace the entire code point |
|
647 */ |
|
648 --s2; |
|
649 c2=*(s2-1); |
|
650 } |
|
651 } |
|
652 |
|
653 /* push current level pointers */ |
|
654 stack1[0].start=start1; |
|
655 stack1[0].s=s1; |
|
656 stack1[0].limit=limit1; |
|
657 ++level1; |
|
658 |
|
659 /* copy the folding result to fold1[] */ |
|
660 if(length<=UCASE_MAX_STRING_LENGTH) { |
|
661 u_memcpy(fold1, p, length); |
|
662 } else { |
|
663 int32_t i=0; |
|
664 U16_APPEND_UNSAFE(fold1, i, length); |
|
665 length=i; |
|
666 } |
|
667 |
|
668 /* set next level pointers to case folding */ |
|
669 start1=s1=fold1; |
|
670 limit1=fold1+length; |
|
671 |
|
672 /* get ready to read from decomposition, continue with loop */ |
|
673 c1=-1; |
|
674 continue; |
|
675 } |
|
676 |
|
677 if( level2==0 && |
|
678 (length=ucase_toFullFolding(csp, (UChar32)cp2, &p, options))>=0 |
|
679 ) { |
|
680 /* cp2 case-folds to the code point "length" or to p[length] */ |
|
681 if(U_IS_SURROGATE(c2)) { |
|
682 if(U_IS_SURROGATE_LEAD(c2)) { |
|
683 /* advance beyond source surrogate pair if it case-folds */ |
|
684 ++s2; |
|
685 } else /* isTrail(c2) */ { |
|
686 /* |
|
687 * we got a supplementary code point when hitting its trail surrogate, |
|
688 * therefore the lead surrogate must have been the same as in the other string; |
|
689 * compare this decomposition with the lead surrogate in the other string |
|
690 * remember that this simulates bulk text replacement: |
|
691 * the decomposition would replace the entire code point |
|
692 */ |
|
693 --s1; |
|
694 c1=*(s1-1); |
|
695 } |
|
696 } |
|
697 |
|
698 /* push current level pointers */ |
|
699 stack2[0].start=start2; |
|
700 stack2[0].s=s2; |
|
701 stack2[0].limit=limit2; |
|
702 ++level2; |
|
703 |
|
704 /* copy the folding result to fold2[] */ |
|
705 if(length<=UCASE_MAX_STRING_LENGTH) { |
|
706 u_memcpy(fold2, p, length); |
|
707 } else { |
|
708 int32_t i=0; |
|
709 U16_APPEND_UNSAFE(fold2, i, length); |
|
710 length=i; |
|
711 } |
|
712 |
|
713 /* set next level pointers to case folding */ |
|
714 start2=s2=fold2; |
|
715 limit2=fold2+length; |
|
716 |
|
717 /* get ready to read from decomposition, continue with loop */ |
|
718 c2=-1; |
|
719 continue; |
|
720 } |
|
721 |
|
722 /* |
|
723 * no decomposition/case folding, max level for both sides: |
|
724 * return difference result |
|
725 * |
|
726 * code point order comparison must not just return cp1-cp2 |
|
727 * because when single surrogates are present then the surrogate pairs |
|
728 * that formed cp1 and cp2 may be from different string indexes |
|
729 * |
|
730 * example: { d800 d800 dc01 } vs. { d800 dc00 }, compare at second code units |
|
731 * c1=d800 cp1=10001 c2=dc00 cp2=10000 |
|
732 * cp1-cp2>0 but c1-c2<0 and in fact in UTF-32 it is { d800 10001 } < { 10000 } |
|
733 * |
|
734 * therefore, use same fix-up as in ustring.c/uprv_strCompare() |
|
735 * except: uprv_strCompare() fetches c=*s while this functions fetches c=*s++ |
|
736 * so we have slightly different pointer/start/limit comparisons here |
|
737 */ |
|
738 |
|
739 if(c1>=0xd800 && c2>=0xd800 && (options&U_COMPARE_CODE_POINT_ORDER)) { |
|
740 /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */ |
|
741 if( |
|
742 (c1<=0xdbff && s1!=limit1 && U16_IS_TRAIL(*s1)) || |
|
743 (U16_IS_TRAIL(c1) && start1!=(s1-1) && U16_IS_LEAD(*(s1-2))) |
|
744 ) { |
|
745 /* part of a surrogate pair, leave >=d800 */ |
|
746 } else { |
|
747 /* BMP code point - may be surrogate code point - make <d800 */ |
|
748 c1-=0x2800; |
|
749 } |
|
750 |
|
751 if( |
|
752 (c2<=0xdbff && s2!=limit2 && U16_IS_TRAIL(*s2)) || |
|
753 (U16_IS_TRAIL(c2) && start2!=(s2-1) && U16_IS_LEAD(*(s2-2))) |
|
754 ) { |
|
755 /* part of a surrogate pair, leave >=d800 */ |
|
756 } else { |
|
757 /* BMP code point - may be surrogate code point - make <d800 */ |
|
758 c2-=0x2800; |
|
759 } |
|
760 } |
|
761 |
|
762 return c1-c2; |
|
763 } |
|
764 } |
|
765 |
|
766 /* public API functions */ |
|
767 |
|
768 U_CAPI int32_t U_EXPORT2 |
|
769 u_strCaseCompare(const UChar *s1, int32_t length1, |
|
770 const UChar *s2, int32_t length2, |
|
771 uint32_t options, |
|
772 UErrorCode *pErrorCode) { |
|
773 /* argument checking */ |
|
774 if(pErrorCode==0 || U_FAILURE(*pErrorCode)) { |
|
775 return 0; |
|
776 } |
|
777 if(s1==NULL || length1<-1 || s2==NULL || length2<-1) { |
|
778 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
|
779 return 0; |
|
780 } |
|
781 return u_strcmpFold(s1, length1, s2, length2, |
|
782 options|U_COMPARE_IGNORE_CASE, |
|
783 pErrorCode); |
|
784 } |
|
785 |
|
786 U_CAPI int32_t U_EXPORT2 |
|
787 u_strcasecmp(const UChar *s1, const UChar *s2, uint32_t options) { |
|
788 UErrorCode errorCode=U_ZERO_ERROR; |
|
789 return u_strcmpFold(s1, -1, s2, -1, |
|
790 options|U_COMPARE_IGNORE_CASE, |
|
791 &errorCode); |
|
792 } |
|
793 |
|
794 U_CAPI int32_t U_EXPORT2 |
|
795 u_memcasecmp(const UChar *s1, const UChar *s2, int32_t length, uint32_t options) { |
|
796 UErrorCode errorCode=U_ZERO_ERROR; |
|
797 return u_strcmpFold(s1, length, s2, length, |
|
798 options|U_COMPARE_IGNORE_CASE, |
|
799 &errorCode); |
|
800 } |
|
801 |
|
802 U_CAPI int32_t U_EXPORT2 |
|
803 u_strncasecmp(const UChar *s1, const UChar *s2, int32_t n, uint32_t options) { |
|
804 UErrorCode errorCode=U_ZERO_ERROR; |
|
805 return u_strcmpFold(s1, n, s2, n, |
|
806 options|(U_COMPARE_IGNORE_CASE|_STRNCMP_STYLE), |
|
807 &errorCode); |
|
808 } |