intl/icu/source/common/ustrcase.cpp

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

michael@0 1 /*
michael@0 2 *******************************************************************************
michael@0 3 *
michael@0 4 * Copyright (C) 2001-2011, International Business Machines
michael@0 5 * Corporation and others. All Rights Reserved.
michael@0 6 *
michael@0 7 *******************************************************************************
michael@0 8 * file name: ustrcase.cpp
michael@0 9 * encoding: US-ASCII
michael@0 10 * tab size: 8 (not used)
michael@0 11 * indentation:4
michael@0 12 *
michael@0 13 * created on: 2002feb20
michael@0 14 * created by: Markus W. Scherer
michael@0 15 *
michael@0 16 * Implementation file for string casing C API functions.
michael@0 17 * Uses functions from uchar.c for basic functionality that requires access
michael@0 18 * to the Unicode Character Database (uprops.dat).
michael@0 19 */
michael@0 20
michael@0 21 #include "unicode/utypes.h"
michael@0 22 #include "unicode/brkiter.h"
michael@0 23 #include "unicode/ustring.h"
michael@0 24 #include "unicode/ucasemap.h"
michael@0 25 #include "unicode/ubrk.h"
michael@0 26 #include "unicode/utf.h"
michael@0 27 #include "unicode/utf16.h"
michael@0 28 #include "cmemory.h"
michael@0 29 #include "ucase.h"
michael@0 30 #include "ustr_imp.h"
michael@0 31
michael@0 32 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
michael@0 33
michael@0 34 U_NAMESPACE_USE
michael@0 35
michael@0 36 /* string casing ------------------------------------------------------------ */
michael@0 37
michael@0 38 /* Appends a full case mapping result, see UCASE_MAX_STRING_LENGTH. */
michael@0 39 static inline int32_t
michael@0 40 appendResult(UChar *dest, int32_t destIndex, int32_t destCapacity,
michael@0 41 int32_t result, const UChar *s) {
michael@0 42 UChar32 c;
michael@0 43 int32_t length;
michael@0 44
michael@0 45 /* decode the result */
michael@0 46 if(result<0) {
michael@0 47 /* (not) original code point */
michael@0 48 c=~result;
michael@0 49 length=-1;
michael@0 50 } else if(result<=UCASE_MAX_STRING_LENGTH) {
michael@0 51 c=U_SENTINEL;
michael@0 52 length=result;
michael@0 53 } else {
michael@0 54 c=result;
michael@0 55 length=-1;
michael@0 56 }
michael@0 57
michael@0 58 if(destIndex<destCapacity) {
michael@0 59 /* append the result */
michael@0 60 if(length<0) {
michael@0 61 /* code point */
michael@0 62 UBool isError=FALSE;
michael@0 63 U16_APPEND(dest, destIndex, destCapacity, c, isError);
michael@0 64 if(isError) {
michael@0 65 /* overflow, nothing written */
michael@0 66 destIndex+=U16_LENGTH(c);
michael@0 67 }
michael@0 68 } else {
michael@0 69 /* string */
michael@0 70 if((destIndex+length)<=destCapacity) {
michael@0 71 while(length>0) {
michael@0 72 dest[destIndex++]=*s++;
michael@0 73 --length;
michael@0 74 }
michael@0 75 } else {
michael@0 76 /* overflow */
michael@0 77 destIndex+=length;
michael@0 78 }
michael@0 79 }
michael@0 80 } else {
michael@0 81 /* preflight */
michael@0 82 if(length<0) {
michael@0 83 destIndex+=U16_LENGTH(c);
michael@0 84 } else {
michael@0 85 destIndex+=length;
michael@0 86 }
michael@0 87 }
michael@0 88 return destIndex;
michael@0 89 }
michael@0 90
michael@0 91 static UChar32 U_CALLCONV
michael@0 92 utf16_caseContextIterator(void *context, int8_t dir) {
michael@0 93 UCaseContext *csc=(UCaseContext *)context;
michael@0 94 UChar32 c;
michael@0 95
michael@0 96 if(dir<0) {
michael@0 97 /* reset for backward iteration */
michael@0 98 csc->index=csc->cpStart;
michael@0 99 csc->dir=dir;
michael@0 100 } else if(dir>0) {
michael@0 101 /* reset for forward iteration */
michael@0 102 csc->index=csc->cpLimit;
michael@0 103 csc->dir=dir;
michael@0 104 } else {
michael@0 105 /* continue current iteration direction */
michael@0 106 dir=csc->dir;
michael@0 107 }
michael@0 108
michael@0 109 if(dir<0) {
michael@0 110 if(csc->start<csc->index) {
michael@0 111 U16_PREV((const UChar *)csc->p, csc->start, csc->index, c);
michael@0 112 return c;
michael@0 113 }
michael@0 114 } else {
michael@0 115 if(csc->index<csc->limit) {
michael@0 116 U16_NEXT((const UChar *)csc->p, csc->index, csc->limit, c);
michael@0 117 return c;
michael@0 118 }
michael@0 119 }
michael@0 120 return U_SENTINEL;
michael@0 121 }
michael@0 122
michael@0 123 /*
michael@0 124 * Case-maps [srcStart..srcLimit[ but takes
michael@0 125 * context [0..srcLength[ into account.
michael@0 126 */
michael@0 127 static int32_t
michael@0 128 _caseMap(const UCaseMap *csm, UCaseMapFull *map,
michael@0 129 UChar *dest, int32_t destCapacity,
michael@0 130 const UChar *src, UCaseContext *csc,
michael@0 131 int32_t srcStart, int32_t srcLimit,
michael@0 132 UErrorCode *pErrorCode) {
michael@0 133 const UChar *s;
michael@0 134 UChar32 c, c2 = 0;
michael@0 135 int32_t srcIndex, destIndex;
michael@0 136 int32_t locCache;
michael@0 137
michael@0 138 locCache=csm->locCache;
michael@0 139
michael@0 140 /* case mapping loop */
michael@0 141 srcIndex=srcStart;
michael@0 142 destIndex=0;
michael@0 143 while(srcIndex<srcLimit) {
michael@0 144 csc->cpStart=srcIndex;
michael@0 145 U16_NEXT(src, srcIndex, srcLimit, c);
michael@0 146 csc->cpLimit=srcIndex;
michael@0 147 c=map(csm->csp, c, utf16_caseContextIterator, csc, &s, csm->locale, &locCache);
michael@0 148 if((destIndex<destCapacity) && (c<0 ? (c2=~c)<=0xffff : UCASE_MAX_STRING_LENGTH<c && (c2=c)<=0xffff)) {
michael@0 149 /* fast path version of appendResult() for BMP results */
michael@0 150 dest[destIndex++]=(UChar)c2;
michael@0 151 } else {
michael@0 152 destIndex=appendResult(dest, destIndex, destCapacity, c, s);
michael@0 153 }
michael@0 154 }
michael@0 155
michael@0 156 if(destIndex>destCapacity) {
michael@0 157 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
michael@0 158 }
michael@0 159 return destIndex;
michael@0 160 }
michael@0 161
michael@0 162 #if !UCONFIG_NO_BREAK_ITERATION
michael@0 163
michael@0 164 U_CFUNC int32_t U_CALLCONV
michael@0 165 ustrcase_internalToTitle(const UCaseMap *csm,
michael@0 166 UChar *dest, int32_t destCapacity,
michael@0 167 const UChar *src, int32_t srcLength,
michael@0 168 UErrorCode *pErrorCode) {
michael@0 169 const UChar *s;
michael@0 170 UChar32 c;
michael@0 171 int32_t prev, titleStart, titleLimit, idx, destIndex, length;
michael@0 172 UBool isFirstIndex;
michael@0 173
michael@0 174 if(U_FAILURE(*pErrorCode)) {
michael@0 175 return 0;
michael@0 176 }
michael@0 177
michael@0 178 // Use the C++ abstract base class to minimize dependencies.
michael@0 179 // TODO: Change UCaseMap.iter to store a BreakIterator directly.
michael@0 180 BreakIterator *bi=reinterpret_cast<BreakIterator *>(csm->iter);
michael@0 181
michael@0 182 /* set up local variables */
michael@0 183 int32_t locCache=csm->locCache;
michael@0 184 UCaseContext csc=UCASECONTEXT_INITIALIZER;
michael@0 185 csc.p=(void *)src;
michael@0 186 csc.limit=srcLength;
michael@0 187 destIndex=0;
michael@0 188 prev=0;
michael@0 189 isFirstIndex=TRUE;
michael@0 190
michael@0 191 /* titlecasing loop */
michael@0 192 while(prev<srcLength) {
michael@0 193 /* find next index where to titlecase */
michael@0 194 if(isFirstIndex) {
michael@0 195 isFirstIndex=FALSE;
michael@0 196 idx=bi->first();
michael@0 197 } else {
michael@0 198 idx=bi->next();
michael@0 199 }
michael@0 200 if(idx==UBRK_DONE || idx>srcLength) {
michael@0 201 idx=srcLength;
michael@0 202 }
michael@0 203
michael@0 204 /*
michael@0 205 * Unicode 4 & 5 section 3.13 Default Case Operations:
michael@0 206 *
michael@0 207 * R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
michael@0 208 * #29, "Text Boundaries." Between each pair of word boundaries, find the first
michael@0 209 * cased character F. If F exists, map F to default_title(F); then map each
michael@0 210 * subsequent character C to default_lower(C).
michael@0 211 *
michael@0 212 * In this implementation, segment [prev..index[ into 3 parts:
michael@0 213 * a) uncased characters (copy as-is) [prev..titleStart[
michael@0 214 * b) first case letter (titlecase) [titleStart..titleLimit[
michael@0 215 * c) subsequent characters (lowercase) [titleLimit..index[
michael@0 216 */
michael@0 217 if(prev<idx) {
michael@0 218 /* find and copy uncased characters [prev..titleStart[ */
michael@0 219 titleStart=titleLimit=prev;
michael@0 220 U16_NEXT(src, titleLimit, idx, c);
michael@0 221 if((csm->options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0 && UCASE_NONE==ucase_getType(csm->csp, c)) {
michael@0 222 /* Adjust the titlecasing index (titleStart) to the next cased character. */
michael@0 223 for(;;) {
michael@0 224 titleStart=titleLimit;
michael@0 225 if(titleLimit==idx) {
michael@0 226 /*
michael@0 227 * only uncased characters in [prev..index[
michael@0 228 * stop with titleStart==titleLimit==index
michael@0 229 */
michael@0 230 break;
michael@0 231 }
michael@0 232 U16_NEXT(src, titleLimit, idx, c);
michael@0 233 if(UCASE_NONE!=ucase_getType(csm->csp, c)) {
michael@0 234 break; /* cased letter at [titleStart..titleLimit[ */
michael@0 235 }
michael@0 236 }
michael@0 237 length=titleStart-prev;
michael@0 238 if(length>0) {
michael@0 239 if((destIndex+length)<=destCapacity) {
michael@0 240 uprv_memcpy(dest+destIndex, src+prev, length*U_SIZEOF_UCHAR);
michael@0 241 }
michael@0 242 destIndex+=length;
michael@0 243 }
michael@0 244 }
michael@0 245
michael@0 246 if(titleStart<titleLimit) {
michael@0 247 /* titlecase c which is from [titleStart..titleLimit[ */
michael@0 248 csc.cpStart=titleStart;
michael@0 249 csc.cpLimit=titleLimit;
michael@0 250 c=ucase_toFullTitle(csm->csp, c, utf16_caseContextIterator, &csc, &s, csm->locale, &locCache);
michael@0 251 destIndex=appendResult(dest, destIndex, destCapacity, c, s);
michael@0 252
michael@0 253 /* Special case Dutch IJ titlecasing */
michael@0 254 if ( titleStart+1 < idx &&
michael@0 255 ucase_getCaseLocale(csm->locale,&locCache) == UCASE_LOC_DUTCH &&
michael@0 256 ( src[titleStart] == (UChar32) 0x0049 || src[titleStart] == (UChar32) 0x0069 ) &&
michael@0 257 ( src[titleStart+1] == (UChar32) 0x004A || src[titleStart+1] == (UChar32) 0x006A )) {
michael@0 258 c=(UChar32) 0x004A;
michael@0 259 destIndex=appendResult(dest, destIndex, destCapacity, c, s);
michael@0 260 titleLimit++;
michael@0 261 }
michael@0 262
michael@0 263 /* lowercase [titleLimit..index[ */
michael@0 264 if(titleLimit<idx) {
michael@0 265 if((csm->options&U_TITLECASE_NO_LOWERCASE)==0) {
michael@0 266 /* Normal operation: Lowercase the rest of the word. */
michael@0 267 destIndex+=
michael@0 268 _caseMap(
michael@0 269 csm, ucase_toFullLower,
michael@0 270 dest+destIndex, destCapacity-destIndex,
michael@0 271 src, &csc,
michael@0 272 titleLimit, idx,
michael@0 273 pErrorCode);
michael@0 274 } else {
michael@0 275 /* Optionally just copy the rest of the word unchanged. */
michael@0 276 length=idx-titleLimit;
michael@0 277 if((destIndex+length)<=destCapacity) {
michael@0 278 uprv_memcpy(dest+destIndex, src+titleLimit, length*U_SIZEOF_UCHAR);
michael@0 279 }
michael@0 280 destIndex+=length;
michael@0 281 }
michael@0 282 }
michael@0 283 }
michael@0 284 }
michael@0 285
michael@0 286 prev=idx;
michael@0 287 }
michael@0 288
michael@0 289 if(destIndex>destCapacity) {
michael@0 290 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
michael@0 291 }
michael@0 292 return destIndex;
michael@0 293 }
michael@0 294
michael@0 295 #endif // !UCONFIG_NO_BREAK_ITERATION
michael@0 296
michael@0 297 /* functions available in the common library (for unistr_case.cpp) */
michael@0 298
michael@0 299 U_CFUNC int32_t U_CALLCONV
michael@0 300 ustrcase_internalToLower(const UCaseMap *csm,
michael@0 301 UChar *dest, int32_t destCapacity,
michael@0 302 const UChar *src, int32_t srcLength,
michael@0 303 UErrorCode *pErrorCode) {
michael@0 304 UCaseContext csc=UCASECONTEXT_INITIALIZER;
michael@0 305 csc.p=(void *)src;
michael@0 306 csc.limit=srcLength;
michael@0 307 return _caseMap(
michael@0 308 csm, ucase_toFullLower,
michael@0 309 dest, destCapacity,
michael@0 310 src, &csc, 0, srcLength,
michael@0 311 pErrorCode);
michael@0 312 }
michael@0 313
michael@0 314 U_CFUNC int32_t U_CALLCONV
michael@0 315 ustrcase_internalToUpper(const UCaseMap *csm,
michael@0 316 UChar *dest, int32_t destCapacity,
michael@0 317 const UChar *src, int32_t srcLength,
michael@0 318 UErrorCode *pErrorCode) {
michael@0 319 UCaseContext csc=UCASECONTEXT_INITIALIZER;
michael@0 320 csc.p=(void *)src;
michael@0 321 csc.limit=srcLength;
michael@0 322 return _caseMap(
michael@0 323 csm, ucase_toFullUpper,
michael@0 324 dest, destCapacity,
michael@0 325 src, &csc, 0, srcLength,
michael@0 326 pErrorCode);
michael@0 327 }
michael@0 328
michael@0 329 static int32_t
michael@0 330 ustr_foldCase(const UCaseProps *csp,
michael@0 331 UChar *dest, int32_t destCapacity,
michael@0 332 const UChar *src, int32_t srcLength,
michael@0 333 uint32_t options,
michael@0 334 UErrorCode *pErrorCode) {
michael@0 335 int32_t srcIndex, destIndex;
michael@0 336
michael@0 337 const UChar *s;
michael@0 338 UChar32 c, c2 = 0;
michael@0 339
michael@0 340 /* case mapping loop */
michael@0 341 srcIndex=destIndex=0;
michael@0 342 while(srcIndex<srcLength) {
michael@0 343 U16_NEXT(src, srcIndex, srcLength, c);
michael@0 344 c=ucase_toFullFolding(csp, c, &s, options);
michael@0 345 if((destIndex<destCapacity) && (c<0 ? (c2=~c)<=0xffff : UCASE_MAX_STRING_LENGTH<c && (c2=c)<=0xffff)) {
michael@0 346 /* fast path version of appendResult() for BMP results */
michael@0 347 dest[destIndex++]=(UChar)c2;
michael@0 348 } else {
michael@0 349 destIndex=appendResult(dest, destIndex, destCapacity, c, s);
michael@0 350 }
michael@0 351 }
michael@0 352
michael@0 353 if(destIndex>destCapacity) {
michael@0 354 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
michael@0 355 }
michael@0 356 return destIndex;
michael@0 357 }
michael@0 358
michael@0 359 U_CFUNC int32_t U_CALLCONV
michael@0 360 ustrcase_internalFold(const UCaseMap *csm,
michael@0 361 UChar *dest, int32_t destCapacity,
michael@0 362 const UChar *src, int32_t srcLength,
michael@0 363 UErrorCode *pErrorCode) {
michael@0 364 return ustr_foldCase(csm->csp, dest, destCapacity, src, srcLength, csm->options, pErrorCode);
michael@0 365 }
michael@0 366
michael@0 367 U_CFUNC int32_t
michael@0 368 ustrcase_map(const UCaseMap *csm,
michael@0 369 UChar *dest, int32_t destCapacity,
michael@0 370 const UChar *src, int32_t srcLength,
michael@0 371 UStringCaseMapper *stringCaseMapper,
michael@0 372 UErrorCode *pErrorCode) {
michael@0 373 UChar buffer[300];
michael@0 374 UChar *temp;
michael@0 375
michael@0 376 int32_t destLength;
michael@0 377
michael@0 378 /* check argument values */
michael@0 379 if(U_FAILURE(*pErrorCode)) {
michael@0 380 return 0;
michael@0 381 }
michael@0 382 if( destCapacity<0 ||
michael@0 383 (dest==NULL && destCapacity>0) ||
michael@0 384 src==NULL ||
michael@0 385 srcLength<-1
michael@0 386 ) {
michael@0 387 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
michael@0 388 return 0;
michael@0 389 }
michael@0 390
michael@0 391 /* get the string length */
michael@0 392 if(srcLength==-1) {
michael@0 393 srcLength=u_strlen(src);
michael@0 394 }
michael@0 395
michael@0 396 /* check for overlapping source and destination */
michael@0 397 if( dest!=NULL &&
michael@0 398 ((src>=dest && src<(dest+destCapacity)) ||
michael@0 399 (dest>=src && dest<(src+srcLength)))
michael@0 400 ) {
michael@0 401 /* overlap: provide a temporary destination buffer and later copy the result */
michael@0 402 if(destCapacity<=LENGTHOF(buffer)) {
michael@0 403 /* the stack buffer is large enough */
michael@0 404 temp=buffer;
michael@0 405 } else {
michael@0 406 /* allocate a buffer */
michael@0 407 temp=(UChar *)uprv_malloc(destCapacity*U_SIZEOF_UCHAR);
michael@0 408 if(temp==NULL) {
michael@0 409 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
michael@0 410 return 0;
michael@0 411 }
michael@0 412 }
michael@0 413 } else {
michael@0 414 temp=dest;
michael@0 415 }
michael@0 416
michael@0 417 destLength=stringCaseMapper(csm, temp, destCapacity, src, srcLength, pErrorCode);
michael@0 418 if(temp!=dest) {
michael@0 419 /* copy the result string to the destination buffer */
michael@0 420 if(destLength>0) {
michael@0 421 int32_t copyLength= destLength<=destCapacity ? destLength : destCapacity;
michael@0 422 if(copyLength>0) {
michael@0 423 uprv_memmove(dest, temp, copyLength*U_SIZEOF_UCHAR);
michael@0 424 }
michael@0 425 }
michael@0 426 if(temp!=buffer) {
michael@0 427 uprv_free(temp);
michael@0 428 }
michael@0 429 }
michael@0 430
michael@0 431 return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
michael@0 432 }
michael@0 433
michael@0 434 /* public API functions */
michael@0 435
michael@0 436 U_CAPI int32_t U_EXPORT2
michael@0 437 u_strFoldCase(UChar *dest, int32_t destCapacity,
michael@0 438 const UChar *src, int32_t srcLength,
michael@0 439 uint32_t options,
michael@0 440 UErrorCode *pErrorCode) {
michael@0 441 UCaseMap csm=UCASEMAP_INITIALIZER;
michael@0 442 csm.csp=ucase_getSingleton();
michael@0 443 csm.options=options;
michael@0 444 return ustrcase_map(
michael@0 445 &csm,
michael@0 446 dest, destCapacity,
michael@0 447 src, srcLength,
michael@0 448 ustrcase_internalFold, pErrorCode);
michael@0 449 }
michael@0 450
michael@0 451 /* case-insensitive string comparisons -------------------------------------- */
michael@0 452
michael@0 453 /*
michael@0 454 * This function is a copy of unorm_cmpEquivFold() minus the parts for
michael@0 455 * canonical equivalence.
michael@0 456 * Keep the functions in sync, and see there for how this works.
michael@0 457 * The duplication is for modularization:
michael@0 458 * It makes caseless (but not canonical caseless) matches independent of
michael@0 459 * the normalization code.
michael@0 460 */
michael@0 461
michael@0 462 /* stack element for previous-level source/decomposition pointers */
michael@0 463 struct CmpEquivLevel {
michael@0 464 const UChar *start, *s, *limit;
michael@0 465 };
michael@0 466 typedef struct CmpEquivLevel CmpEquivLevel;
michael@0 467
michael@0 468 /* internal function */
/*
 * Compares s1 and s2 code unit by code unit. When two code units differ,
 * the code points they belong to are case-folded with ucase_toFullFolding()
 * and the comparison continues on the folded text.
 *
 * s1/length1, s2/length2  the strings; a length of -1 means NUL-terminated
 *                         (the limit pointer is then NULL and reading stops
 *                         at the first 0 code unit)
 * options                 passed to ucase_toFullFolding(); in addition
 *                         _STRNCMP_STYLE makes a 0 code unit end a string
 *                         even when a length was given, and
 *                         U_COMPARE_CODE_POINT_ORDER enables the surrogate
 *                         fix-up before the final subtraction
 * pErrorCode              if it already indicates a failure, 0 is returned
 *
 * Returns 0 if both strings end at the same time, -1 if string 1 ends
 * first, 1 if string 2 ends first, and otherwise the difference c1-c2 of
 * the first code units that still differ after folding.
 *
 * Folding is only pushed while the level is 0, and only stack element [0]
 * is written, so each string is at level 0 (source) or level 1 (fold buffer).
 */
michael@0 469 U_CFUNC int32_t
michael@0 470 u_strcmpFold(const UChar *s1, int32_t length1,
michael@0 471 const UChar *s2, int32_t length2,
michael@0 472 uint32_t options,
michael@0 473 UErrorCode *pErrorCode) {
michael@0 474 const UCaseProps *csp;
michael@0 475
michael@0 476 /* current-level start/limit - s1/s2 as current */
michael@0 477 const UChar *start1, *start2, *limit1, *limit2;
michael@0 478
michael@0 479 /* case folding variables */
michael@0 480 const UChar *p;
michael@0 481 int32_t length;
michael@0 482
michael@0 483 /* stacks of previous-level start/current/limit */
michael@0 484 CmpEquivLevel stack1[2], stack2[2];
michael@0 485
michael@0 486 /* case folding buffers, only use current-level start/limit */
michael@0 487 UChar fold1[UCASE_MAX_STRING_LENGTH+1], fold2[UCASE_MAX_STRING_LENGTH+1];
michael@0 488
michael@0 489 /* track which is the current level per string */
michael@0 490 int32_t level1, level2;
michael@0 491
michael@0 492 /* current code units, and code points for lookups */
michael@0 493 UChar32 c1, c2, cp1, cp2;
michael@0 494
michael@0 495 /* no argument error checking because this itself is not an API */
michael@0 496
michael@0 497 /*
michael@0 498 * assume that at least the option U_COMPARE_IGNORE_CASE is set
michael@0 499 * otherwise this function would have to behave exactly as uprv_strCompare()
michael@0 500 */
michael@0 501 csp=ucase_getSingleton();
michael@0 502 if(U_FAILURE(*pErrorCode)) {
michael@0 503 return 0;
michael@0 504 }
michael@0 505
michael@0 506 /* initialize */
/* a NULL limit marks a NUL-terminated string */
michael@0 507 start1=s1;
michael@0 508 if(length1==-1) {
michael@0 509 limit1=NULL;
michael@0 510 } else {
michael@0 511 limit1=s1+length1;
michael@0 512 }
michael@0 513
michael@0 514 start2=s2;
michael@0 515 if(length2==-1) {
michael@0 516 limit2=NULL;
michael@0 517 } else {
michael@0 518 limit2=s2+length2;
michael@0 519 }
michael@0 520
michael@0 521 level1=level2=0;
michael@0 522 c1=c2=-1;
michael@0 523
michael@0 524 /* comparison loop */
michael@0 525 for(;;) {
michael@0 526 /*
michael@0 527 * here a code unit value of -1 means "get another code unit"
michael@0 528 * below it will mean "this source is finished"
michael@0 529 */
michael@0 530
michael@0 531 if(c1<0) {
michael@0 532 /* get next code unit from string 1, post-increment */
michael@0 533 for(;;) {
michael@0 534 if(s1==limit1 || ((c1=*s1)==0 && (limit1==NULL || (options&_STRNCMP_STYLE)))) {
michael@0 535 if(level1==0) {
michael@0 536 c1=-1;
michael@0 537 break;
michael@0 538 }
michael@0 539 } else {
michael@0 540 ++s1;
michael@0 541 break;
michael@0 542 }
michael@0 543
michael@0 544 /* reached end of level buffer, pop one level */
michael@0 545 do {
michael@0 546 --level1;
michael@0 547 start1=stack1[level1].start; /*Not uninitialized*/
michael@0 548 } while(start1==NULL);
michael@0 549 s1=stack1[level1].s; /*Not uninitialized*/
michael@0 550 limit1=stack1[level1].limit; /*Not uninitialized*/
michael@0 551 }
michael@0 552 }
michael@0 553
michael@0 554 if(c2<0) {
michael@0 555 /* get next code unit from string 2, post-increment */
michael@0 556 for(;;) {
michael@0 557 if(s2==limit2 || ((c2=*s2)==0 && (limit2==NULL || (options&_STRNCMP_STYLE)))) {
michael@0 558 if(level2==0) {
michael@0 559 c2=-1;
michael@0 560 break;
michael@0 561 }
michael@0 562 } else {
michael@0 563 ++s2;
michael@0 564 break;
michael@0 565 }
michael@0 566
michael@0 567 /* reached end of level buffer, pop one level */
michael@0 568 do {
michael@0 569 --level2;
michael@0 570 start2=stack2[level2].start; /*Not uninitialized*/
michael@0 571 } while(start2==NULL);
michael@0 572 s2=stack2[level2].s; /*Not uninitialized*/
michael@0 573 limit2=stack2[level2].limit; /*Not uninitialized*/
michael@0 574 }
michael@0 575 }
michael@0 576
michael@0 577 /*
michael@0 578 * compare c1 and c2
michael@0 579 * either variable c1, c2 is -1 only if the corresponding string is finished
michael@0 580 */
michael@0 581 if(c1==c2) {
michael@0 582 if(c1<0) {
michael@0 583 return 0; /* c1==c2==-1 indicating end of strings */
michael@0 584 }
michael@0 585 c1=c2=-1; /* make us fetch new code units */
michael@0 586 continue;
michael@0 587 } else if(c1<0) {
michael@0 588 return -1; /* string 1 ends before string 2 */
michael@0 589 } else if(c2<0) {
michael@0 590 return 1; /* string 2 ends before string 1 */
michael@0 591 }
michael@0 592 /* c1!=c2 && c1>=0 && c2>=0 */
michael@0 593
michael@0 594 /* get complete code points for c1, c2 for lookups if either is a surrogate */
/* note: s1/s2 already point past c1/c2, so the unit before c1 is at s1-2 */
michael@0 595 cp1=c1;
michael@0 596 if(U_IS_SURROGATE(c1)) {
michael@0 597 UChar c;
michael@0 598
michael@0 599 if(U_IS_SURROGATE_LEAD(c1)) {
michael@0 600 if(s1!=limit1 && U16_IS_TRAIL(c=*s1)) {
michael@0 601 /* advance ++s1; only below if cp1 decomposes/case-folds */
michael@0 602 cp1=U16_GET_SUPPLEMENTARY(c1, c);
michael@0 603 }
michael@0 604 } else /* isTrail(c1) */ {
michael@0 605 if(start1<=(s1-2) && U16_IS_LEAD(c=*(s1-2))) {
michael@0 606 cp1=U16_GET_SUPPLEMENTARY(c, c1);
michael@0 607 }
michael@0 608 }
michael@0 609 }
michael@0 610
michael@0 611 cp2=c2;
michael@0 612 if(U_IS_SURROGATE(c2)) {
michael@0 613 UChar c;
michael@0 614
michael@0 615 if(U_IS_SURROGATE_LEAD(c2)) {
michael@0 616 if(s2!=limit2 && U16_IS_TRAIL(c=*s2)) {
michael@0 617 /* advance ++s2; only below if cp2 decomposes/case-folds */
michael@0 618 cp2=U16_GET_SUPPLEMENTARY(c2, c);
michael@0 619 }
michael@0 620 } else /* isTrail(c2) */ {
michael@0 621 if(start2<=(s2-2) && U16_IS_LEAD(c=*(s2-2))) {
michael@0 622 cp2=U16_GET_SUPPLEMENTARY(c, c2);
michael@0 623 }
michael@0 624 }
michael@0 625 }
michael@0 626
michael@0 627 /*
michael@0 628 * go down one level for each string
michael@0 629 * continue with the main loop as soon as there is a real change
michael@0 630 */
michael@0 631
/* string 1 is tried first; string 2 is only folded if string 1 did not change */
michael@0 632 if( level1==0 &&
michael@0 633 (length=ucase_toFullFolding(csp, (UChar32)cp1, &p, options))>=0
michael@0 634 ) {
michael@0 635 /* cp1 case-folds to the code point "length" or to p[length] */
michael@0 636 if(U_IS_SURROGATE(c1)) {
michael@0 637 if(U_IS_SURROGATE_LEAD(c1)) {
michael@0 638 /* advance beyond source surrogate pair if it case-folds */
michael@0 639 ++s1;
michael@0 640 } else /* isTrail(c1) */ {
michael@0 641 /*
michael@0 642 * we got a supplementary code point when hitting its trail surrogate,
michael@0 643 * therefore the lead surrogate must have been the same as in the other string;
michael@0 644 * compare this decomposition with the lead surrogate in the other string
michael@0 645 * remember that this simulates bulk text replacement:
michael@0 646 * the decomposition would replace the entire code point
michael@0 647 */
michael@0 648 --s2;
michael@0 649 c2=*(s2-1);
michael@0 650 }
michael@0 651 }
michael@0 652
michael@0 653 /* push current level pointers */
michael@0 654 stack1[0].start=start1;
michael@0 655 stack1[0].s=s1;
michael@0 656 stack1[0].limit=limit1;
michael@0 657 ++level1;
michael@0 658
michael@0 659 /* copy the folding result to fold1[] */
michael@0 660 if(length<=UCASE_MAX_STRING_LENGTH) {
michael@0 661 u_memcpy(fold1, p, length);
michael@0 662 } else {
michael@0 663 int32_t i=0;
michael@0 664 U16_APPEND_UNSAFE(fold1, i, length);
michael@0 665 length=i;
michael@0 666 }
michael@0 667
michael@0 668 /* set next level pointers to case folding */
michael@0 669 start1=s1=fold1;
michael@0 670 limit1=fold1+length;
michael@0 671
michael@0 672 /* get ready to read from decomposition, continue with loop */
michael@0 673 c1=-1;
michael@0 674 continue;
michael@0 675 }
michael@0 676
michael@0 677 if( level2==0 &&
michael@0 678 (length=ucase_toFullFolding(csp, (UChar32)cp2, &p, options))>=0
michael@0 679 ) {
michael@0 680 /* cp2 case-folds to the code point "length" or to p[length] */
michael@0 681 if(U_IS_SURROGATE(c2)) {
michael@0 682 if(U_IS_SURROGATE_LEAD(c2)) {
michael@0 683 /* advance beyond source surrogate pair if it case-folds */
michael@0 684 ++s2;
michael@0 685 } else /* isTrail(c2) */ {
michael@0 686 /*
michael@0 687 * we got a supplementary code point when hitting its trail surrogate,
michael@0 688 * therefore the lead surrogate must have been the same as in the other string;
michael@0 689 * compare this decomposition with the lead surrogate in the other string
michael@0 690 * remember that this simulates bulk text replacement:
michael@0 691 * the decomposition would replace the entire code point
michael@0 692 */
michael@0 693 --s1;
michael@0 694 c1=*(s1-1);
michael@0 695 }
michael@0 696 }
michael@0 697
michael@0 698 /* push current level pointers */
michael@0 699 stack2[0].start=start2;
michael@0 700 stack2[0].s=s2;
michael@0 701 stack2[0].limit=limit2;
michael@0 702 ++level2;
michael@0 703
michael@0 704 /* copy the folding result to fold2[] */
michael@0 705 if(length<=UCASE_MAX_STRING_LENGTH) {
michael@0 706 u_memcpy(fold2, p, length);
michael@0 707 } else {
michael@0 708 int32_t i=0;
michael@0 709 U16_APPEND_UNSAFE(fold2, i, length);
michael@0 710 length=i;
michael@0 711 }
michael@0 712
michael@0 713 /* set next level pointers to case folding */
michael@0 714 start2=s2=fold2;
michael@0 715 limit2=fold2+length;
michael@0 716
michael@0 717 /* get ready to read from decomposition, continue with loop */
michael@0 718 c2=-1;
michael@0 719 continue;
michael@0 720 }
michael@0 721
michael@0 722 /*
michael@0 723 * no decomposition/case folding, max level for both sides:
michael@0 724 * return difference result
michael@0 725 *
michael@0 726 * code point order comparison must not just return cp1-cp2
michael@0 727 * because when single surrogates are present then the surrogate pairs
michael@0 728 * that formed cp1 and cp2 may be from different string indexes
michael@0 729 *
michael@0 730 * example: { d800 d800 dc01 } vs. { d800 dc00 }, compare at second code units
michael@0 731 * c1=d800 cp1=10001 c2=dc00 cp2=10000
michael@0 732 * cp1-cp2>0 but c1-c2<0 and in fact in UTF-32 it is { d800 10001 } < { 10000 }
michael@0 733 *
michael@0 734 * therefore, use same fix-up as in ustring.c/uprv_strCompare()
michael@0 735 * except: uprv_strCompare() fetches c=*s while this functions fetches c=*s++
michael@0 736 * so we have slightly different pointer/start/limit comparisons here
michael@0 737 */
michael@0 738
michael@0 739 if(c1>=0xd800 && c2>=0xd800 && (options&U_COMPARE_CODE_POINT_ORDER)) {
michael@0 740 /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
michael@0 741 if(
michael@0 742 (c1<=0xdbff && s1!=limit1 && U16_IS_TRAIL(*s1)) ||
michael@0 743 (U16_IS_TRAIL(c1) && start1!=(s1-1) && U16_IS_LEAD(*(s1-2)))
michael@0 744 ) {
michael@0 745 /* part of a surrogate pair, leave >=d800 */
michael@0 746 } else {
michael@0 747 /* BMP code point - may be surrogate code point - make <d800 */
michael@0 748 c1-=0x2800;
michael@0 749 }
michael@0 750
michael@0 751 if(
michael@0 752 (c2<=0xdbff && s2!=limit2 && U16_IS_TRAIL(*s2)) ||
michael@0 753 (U16_IS_TRAIL(c2) && start2!=(s2-1) && U16_IS_LEAD(*(s2-2)))
michael@0 754 ) {
michael@0 755 /* part of a surrogate pair, leave >=d800 */
michael@0 756 } else {
michael@0 757 /* BMP code point - may be surrogate code point - make <d800 */
michael@0 758 c2-=0x2800;
michael@0 759 }
michael@0 760 }
michael@0 761
michael@0 762 return c1-c2;
michael@0 763 }
michael@0 764 }
michael@0 765
michael@0 766 /* public API functions */
michael@0 767
michael@0 768 U_CAPI int32_t U_EXPORT2
michael@0 769 u_strCaseCompare(const UChar *s1, int32_t length1,
michael@0 770 const UChar *s2, int32_t length2,
michael@0 771 uint32_t options,
michael@0 772 UErrorCode *pErrorCode) {
michael@0 773 /* argument checking */
michael@0 774 if(pErrorCode==0 || U_FAILURE(*pErrorCode)) {
michael@0 775 return 0;
michael@0 776 }
michael@0 777 if(s1==NULL || length1<-1 || s2==NULL || length2<-1) {
michael@0 778 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
michael@0 779 return 0;
michael@0 780 }
michael@0 781 return u_strcmpFold(s1, length1, s2, length2,
michael@0 782 options|U_COMPARE_IGNORE_CASE,
michael@0 783 pErrorCode);
michael@0 784 }
michael@0 785
michael@0 786 U_CAPI int32_t U_EXPORT2
michael@0 787 u_strcasecmp(const UChar *s1, const UChar *s2, uint32_t options) {
michael@0 788 UErrorCode errorCode=U_ZERO_ERROR;
michael@0 789 return u_strcmpFold(s1, -1, s2, -1,
michael@0 790 options|U_COMPARE_IGNORE_CASE,
michael@0 791 &errorCode);
michael@0 792 }
michael@0 793
michael@0 794 U_CAPI int32_t U_EXPORT2
michael@0 795 u_memcasecmp(const UChar *s1, const UChar *s2, int32_t length, uint32_t options) {
michael@0 796 UErrorCode errorCode=U_ZERO_ERROR;
michael@0 797 return u_strcmpFold(s1, length, s2, length,
michael@0 798 options|U_COMPARE_IGNORE_CASE,
michael@0 799 &errorCode);
michael@0 800 }
michael@0 801
michael@0 802 U_CAPI int32_t U_EXPORT2
michael@0 803 u_strncasecmp(const UChar *s1, const UChar *s2, int32_t n, uint32_t options) {
michael@0 804 UErrorCode errorCode=U_ZERO_ERROR;
michael@0 805 return u_strcmpFold(s1, n, s2, n,
michael@0 806 options|(U_COMPARE_IGNORE_CASE|_STRNCMP_STYLE),
michael@0 807 &errorCode);
michael@0 808 }

mercurial