intl/icu/source/common/unormcmp.cpp

Sat, 03 Jan 2015 20:18:00 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Sat, 03 Jan 2015 20:18:00 +0100
branch
TOR_BUG_3246
changeset 7
129ffea94266
permissions
-rw-r--r--

Conditionally enable double key logic according to:
private browsing mode or privacy.thirdparty.isolate preference and
implement in GetCookieStringCommon and FindCookie where it counts...
With some reservations of how to convince FindCookie users to test
condition and pass a nullptr when disabling double key logic.

michael@0 1 /*
michael@0 2 *******************************************************************************
michael@0 3 *
michael@0 4 * Copyright (C) 2001-2011, International Business Machines
michael@0 5 * Corporation and others. All Rights Reserved.
michael@0 6 *
michael@0 7 *******************************************************************************
michael@0 8 * file name: unormcmp.cpp
michael@0 9 * encoding: US-ASCII
michael@0 10 * tab size: 8 (not used)
michael@0 11 * indentation:4
michael@0 12 *
michael@0 13 * created on: 2004sep13
michael@0 14 * created by: Markus W. Scherer
michael@0 15 *
michael@0 16 * unorm_compare() function moved here from unorm.cpp for better modularization.
michael@0 17 * Depends on both normalization and case folding.
michael@0 18 * Allows unorm.cpp to not depend on any character properties code.
michael@0 19 */
michael@0 20
michael@0 21 #include "unicode/utypes.h"
michael@0 22
michael@0 23 #if !UCONFIG_NO_NORMALIZATION
michael@0 24
michael@0 25 #include "unicode/unorm.h"
michael@0 26 #include "unicode/ustring.h"
michael@0 27 #include "cmemory.h"
michael@0 28 #include "normalizer2impl.h"
michael@0 29 #include "ucase.h"
michael@0 30 #include "uprops.h"
michael@0 31 #include "ustr_imp.h"
michael@0 32
michael@0 33 U_NAMESPACE_USE
michael@0 34
michael@0 35 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
michael@0 36
michael@0 37 /* compare canonically equivalent ------------------------------------------- */
michael@0 38
michael@0 39 /*
michael@0 40 * Compare two strings for canonical equivalence.
michael@0 41 * Further options include case-insensitive comparison and
michael@0 42 * code point order (as opposed to code unit order).
michael@0 43 *
michael@0 44 * In this function, canonical equivalence is optional as well.
michael@0 45 * If canonical equivalence is tested, then both strings must fulfill
michael@0 46 * the FCD check.
michael@0 47 *
michael@0 48 * Semantically, this is equivalent to
michael@0 49 * strcmp[CodePointOrder](NFD(foldCase(s1)), NFD(foldCase(s2)))
michael@0 50 * where code point order, NFD and foldCase are all optional.
michael@0 51 *
michael@0 52 * String comparisons almost always yield results before processing both strings
michael@0 53 * completely.
michael@0 54 * They are generally more efficient working incrementally instead of
michael@0 55 * performing the sub-processing (strlen, normalization, case-folding)
michael@0 56 * on the entire strings first.
michael@0 57 *
michael@0 58 * It is also unnecessary to normalize identical characters.
michael@0 59 *
michael@0 60 * This function works in principle as follows:
michael@0 61 *
michael@0 62 * loop {
michael@0 63 * get one code unit c1 from s1 (-1 if end of source)
michael@0 64 * get one code unit c2 from s2 (-1 if end of source)
michael@0 65 *
michael@0 66 * if(either string finished) {
michael@0 67 * return result;
michael@0 68 * }
michael@0 69 * if(c1==c2) {
michael@0 70 * continue;
michael@0 71 * }
michael@0 72 *
michael@0 73 * // c1!=c2
michael@0 74 * try to decompose/case-fold c1/c2, and continue if one does;
michael@0 75 *
michael@0 76 * // still c1!=c2 and neither decomposes/case-folds, return result
michael@0 77 * return c1-c2;
michael@0 78 * }
michael@0 79 *
michael@0 80 * When a character decomposes, then the pointer for that source changes to
michael@0 81 * the decomposition, pushing the previous pointer onto a stack.
michael@0 82 * When the end of the decomposition is reached, then the code unit reader
michael@0 83 * pops the previous source from the stack.
michael@0 84 * (Same for case-folding.)
michael@0 85 *
michael@0 86 * This is complicated further by operating on variable-width UTF-16.
michael@0 87 * The top part of the loop works on code units, while lookups for decomposition
michael@0 88 * and case-folding need code points.
michael@0 89 * Code points are assembled after the equality/end-of-source part.
michael@0 90 * The source pointer is only advanced beyond all code units when the code point
michael@0 91 * actually decomposes/case-folds.
michael@0 92 *
michael@0 93 * If we were on a trail surrogate unit when assembling a code point,
michael@0 94 * and the code point decomposes/case-folds, then the decomposition/folding
michael@0 95 * result must be compared with the part of the other string that corresponds to
michael@0 96 * this string's lead surrogate.
michael@0 97 * Since we only assemble a code point when hitting a trail unit when the
michael@0 98 * preceding lead units were identical, we back up the other string by one unit
michael@0 99 * in such a case.
michael@0 100 *
michael@0 101 * The optional code point order comparison at the end works with
michael@0 102 * the same fix-up as the other code point order comparison functions.
michael@0 103 * See ustring.c and the comment near the end of this function.
michael@0 104 *
michael@0 105 * Assumption: A decomposition or case-folding result string never contains
michael@0 106 * a single surrogate. This is a safe assumption in the Unicode Standard.
michael@0 107 * Therefore, we do not need to check for surrogate pairs across
michael@0 108 * decomposition/case-folding boundaries.
michael@0 109 *
michael@0 110 * Further assumptions (see the verifications in tstnorm.cpp):
michael@0 111 * The API function checks for FCD first, while the core function
michael@0 112 * first case-folds and then decomposes. This requires that case-folding does not
michael@0 113 * un-FCD any strings.
michael@0 114 *
michael@0 115 * The API function may also NFD the input and turn off decomposition.
michael@0 116 * This requires that case-folding does not un-NFD strings either.
michael@0 117 *
michael@0 118 * TODO If any of the above two assumptions is violated,
michael@0 119 * then this entire code must be re-thought.
michael@0 120 * If this happens, then a simple solution is to case-fold both strings up front
michael@0 121 * and to turn off UNORM_INPUT_IS_FCD.
michael@0 122 * We already do this when not both strings are in FCD because makeFCD
michael@0 123 * would be a partial NFD before the case folding, which does not work.
michael@0 124 * Note that all of this is only a problem when case-folding _and_
michael@0 125 * canonical equivalence come together.
michael@0 126 * (Comments in unorm_compare() are more up to date than this TODO.)
michael@0 127 */
michael@0 128
michael@0 129 /* stack element for previous-level source/decomposition pointers */
michael@0 130 struct CmpEquivLevel {
/*
 * start - saved start of the suspended level; unorm_cmpEquivFold() stores NULL
 *         here to mark a skipped (empty) level that is passed over when popping
 * s     - saved read position at which the suspended level resumes
 * limit - saved end pointer of the suspended level
 *         (NULL when the level-0 input was given with length -1)
 */
michael@0 131 const UChar *start, *s, *limit;
michael@0 132 };
michael@0 133 typedef struct CmpEquivLevel CmpEquivLevel;
michael@0 134
michael@0 135 /**
michael@0 136 * Internal option for unorm_cmpEquivFold() for decomposing.
michael@0 137 * If not set, just do strcasecmp().
michael@0 138 */
michael@0 139 #define _COMPARE_EQUIV 0x80000
michael@0 140
michael@0 141 /* internal function */
/*
 * Compares s1 with s2 incrementally, one UTF-16 code unit at a time, and only
 * case-folds and/or decomposes at positions where the two strings differ.
 *
 * s1, s2            strings to compare (not checked for NULL here)
 * length1, length2  string lengths, or -1 for NUL-terminated input
 * options           bit set; this function tests _COMPARE_EQUIV (decompose),
 *                   U_COMPARE_IGNORE_CASE (case-fold), _STRNCMP_STYLE (also stop
 *                   at a NUL when a length was given) and
 *                   U_COMPARE_CODE_POINT_ORDER; the value is also passed on to
 *                   ucase_toFullFolding()
 * pErrorCode        in/out error code; dereferenced without a NULL check
 *
 * Returns 0 if the strings compare equal or if *pErrorCode indicates a failure,
 * -1 if s1 ends first, 1 if s2 ends first, and otherwise the difference of the
 * first pair of code units that differ after folding/decomposition
 * (adjusted for code point order if U_COMPARE_CODE_POINT_ORDER is set).
 */
michael@0 142 static int32_t
michael@0 143 unorm_cmpEquivFold(const UChar *s1, int32_t length1,
michael@0 144 const UChar *s2, int32_t length2,
michael@0 145 uint32_t options,
michael@0 146 UErrorCode *pErrorCode) {
michael@0 147 const Normalizer2Impl *nfcImpl;
michael@0 148 const UCaseProps *csp;
michael@0 149
michael@0 150 /* current-level start/limit - s1/s2 as current */
michael@0 151 const UChar *start1, *start2, *limit1, *limit2;
michael@0 152
michael@0 153 /* decomposition and case folding variables */
michael@0 154 const UChar *p;
michael@0 155 int32_t length;
michael@0 156
michael@0 157 /* stacks of previous-level start/current/limit */
/*
 * two entries per string are enough: case folding only happens at level 0 and
 * always pushes to index 0; decomposition pushes at the current level (0 or 1)
 */
michael@0 158 CmpEquivLevel stack1[2], stack2[2];
michael@0 159
michael@0 160 /* buffers for algorithmic decompositions */
michael@0 161 UChar decomp1[4], decomp2[4];
michael@0 162
michael@0 163 /* case folding buffers, only use current-level start/limit */
michael@0 164 UChar fold1[UCASE_MAX_STRING_LENGTH+1], fold2[UCASE_MAX_STRING_LENGTH+1];
michael@0 165
michael@0 166 /* track which is the current level per string */
michael@0 167 int32_t level1, level2;
michael@0 168
michael@0 169 /* current code units, and code points for lookups */
michael@0 170 UChar32 c1, c2, cp1, cp2;
michael@0 171
michael@0 172 /* no argument error checking because this itself is not an API */
michael@0 173
michael@0 174 /*
michael@0 175 * assume that at least one of the options _COMPARE_EQUIV and U_COMPARE_IGNORE_CASE is set
michael@0 176 * otherwise this function must behave exactly as uprv_strCompare()
michael@0 177 * not checking for that here makes testing this function easier
michael@0 178 */
michael@0 179
michael@0 180 /* normalization/properties data loaded? */
michael@0 181 if((options&_COMPARE_EQUIV)!=0) {
michael@0 182 nfcImpl=Normalizer2Factory::getNFCImpl(*pErrorCode);
michael@0 183 } else {
michael@0 184 nfcImpl=NULL;
michael@0 185 }
michael@0 186 if((options&U_COMPARE_IGNORE_CASE)!=0) {
michael@0 187 csp=ucase_getSingleton();
michael@0 188 } else {
michael@0 189 csp=NULL;
michael@0 190 }
michael@0 191 if(U_FAILURE(*pErrorCode)) {
michael@0 192 return 0;
michael@0 193 }
michael@0 194
michael@0 195 /* initialize */
/* a NULL limit means "NUL-terminated": the reader below then stops at a 0 code unit */
michael@0 196 start1=s1;
michael@0 197 if(length1==-1) {
michael@0 198 limit1=NULL;
michael@0 199 } else {
michael@0 200 limit1=s1+length1;
michael@0 201 }
michael@0 202
michael@0 203 start2=s2;
michael@0 204 if(length2==-1) {
michael@0 205 limit2=NULL;
michael@0 206 } else {
michael@0 207 limit2=s2+length2;
michael@0 208 }
michael@0 209
michael@0 210 level1=level2=0;
michael@0 211 c1=c2=-1;
michael@0 212
michael@0 213 /* comparison loop */
michael@0 214 for(;;) {
michael@0 215 /*
michael@0 216 * here a code unit value of -1 means "get another code unit"
michael@0 217 * below it will mean "this source is finished"
michael@0 218 */
michael@0 219
michael@0 220 if(c1<0) {
michael@0 221 /* get next code unit from string 1, post-increment */
michael@0 222 for(;;) {
michael@0 223 if(s1==limit1 || ((c1=*s1)==0 && (limit1==NULL || (options&_STRNCMP_STYLE)))) {
michael@0 224 if(level1==0) {
michael@0 225 c1=-1;
michael@0 226 break;
michael@0 227 }
michael@0 228 } else {
michael@0 229 ++s1;
michael@0 230 break;
michael@0 231 }
michael@0 232
michael@0 233 /* reached end of level buffer, pop one level */
/* entries whose start is NULL are empty placeholder levels and are skipped */
michael@0 234 do {
michael@0 235 --level1;
michael@0 236 start1=stack1[level1].start; /*Not uninitialized*/
michael@0 237 } while(start1==NULL);
michael@0 238 s1=stack1[level1].s; /*Not uninitialized*/
michael@0 239 limit1=stack1[level1].limit; /*Not uninitialized*/
michael@0 240 }
michael@0 241 }
michael@0 242
michael@0 243 if(c2<0) {
michael@0 244 /* get next code unit from string 2, post-increment */
michael@0 245 for(;;) {
michael@0 246 if(s2==limit2 || ((c2=*s2)==0 && (limit2==NULL || (options&_STRNCMP_STYLE)))) {
michael@0 247 if(level2==0) {
michael@0 248 c2=-1;
michael@0 249 break;
michael@0 250 }
michael@0 251 } else {
michael@0 252 ++s2;
michael@0 253 break;
michael@0 254 }
michael@0 255
michael@0 256 /* reached end of level buffer, pop one level */
/* entries whose start is NULL are empty placeholder levels and are skipped */
michael@0 257 do {
michael@0 258 --level2;
michael@0 259 start2=stack2[level2].start; /*Not uninitialized*/
michael@0 260 } while(start2==NULL);
michael@0 261 s2=stack2[level2].s; /*Not uninitialized*/
michael@0 262 limit2=stack2[level2].limit; /*Not uninitialized*/
michael@0 263 }
michael@0 264 }
michael@0 265
michael@0 266 /*
michael@0 267 * compare c1 and c2
michael@0 268 * either variable c1, c2 is -1 only if the corresponding string is finished
michael@0 269 */
michael@0 270 if(c1==c2) {
michael@0 271 if(c1<0) {
michael@0 272 return 0; /* c1==c2==-1 indicating end of strings */
michael@0 273 }
michael@0 274 c1=c2=-1; /* make us fetch new code units */
michael@0 275 continue;
michael@0 276 } else if(c1<0) {
michael@0 277 return -1; /* string 1 ends before string 2 */
michael@0 278 } else if(c2<0) {
michael@0 279 return 1; /* string 2 ends before string 1 */
michael@0 280 }
michael@0 281 /* c1!=c2 && c1>=0 && c2>=0 */
michael@0 282
michael@0 283 /* get complete code points for c1, c2 for lookups if either is a surrogate */
michael@0 284 cp1=c1;
michael@0 285 if(U_IS_SURROGATE(c1)) {
michael@0 286 UChar c;
michael@0 287
michael@0 288 if(U_IS_SURROGATE_LEAD(c1)) {
michael@0 289 if(s1!=limit1 && U16_IS_TRAIL(c=*s1)) {
michael@0 290 /* advance ++s1; only below if cp1 decomposes/case-folds */
michael@0 291 cp1=U16_GET_SUPPLEMENTARY(c1, c);
michael@0 292 }
michael@0 293 } else /* isTrail(c1) */ {
/* s1 was post-incremented, so c1 is at s1-1 and a preceding lead unit would be at s1-2 */
michael@0 294 if(start1<=(s1-2) && U16_IS_LEAD(c=*(s1-2))) {
michael@0 295 cp1=U16_GET_SUPPLEMENTARY(c, c1);
michael@0 296 }
michael@0 297 }
michael@0 298 }
michael@0 299
michael@0 300 cp2=c2;
michael@0 301 if(U_IS_SURROGATE(c2)) {
michael@0 302 UChar c;
michael@0 303
michael@0 304 if(U_IS_SURROGATE_LEAD(c2)) {
michael@0 305 if(s2!=limit2 && U16_IS_TRAIL(c=*s2)) {
michael@0 306 /* advance ++s2; only below if cp2 decomposes/case-folds */
michael@0 307 cp2=U16_GET_SUPPLEMENTARY(c2, c);
michael@0 308 }
michael@0 309 } else /* isTrail(c2) */ {
/* s2 was post-incremented, so c2 is at s2-1 and a preceding lead unit would be at s2-2 */
michael@0 310 if(start2<=(s2-2) && U16_IS_LEAD(c=*(s2-2))) {
michael@0 311 cp2=U16_GET_SUPPLEMENTARY(c, c2);
michael@0 312 }
michael@0 313 }
michael@0 314 }
michael@0 315
michael@0 316 /*
michael@0 317 * go down one level for each string
michael@0 318 * continue with the main loop as soon as there is a real change
michael@0 319 */
michael@0 320
michael@0 321 if( level1==0 && (options&U_COMPARE_IGNORE_CASE) &&
michael@0 322 (length=ucase_toFullFolding(csp, (UChar32)cp1, &p, options))>=0
michael@0 323 ) {
michael@0 324 /* cp1 case-folds to the code point "length" or to p[length] */
michael@0 325 if(U_IS_SURROGATE(c1)) {
michael@0 326 if(U_IS_SURROGATE_LEAD(c1)) {
michael@0 327 /* advance beyond source surrogate pair if it case-folds */
michael@0 328 ++s1;
michael@0 329 } else /* isTrail(c1) */ {
michael@0 330 /*
michael@0 331 * we got a supplementary code point when hitting its trail surrogate,
michael@0 332 * therefore the lead surrogate must have been the same as in the other string;
michael@0 333 * compare this case folding result with the lead surrogate in the other string
michael@0 334 * remember that this simulates bulk text replacement:
michael@0 335 * the case folding result would replace the entire code point
michael@0 336 */
michael@0 337 --s2;
michael@0 338 c2=*(s2-1);
michael@0 339 }
michael@0 340 }
michael@0 341
michael@0 342 /* push current level pointers */
michael@0 343 stack1[0].start=start1;
michael@0 344 stack1[0].s=s1;
michael@0 345 stack1[0].limit=limit1;
michael@0 346 ++level1;
michael@0 347
michael@0 348 /* copy the folding result to fold1[] */
/* a larger "length" is itself the folded code point and is encoded as UTF-16 */
michael@0 349 if(length<=UCASE_MAX_STRING_LENGTH) {
michael@0 350 u_memcpy(fold1, p, length);
michael@0 351 } else {
michael@0 352 int32_t i=0;
michael@0 353 U16_APPEND_UNSAFE(fold1, i, length);
michael@0 354 length=i;
michael@0 355 }
michael@0 356
michael@0 357 /* set next level pointers to case folding */
michael@0 358 start1=s1=fold1;
michael@0 359 limit1=fold1+length;
michael@0 360
michael@0 361 /* get ready to read from the case folding result, continue with loop */
michael@0 362 c1=-1;
michael@0 363 continue;
michael@0 364 }
michael@0 365
michael@0 366 if( level2==0 && (options&U_COMPARE_IGNORE_CASE) &&
michael@0 367 (length=ucase_toFullFolding(csp, (UChar32)cp2, &p, options))>=0
michael@0 368 ) {
michael@0 369 /* cp2 case-folds to the code point "length" or to p[length] */
michael@0 370 if(U_IS_SURROGATE(c2)) {
michael@0 371 if(U_IS_SURROGATE_LEAD(c2)) {
michael@0 372 /* advance beyond source surrogate pair if it case-folds */
michael@0 373 ++s2;
michael@0 374 } else /* isTrail(c2) */ {
michael@0 375 /*
michael@0 376 * we got a supplementary code point when hitting its trail surrogate,
michael@0 377 * therefore the lead surrogate must have been the same as in the other string;
michael@0 378 * compare this case folding result with the lead surrogate in the other string
michael@0 379 * remember that this simulates bulk text replacement:
michael@0 380 * the case folding result would replace the entire code point
michael@0 381 */
michael@0 382 --s1;
michael@0 383 c1=*(s1-1);
michael@0 384 }
michael@0 385 }
michael@0 386
michael@0 387 /* push current level pointers */
michael@0 388 stack2[0].start=start2;
michael@0 389 stack2[0].s=s2;
michael@0 390 stack2[0].limit=limit2;
michael@0 391 ++level2;
michael@0 392
michael@0 393 /* copy the folding result to fold2[] */
/* a larger "length" is itself the folded code point and is encoded as UTF-16 */
michael@0 394 if(length<=UCASE_MAX_STRING_LENGTH) {
michael@0 395 u_memcpy(fold2, p, length);
michael@0 396 } else {
michael@0 397 int32_t i=0;
michael@0 398 U16_APPEND_UNSAFE(fold2, i, length);
michael@0 399 length=i;
michael@0 400 }
michael@0 401
michael@0 402 /* set next level pointers to case folding */
michael@0 403 start2=s2=fold2;
michael@0 404 limit2=fold2+length;
michael@0 405
michael@0 406 /* get ready to read from the case folding result, continue with loop */
michael@0 407 c2=-1;
michael@0 408 continue;
michael@0 409 }
michael@0 410
michael@0 411 if( level1<2 && (options&_COMPARE_EQUIV) &&
michael@0 412 0!=(p=nfcImpl->getDecomposition((UChar32)cp1, decomp1, length))
michael@0 413 ) {
michael@0 414 /* cp1 decomposes into p[length] */
michael@0 415 if(U_IS_SURROGATE(c1)) {
michael@0 416 if(U_IS_SURROGATE_LEAD(c1)) {
michael@0 417 /* advance beyond source surrogate pair if it decomposes */
michael@0 418 ++s1;
michael@0 419 } else /* isTrail(c1) */ {
michael@0 420 /*
michael@0 421 * we got a supplementary code point when hitting its trail surrogate,
michael@0 422 * therefore the lead surrogate must have been the same as in the other string;
michael@0 423 * compare this decomposition with the lead surrogate in the other string
michael@0 424 * remember that this simulates bulk text replacement:
michael@0 425 * the decomposition would replace the entire code point
michael@0 426 */
michael@0 427 --s2;
michael@0 428 c2=*(s2-1);
michael@0 429 }
michael@0 430 }
michael@0 431
michael@0 432 /* push current level pointers */
michael@0 433 stack1[level1].start=start1;
michael@0 434 stack1[level1].s=s1;
michael@0 435 stack1[level1].limit=limit1;
michael@0 436 ++level1;
michael@0 437
michael@0 438 /* set empty intermediate level if skipped */
/* this leaves level1 at 2, so the text of a decomposition is never decomposed again */
michael@0 439 if(level1<2) {
michael@0 440 stack1[level1++].start=NULL;
michael@0 441 }
michael@0 442
michael@0 443 /* set next level pointers to decomposition */
michael@0 444 start1=s1=p;
michael@0 445 limit1=p+length;
michael@0 446
michael@0 447 /* get ready to read from decomposition, continue with loop */
michael@0 448 c1=-1;
michael@0 449 continue;
michael@0 450 }
michael@0 451
michael@0 452 if( level2<2 && (options&_COMPARE_EQUIV) &&
michael@0 453 0!=(p=nfcImpl->getDecomposition((UChar32)cp2, decomp2, length))
michael@0 454 ) {
michael@0 455 /* cp2 decomposes into p[length] */
michael@0 456 if(U_IS_SURROGATE(c2)) {
michael@0 457 if(U_IS_SURROGATE_LEAD(c2)) {
michael@0 458 /* advance beyond source surrogate pair if it decomposes */
michael@0 459 ++s2;
michael@0 460 } else /* isTrail(c2) */ {
michael@0 461 /*
michael@0 462 * we got a supplementary code point when hitting its trail surrogate,
michael@0 463 * therefore the lead surrogate must have been the same as in the other string;
michael@0 464 * compare this decomposition with the lead surrogate in the other string
michael@0 465 * remember that this simulates bulk text replacement:
michael@0 466 * the decomposition would replace the entire code point
michael@0 467 */
michael@0 468 --s1;
michael@0 469 c1=*(s1-1);
michael@0 470 }
michael@0 471 }
michael@0 472
michael@0 473 /* push current level pointers */
michael@0 474 stack2[level2].start=start2;
michael@0 475 stack2[level2].s=s2;
michael@0 476 stack2[level2].limit=limit2;
michael@0 477 ++level2;
michael@0 478
michael@0 479 /* set empty intermediate level if skipped */
/* this leaves level2 at 2, so the text of a decomposition is never decomposed again */
michael@0 480 if(level2<2) {
michael@0 481 stack2[level2++].start=NULL;
michael@0 482 }
michael@0 483
michael@0 484 /* set next level pointers to decomposition */
michael@0 485 start2=s2=p;
michael@0 486 limit2=p+length;
michael@0 487
michael@0 488 /* get ready to read from decomposition, continue with loop */
michael@0 489 c2=-1;
michael@0 490 continue;
michael@0 491 }
michael@0 492
michael@0 493 /*
michael@0 494 * no decomposition/case folding, max level for both sides:
michael@0 495 * return difference result
michael@0 496 *
michael@0 497 * code point order comparison must not just return cp1-cp2
michael@0 498 * because when single surrogates are present then the surrogate pairs
michael@0 499 * that formed cp1 and cp2 may be from different string indexes
michael@0 500 *
michael@0 501 * example: { d800 d800 dc01 } vs. { d800 dc00 }, compare at second code units
michael@0 502 * c1=d800 cp1=10001 c2=dc00 cp2=10000
michael@0 503 * cp1-cp2>0 but c1-c2<0 and in fact in UTF-32 it is { d800 10001 } < { 10000 }
michael@0 504 *
michael@0 505 * therefore, use same fix-up as in ustring.c/uprv_strCompare()
michael@0 506 * except: uprv_strCompare() fetches c=*s while this functions fetches c=*s++
michael@0 507 * so we have slightly different pointer/start/limit comparisons here
michael@0 508 */
michael@0 509
michael@0 510 if(c1>=0xd800 && c2>=0xd800 && (options&U_COMPARE_CODE_POINT_ORDER)) {
michael@0 511 /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
michael@0 512 if(
michael@0 513 (c1<=0xdbff && s1!=limit1 && U16_IS_TRAIL(*s1)) ||
michael@0 514 (U16_IS_TRAIL(c1) && start1!=(s1-1) && U16_IS_LEAD(*(s1-2)))
michael@0 515 ) {
michael@0 516 /* part of a surrogate pair, leave >=d800 */
michael@0 517 } else {
michael@0 518 /* BMP code point - may be surrogate code point - make <d800 */
michael@0 519 c1-=0x2800;
michael@0 520 }
michael@0 521
michael@0 522 if(
michael@0 523 (c2<=0xdbff && s2!=limit2 && U16_IS_TRAIL(*s2)) ||
michael@0 524 (U16_IS_TRAIL(c2) && start2!=(s2-1) && U16_IS_LEAD(*(s2-2)))
michael@0 525 ) {
michael@0 526 /* part of a surrogate pair, leave >=d800 */
michael@0 527 } else {
michael@0 528 /* BMP code point - may be surrogate code point - make <d800 */
michael@0 529 c2-=0x2800;
michael@0 530 }
michael@0 531 }
michael@0 532
michael@0 533 return c1-c2;
michael@0 534 }
michael@0 535 }
michael@0 536
/*
 * Normalizes s with n2 only if s does not already pass n2's quick check.
 *
 * n2          normalizer used for the quick check and for normalizing
 * s, length   input string; a negative length is passed on as "NUL-terminated"
 * normalized  receives the normalized string, but only when normalization
 *             was actually needed
 * pErrorCode  in/out error code
 *
 * Returns TRUE only if "normalized" was filled in without error. Returns FALSE
 * if the whole string passed the quick check (the caller keeps using s) or if
 * *pErrorCode indicates a failure.
 */
michael@0 537 static
michael@0 538 UBool _normalize(const Normalizer2 *n2, const UChar *s, int32_t length,
michael@0 539 UnicodeString &normalized, UErrorCode *pErrorCode) {
/* NOTE(review): this looks like the read-only aliasing constructor (no copy of s) - confirm */
michael@0 540 UnicodeString str(length<0, s, length);
michael@0 541
michael@0 542 // check whether s fulfills the conditions
michael@0 543 int32_t spanQCYes=n2->spanQuickCheckYes(str, *pErrorCode);
michael@0 544 if (U_FAILURE(*pErrorCode)) {
michael@0 545 return FALSE;
michael@0 546 }
michael@0 547 /*
michael@0 548 * ICU 2.4 had a further optimization:
michael@0 549 * If both strings were not in FCD, then they were both NFD'ed,
michael@0 550 * and the _COMPARE_EQUIV option was turned off.
michael@0 551 * It is not entirely clear that this is valid with the current
michael@0 552 * definition of the canonical caseless match.
michael@0 553 * Therefore, ICU 2.6 removes that optimization.
michael@0 554 */
michael@0 555 if(spanQCYes<str.length()) {
// start from the prefix [0, spanQCYes) that passed the quick check,
// then append the normalized form of the remainder
michael@0 556 UnicodeString unnormalized=str.tempSubString(spanQCYes);
michael@0 557 normalized.setTo(FALSE, str.getBuffer(), spanQCYes);
michael@0 558 n2->normalizeSecondAndAppend(normalized, unnormalized, *pErrorCode);
michael@0 559 if (U_SUCCESS(*pErrorCode)) {
michael@0 560 return TRUE;
michael@0 561 }
michael@0 562 }
michael@0 563 return FALSE;
michael@0 564 }
michael@0 565
/*
 * Exported API: compares two strings for canonical equivalence, with the
 * additional behaviors selected by "options".
 *
 * s1, s2            strings to compare; must not be NULL
 * length1, length2  string lengths, or -1 for NUL-terminated input
 * options           comparison option bits; the bits from
 *                   UNORM_COMPARE_NORM_OPTIONS_SHIFT upwards are normalization
 *                   options. UNORM_INPUT_IS_FCD skips the FCD pass unless
 *                   U_FOLD_CASE_EXCLUDE_SPECIAL_I is also set.
 * pErrorCode        in/out error code; set to U_ILLEGAL_ARGUMENT_ERROR for a
 *                   NULL string or a length below -1
 *
 * Returns the result of unorm_cmpEquivFold(), or 0 if *pErrorCode indicates a
 * failure on entry or after the normalization step.
 *
 * NOTE(review): *pErrorCode is read without a NULL check on pErrorCode - confirm
 * that callers always pass a valid pointer.
 */
michael@0 566 U_CAPI int32_t U_EXPORT2
michael@0 567 unorm_compare(const UChar *s1, int32_t length1,
michael@0 568 const UChar *s2, int32_t length2,
michael@0 569 uint32_t options,
michael@0 570 UErrorCode *pErrorCode) {
michael@0 571 /* argument checking */
michael@0 572 if(U_FAILURE(*pErrorCode)) {
michael@0 573 return 0;
michael@0 574 }
michael@0 575 if(s1==0 || length1<-1 || s2==0 || length2<-1) {
michael@0 576 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
michael@0 577 return 0;
michael@0 578 }
michael@0 579
/*
 * fcd1/fcd2 own the normalized copies; s1/s2 may be re-pointed into their
 * buffers below, so they must stay alive until unorm_cmpEquivFold() returns
 */
michael@0 580 UnicodeString fcd1, fcd2;
michael@0 581 int32_t normOptions=(int32_t)(options>>UNORM_COMPARE_NORM_OPTIONS_SHIFT);
/* this entry point always compares for canonical equivalence */
michael@0 582 options|=_COMPARE_EQUIV;
michael@0 583
michael@0 584 /*
michael@0 585 * UAX #21 Case Mappings, as fixed for Unicode version 4
michael@0 586 * (see Jitterbug 2021), defines a canonical caseless match as
michael@0 587 *
michael@0 588 * A string X is a canonical caseless match
michael@0 589 * for a string Y if and only if
michael@0 590 * NFD(toCasefold(NFD(X))) = NFD(toCasefold(NFD(Y)))
michael@0 591 *
michael@0 592 * For better performance, we check for FCD (or let the caller tell us that
michael@0 593 * both strings are in FCD) for the inner normalization.
michael@0 594 * BasicNormalizerTest::FindFoldFCDExceptions() makes sure that
michael@0 595 * case-folding preserves the FCD-ness of a string.
michael@0 596 * The outer normalization is then only performed by unorm_cmpEquivFold()
michael@0 597 * when there is a difference.
michael@0 598 *
michael@0 599 * Exception: When using the Turkic case-folding option, we do perform
michael@0 600 * full NFD first. This is because in the Turkic case precomposed characters
michael@0 601 * with 0049 capital I or 0069 small i fold differently whether they
michael@0 602 * are first decomposed or not, so an FCD check - a check only for
michael@0 603 * canonical order - is not sufficient.
michael@0 604 */
michael@0 605 if(!(options&UNORM_INPUT_IS_FCD) || (options&U_FOLD_CASE_EXCLUDE_SPECIAL_I)) {
michael@0 606 const Normalizer2 *n2;
michael@0 607 if(options&U_FOLD_CASE_EXCLUDE_SPECIAL_I) {
michael@0 608 n2=Normalizer2Factory::getNFDInstance(*pErrorCode);
michael@0 609 } else {
michael@0 610 n2=Normalizer2Factory::getFCDInstance(*pErrorCode);
michael@0 611 }
michael@0 612 if (U_FAILURE(*pErrorCode)) {
michael@0 613 return 0;
michael@0 614 }
michael@0 615
michael@0 616 if(normOptions&UNORM_UNICODE_3_2) {
/*
 * NOTE(review): *uni32 is dereferenced without checking *pErrorCode or uni32
 * first - confirm that uniset_getUnicode32Instance() cannot return NULL on
 * failure, or add a U_FAILURE check before constructing fn2.
 */
michael@0 617 const UnicodeSet *uni32=uniset_getUnicode32Instance(*pErrorCode);
michael@0 618 FilteredNormalizer2 fn2(*n2, *uni32);
/* _normalize() returns TRUE only when it produced a new string; otherwise keep the input */
michael@0 619 if(_normalize(&fn2, s1, length1, fcd1, pErrorCode)) {
michael@0 620 s1=fcd1.getBuffer();
michael@0 621 length1=fcd1.length();
michael@0 622 }
michael@0 623 if(_normalize(&fn2, s2, length2, fcd2, pErrorCode)) {
michael@0 624 s2=fcd2.getBuffer();
michael@0 625 length2=fcd2.length();
michael@0 626 }
michael@0 627 } else {
/* _normalize() returns TRUE only when it produced a new string; otherwise keep the input */
michael@0 628 if(_normalize(n2, s1, length1, fcd1, pErrorCode)) {
michael@0 629 s1=fcd1.getBuffer();
michael@0 630 length1=fcd1.length();
michael@0 631 }
michael@0 632 if(_normalize(n2, s2, length2, fcd2, pErrorCode)) {
michael@0 633 s2=fcd2.getBuffer();
michael@0 634 length2=fcd2.length();
michael@0 635 }
michael@0 636 }
michael@0 637 }
michael@0 638
/* _normalize() reports failures only through *pErrorCode, so check it here */
michael@0 639 if(U_SUCCESS(*pErrorCode)) {
michael@0 640 return unorm_cmpEquivFold(s1, length1, s2, length2, options, pErrorCode);
michael@0 641 } else {
michael@0 642 return 0;
michael@0 643 }
michael@0 644 }
michael@0 645
michael@0 646 #endif /* #if !UCONFIG_NO_NORMALIZATION */

mercurial