1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/common/unormcmp.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,646 @@ 1.4 +/* 1.5 +******************************************************************************* 1.6 +* 1.7 +* Copyright (C) 2001-2011, International Business Machines 1.8 +* Corporation and others. All Rights Reserved. 1.9 +* 1.10 +******************************************************************************* 1.11 +* file name: unormcmp.cpp 1.12 +* encoding: US-ASCII 1.13 +* tab size: 8 (not used) 1.14 +* indentation:4 1.15 +* 1.16 +* created on: 2004sep13 1.17 +* created by: Markus W. Scherer 1.18 +* 1.19 +* unorm_compare() function moved here from unorm.cpp for better modularization. 1.20 +* Depends on both normalization and case folding. 1.21 +* Allows unorm.cpp to not depend on any character properties code. 1.22 +*/ 1.23 + 1.24 +#include "unicode/utypes.h" 1.25 + 1.26 +#if !UCONFIG_NO_NORMALIZATION 1.27 + 1.28 +#include "unicode/unorm.h" 1.29 +#include "unicode/ustring.h" 1.30 +#include "cmemory.h" 1.31 +#include "normalizer2impl.h" 1.32 +#include "ucase.h" 1.33 +#include "uprops.h" 1.34 +#include "ustr_imp.h" 1.35 + 1.36 +U_NAMESPACE_USE 1.37 + 1.38 +#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 1.39 + 1.40 +/* compare canonically equivalent ------------------------------------------- */ 1.41 + 1.42 +/* 1.43 + * Compare two strings for canonical equivalence. 1.44 + * Further options include case-insensitive comparison and 1.45 + * code point order (as opposed to code unit order). 1.46 + * 1.47 + * In this function, canonical equivalence is optional as well. 1.48 + * If canonical equivalence is tested, then both strings must fulfill 1.49 + * the FCD check. 1.50 + * 1.51 + * Semantically, this is equivalent to 1.52 + * strcmp[CodePointOrder](NFD(foldCase(s1)), NFD(foldCase(s2))) 1.53 + * where code point order, NFD and foldCase are all optional. 
1.54 + * 1.55 + * String comparisons almost always yield results before processing both strings 1.56 + * completely. 1.57 + * They are generally more efficient working incrementally instead of 1.58 + * performing the sub-processing (strlen, normalization, case-folding) 1.59 + * on the entire strings first. 1.60 + * 1.61 + * It is also unnecessary to normalize identical characters. 1.62 + * 1.63 + * This function works in principle as follows: 1.64 + * 1.65 + * loop { 1.66 + * get one code unit c1 from s1 (-1 if end of source) 1.67 + * get one code unit c2 from s2 (-1 if end of source) 1.68 + * 1.69 + * if(either string finished) { 1.70 + * return result; 1.71 + * } 1.72 + * if(c1==c2) { 1.73 + * continue; 1.74 + * } 1.75 + * 1.76 + * // c1!=c2 1.77 + * try to decompose/case-fold c1/c2, and continue if one does; 1.78 + * 1.79 + * // still c1!=c2 and neither decomposes/case-folds, return result 1.80 + * return c1-c2; 1.81 + * } 1.82 + * 1.83 + * When a character decomposes, then the pointer for that source changes to 1.84 + * the decomposition, pushing the previous pointer onto a stack. 1.85 + * When the end of the decomposition is reached, then the code unit reader 1.86 + * pops the previous source from the stack. 1.87 + * (Same for case-folding.) 1.88 + * 1.89 + * This is complicated further by operating on variable-width UTF-16. 1.90 + * The top part of the loop works on code units, while lookups for decomposition 1.91 + * and case-folding need code points. 1.92 + * Code points are assembled after the equality/end-of-source part. 1.93 + * The source pointer is only advanced beyond all code units when the code point 1.94 + * actually decomposes/case-folds. 1.95 + * 1.96 + * If we were on a trail surrogate unit when assembling a code point, 1.97 + * and the code point decomposes/case-folds, then the decomposition/folding 1.98 + * result must be compared with the part of the other string that corresponds to 1.99 + * this string's lead surrogate. 
1.100 + * Since we only assemble a code point when hitting a trail unit when the 1.101 + * preceding lead units were identical, we back up the other string by one unit 1.102 + * in such a case. 1.103 + * 1.104 + * The optional code point order comparison at the end works with 1.105 + * the same fix-up as the other code point order comparison functions. 1.106 + * See ustring.c and the comment near the end of this function. 1.107 + * 1.108 + * Assumption: A decomposition or case-folding result string never contains 1.109 + * a single surrogate. This is a safe assumption in the Unicode Standard. 1.110 + * Therefore, we do not need to check for surrogate pairs across 1.111 + * decomposition/case-folding boundaries. 1.112 + * 1.113 + * Further assumptions (see verifications tstnorm.cpp): 1.114 + * The API function checks for FCD first, while the core function 1.115 + * first case-folds and then decomposes. This requires that case-folding does not 1.116 + * un-FCD any strings. 1.117 + * 1.118 + * The API function may also NFD the input and turn off decomposition. 1.119 + * This requires that case-folding does not un-NFD strings either. 1.120 + * 1.121 + * TODO If any of the above two assumptions is violated, 1.122 + * then this entire code must be re-thought. 1.123 + * If this happens, then a simple solution is to case-fold both strings up front 1.124 + * and to turn off UNORM_INPUT_IS_FCD. 1.125 + * We already do this when not both strings are in FCD because makeFCD 1.126 + * would be a partial NFD before the case folding, which does not work. 1.127 + * Note that all of this is only a problem when case-folding _and_ 1.128 + * canonical equivalence come together. 1.129 + * (Comments in unorm_compare() are more up to date than this TODO.) 
1.130 + */ 1.131 + 1.132 +/* stack element for previous-level source/decomposition pointers */ 1.133 +struct CmpEquivLevel { 1.134 + const UChar *start, *s, *limit; 1.135 +}; 1.136 +typedef struct CmpEquivLevel CmpEquivLevel; 1.137 + 1.138 +/** 1.139 + * Internal option for unorm_cmpEquivFold() for decomposing. 1.140 + * If not set, just do strcasecmp(). 1.141 + */ 1.142 +#define _COMPARE_EQUIV 0x80000 1.143 + 1.144 +/* internal function */ 1.145 +static int32_t 1.146 +unorm_cmpEquivFold(const UChar *s1, int32_t length1, 1.147 + const UChar *s2, int32_t length2, 1.148 + uint32_t options, 1.149 + UErrorCode *pErrorCode) { 1.150 + const Normalizer2Impl *nfcImpl; 1.151 + const UCaseProps *csp; 1.152 + 1.153 + /* current-level start/limit - s1/s2 as current */ 1.154 + const UChar *start1, *start2, *limit1, *limit2; 1.155 + 1.156 + /* decomposition and case folding variables */ 1.157 + const UChar *p; 1.158 + int32_t length; 1.159 + 1.160 + /* stacks of previous-level start/current/limit */ 1.161 + CmpEquivLevel stack1[2], stack2[2]; 1.162 + 1.163 + /* buffers for algorithmic decompositions */ 1.164 + UChar decomp1[4], decomp2[4]; 1.165 + 1.166 + /* case folding buffers, only use current-level start/limit */ 1.167 + UChar fold1[UCASE_MAX_STRING_LENGTH+1], fold2[UCASE_MAX_STRING_LENGTH+1]; 1.168 + 1.169 + /* track which is the current level per string */ 1.170 + int32_t level1, level2; 1.171 + 1.172 + /* current code units, and code points for lookups */ 1.173 + UChar32 c1, c2, cp1, cp2; 1.174 + 1.175 + /* no argument error checking because this itself is not an API */ 1.176 + 1.177 + /* 1.178 + * assume that at least one of the options _COMPARE_EQUIV and U_COMPARE_IGNORE_CASE is set 1.179 + * otherwise this function must behave exactly as uprv_strCompare() 1.180 + * not checking for that here makes testing this function easier 1.181 + */ 1.182 + 1.183 + /* normalization/properties data loaded? 
*/ 1.184 + if((options&_COMPARE_EQUIV)!=0) { 1.185 + nfcImpl=Normalizer2Factory::getNFCImpl(*pErrorCode); 1.186 + } else { 1.187 + nfcImpl=NULL; 1.188 + } 1.189 + if((options&U_COMPARE_IGNORE_CASE)!=0) { 1.190 + csp=ucase_getSingleton(); 1.191 + } else { 1.192 + csp=NULL; 1.193 + } 1.194 + if(U_FAILURE(*pErrorCode)) { 1.195 + return 0; 1.196 + } 1.197 + 1.198 + /* initialize */ 1.199 + start1=s1; 1.200 + if(length1==-1) { 1.201 + limit1=NULL; 1.202 + } else { 1.203 + limit1=s1+length1; 1.204 + } 1.205 + 1.206 + start2=s2; 1.207 + if(length2==-1) { 1.208 + limit2=NULL; 1.209 + } else { 1.210 + limit2=s2+length2; 1.211 + } 1.212 + 1.213 + level1=level2=0; 1.214 + c1=c2=-1; 1.215 + 1.216 + /* comparison loop */ 1.217 + for(;;) { 1.218 + /* 1.219 + * here a code unit value of -1 means "get another code unit" 1.220 + * below it will mean "this source is finished" 1.221 + */ 1.222 + 1.223 + if(c1<0) { 1.224 + /* get next code unit from string 1, post-increment */ 1.225 + for(;;) { 1.226 + if(s1==limit1 || ((c1=*s1)==0 && (limit1==NULL || (options&_STRNCMP_STYLE)))) { 1.227 + if(level1==0) { 1.228 + c1=-1; 1.229 + break; 1.230 + } 1.231 + } else { 1.232 + ++s1; 1.233 + break; 1.234 + } 1.235 + 1.236 + /* reached end of level buffer, pop one level */ 1.237 + do { 1.238 + --level1; 1.239 + start1=stack1[level1].start; /*Not uninitialized*/ 1.240 + } while(start1==NULL); 1.241 + s1=stack1[level1].s; /*Not uninitialized*/ 1.242 + limit1=stack1[level1].limit; /*Not uninitialized*/ 1.243 + } 1.244 + } 1.245 + 1.246 + if(c2<0) { 1.247 + /* get next code unit from string 2, post-increment */ 1.248 + for(;;) { 1.249 + if(s2==limit2 || ((c2=*s2)==0 && (limit2==NULL || (options&_STRNCMP_STYLE)))) { 1.250 + if(level2==0) { 1.251 + c2=-1; 1.252 + break; 1.253 + } 1.254 + } else { 1.255 + ++s2; 1.256 + break; 1.257 + } 1.258 + 1.259 + /* reached end of level buffer, pop one level */ 1.260 + do { 1.261 + --level2; 1.262 + start2=stack2[level2].start; /*Not uninitialized*/ 1.263 + } 
while(start2==NULL); 1.264 + s2=stack2[level2].s; /*Not uninitialized*/ 1.265 + limit2=stack2[level2].limit; /*Not uninitialized*/ 1.266 + } 1.267 + } 1.268 + 1.269 + /* 1.270 + * compare c1 and c2 1.271 + * either variable c1, c2 is -1 only if the corresponding string is finished 1.272 + */ 1.273 + if(c1==c2) { 1.274 + if(c1<0) { 1.275 + return 0; /* c1==c2==-1 indicating end of strings */ 1.276 + } 1.277 + c1=c2=-1; /* make us fetch new code units */ 1.278 + continue; 1.279 + } else if(c1<0) { 1.280 + return -1; /* string 1 ends before string 2 */ 1.281 + } else if(c2<0) { 1.282 + return 1; /* string 2 ends before string 1 */ 1.283 + } 1.284 + /* c1!=c2 && c1>=0 && c2>=0 */ 1.285 + 1.286 + /* get complete code points for c1, c2 for lookups if either is a surrogate */ 1.287 + cp1=c1; 1.288 + if(U_IS_SURROGATE(c1)) { 1.289 + UChar c; 1.290 + 1.291 + if(U_IS_SURROGATE_LEAD(c1)) { 1.292 + if(s1!=limit1 && U16_IS_TRAIL(c=*s1)) { 1.293 + /* advance ++s1; only below if cp1 decomposes/case-folds */ 1.294 + cp1=U16_GET_SUPPLEMENTARY(c1, c); 1.295 + } 1.296 + } else /* isTrail(c1) */ { 1.297 + if(start1<=(s1-2) && U16_IS_LEAD(c=*(s1-2))) { 1.298 + cp1=U16_GET_SUPPLEMENTARY(c, c1); 1.299 + } 1.300 + } 1.301 + } 1.302 + 1.303 + cp2=c2; 1.304 + if(U_IS_SURROGATE(c2)) { 1.305 + UChar c; 1.306 + 1.307 + if(U_IS_SURROGATE_LEAD(c2)) { 1.308 + if(s2!=limit2 && U16_IS_TRAIL(c=*s2)) { 1.309 + /* advance ++s2; only below if cp2 decomposes/case-folds */ 1.310 + cp2=U16_GET_SUPPLEMENTARY(c2, c); 1.311 + } 1.312 + } else /* isTrail(c2) */ { 1.313 + if(start2<=(s2-2) && U16_IS_LEAD(c=*(s2-2))) { 1.314 + cp2=U16_GET_SUPPLEMENTARY(c, c2); 1.315 + } 1.316 + } 1.317 + } 1.318 + 1.319 + /* 1.320 + * go down one level for each string 1.321 + * continue with the main loop as soon as there is a real change 1.322 + */ 1.323 + 1.324 + if( level1==0 && (options&U_COMPARE_IGNORE_CASE) && 1.325 + (length=ucase_toFullFolding(csp, (UChar32)cp1, &p, options))>=0 1.326 + ) { 1.327 + /* cp1 case-folds to 
the code point "length" or to p[length] */ 1.328 + if(U_IS_SURROGATE(c1)) { 1.329 + if(U_IS_SURROGATE_LEAD(c1)) { 1.330 + /* advance beyond source surrogate pair if it case-folds */ 1.331 + ++s1; 1.332 + } else /* isTrail(c1) */ { 1.333 + /* 1.334 + * we got a supplementary code point when hitting its trail surrogate, 1.335 + * therefore the lead surrogate must have been the same as in the other string; 1.336 + * compare this decomposition with the lead surrogate in the other string 1.337 + * remember that this simulates bulk text replacement: 1.338 + * the decomposition would replace the entire code point 1.339 + */ 1.340 + --s2; 1.341 + c2=*(s2-1); 1.342 + } 1.343 + } 1.344 + 1.345 + /* push current level pointers */ 1.346 + stack1[0].start=start1; 1.347 + stack1[0].s=s1; 1.348 + stack1[0].limit=limit1; 1.349 + ++level1; 1.350 + 1.351 + /* copy the folding result to fold1[] */ 1.352 + if(length<=UCASE_MAX_STRING_LENGTH) { 1.353 + u_memcpy(fold1, p, length); 1.354 + } else { 1.355 + int32_t i=0; 1.356 + U16_APPEND_UNSAFE(fold1, i, length); 1.357 + length=i; 1.358 + } 1.359 + 1.360 + /* set next level pointers to case folding */ 1.361 + start1=s1=fold1; 1.362 + limit1=fold1+length; 1.363 + 1.364 + /* get ready to read from decomposition, continue with loop */ 1.365 + c1=-1; 1.366 + continue; 1.367 + } 1.368 + 1.369 + if( level2==0 && (options&U_COMPARE_IGNORE_CASE) && 1.370 + (length=ucase_toFullFolding(csp, (UChar32)cp2, &p, options))>=0 1.371 + ) { 1.372 + /* cp2 case-folds to the code point "length" or to p[length] */ 1.373 + if(U_IS_SURROGATE(c2)) { 1.374 + if(U_IS_SURROGATE_LEAD(c2)) { 1.375 + /* advance beyond source surrogate pair if it case-folds */ 1.376 + ++s2; 1.377 + } else /* isTrail(c2) */ { 1.378 + /* 1.379 + * we got a supplementary code point when hitting its trail surrogate, 1.380 + * therefore the lead surrogate must have been the same as in the other string; 1.381 + * compare this decomposition with the lead surrogate in the other string 1.382 + 
* remember that this simulates bulk text replacement: 1.383 + * the decomposition would replace the entire code point 1.384 + */ 1.385 + --s1; 1.386 + c1=*(s1-1); 1.387 + } 1.388 + } 1.389 + 1.390 + /* push current level pointers */ 1.391 + stack2[0].start=start2; 1.392 + stack2[0].s=s2; 1.393 + stack2[0].limit=limit2; 1.394 + ++level2; 1.395 + 1.396 + /* copy the folding result to fold2[] */ 1.397 + if(length<=UCASE_MAX_STRING_LENGTH) { 1.398 + u_memcpy(fold2, p, length); 1.399 + } else { 1.400 + int32_t i=0; 1.401 + U16_APPEND_UNSAFE(fold2, i, length); 1.402 + length=i; 1.403 + } 1.404 + 1.405 + /* set next level pointers to case folding */ 1.406 + start2=s2=fold2; 1.407 + limit2=fold2+length; 1.408 + 1.409 + /* get ready to read from decomposition, continue with loop */ 1.410 + c2=-1; 1.411 + continue; 1.412 + } 1.413 + 1.414 + if( level1<2 && (options&_COMPARE_EQUIV) && 1.415 + 0!=(p=nfcImpl->getDecomposition((UChar32)cp1, decomp1, length)) 1.416 + ) { 1.417 + /* cp1 decomposes into p[length] */ 1.418 + if(U_IS_SURROGATE(c1)) { 1.419 + if(U_IS_SURROGATE_LEAD(c1)) { 1.420 + /* advance beyond source surrogate pair if it decomposes */ 1.421 + ++s1; 1.422 + } else /* isTrail(c1) */ { 1.423 + /* 1.424 + * we got a supplementary code point when hitting its trail surrogate, 1.425 + * therefore the lead surrogate must have been the same as in the other string; 1.426 + * compare this decomposition with the lead surrogate in the other string 1.427 + * remember that this simulates bulk text replacement: 1.428 + * the decomposition would replace the entire code point 1.429 + */ 1.430 + --s2; 1.431 + c2=*(s2-1); 1.432 + } 1.433 + } 1.434 + 1.435 + /* push current level pointers */ 1.436 + stack1[level1].start=start1; 1.437 + stack1[level1].s=s1; 1.438 + stack1[level1].limit=limit1; 1.439 + ++level1; 1.440 + 1.441 + /* set empty intermediate level if skipped */ 1.442 + if(level1<2) { 1.443 + stack1[level1++].start=NULL; 1.444 + } 1.445 + 1.446 + /* set next level pointers to 
decomposition */ 1.447 + start1=s1=p; 1.448 + limit1=p+length; 1.449 + 1.450 + /* get ready to read from decomposition, continue with loop */ 1.451 + c1=-1; 1.452 + continue; 1.453 + } 1.454 + 1.455 + if( level2<2 && (options&_COMPARE_EQUIV) && 1.456 + 0!=(p=nfcImpl->getDecomposition((UChar32)cp2, decomp2, length)) 1.457 + ) { 1.458 + /* cp2 decomposes into p[length] */ 1.459 + if(U_IS_SURROGATE(c2)) { 1.460 + if(U_IS_SURROGATE_LEAD(c2)) { 1.461 + /* advance beyond source surrogate pair if it decomposes */ 1.462 + ++s2; 1.463 + } else /* isTrail(c2) */ { 1.464 + /* 1.465 + * we got a supplementary code point when hitting its trail surrogate, 1.466 + * therefore the lead surrogate must have been the same as in the other string; 1.467 + * compare this decomposition with the lead surrogate in the other string 1.468 + * remember that this simulates bulk text replacement: 1.469 + * the decomposition would replace the entire code point 1.470 + */ 1.471 + --s1; 1.472 + c1=*(s1-1); 1.473 + } 1.474 + } 1.475 + 1.476 + /* push current level pointers */ 1.477 + stack2[level2].start=start2; 1.478 + stack2[level2].s=s2; 1.479 + stack2[level2].limit=limit2; 1.480 + ++level2; 1.481 + 1.482 + /* set empty intermediate level if skipped */ 1.483 + if(level2<2) { 1.484 + stack2[level2++].start=NULL; 1.485 + } 1.486 + 1.487 + /* set next level pointers to decomposition */ 1.488 + start2=s2=p; 1.489 + limit2=p+length; 1.490 + 1.491 + /* get ready to read from decomposition, continue with loop */ 1.492 + c2=-1; 1.493 + continue; 1.494 + } 1.495 + 1.496 + /* 1.497 + * no decomposition/case folding, max level for both sides: 1.498 + * return difference result 1.499 + * 1.500 + * code point order comparison must not just return cp1-cp2 1.501 + * because when single surrogates are present then the surrogate pairs 1.502 + * that formed cp1 and cp2 may be from different string indexes 1.503 + * 1.504 + * example: { d800 d800 dc01 } vs. 
{ d800 dc00 }, compare at second code units 1.505 + * c1=d800 cp1=10001 c2=dc00 cp2=10000 1.506 + * cp1-cp2>0 but c1-c2<0 and in fact in UTF-32 it is { d800 10001 } < { 10000 } 1.507 + * 1.508 + * therefore, use same fix-up as in ustring.c/uprv_strCompare() 1.509 + * except: uprv_strCompare() fetches c=*s while this functions fetches c=*s++ 1.510 + * so we have slightly different pointer/start/limit comparisons here 1.511 + */ 1.512 + 1.513 + if(c1>=0xd800 && c2>=0xd800 && (options&U_COMPARE_CODE_POINT_ORDER)) { 1.514 + /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */ 1.515 + if( 1.516 + (c1<=0xdbff && s1!=limit1 && U16_IS_TRAIL(*s1)) || 1.517 + (U16_IS_TRAIL(c1) && start1!=(s1-1) && U16_IS_LEAD(*(s1-2))) 1.518 + ) { 1.519 + /* part of a surrogate pair, leave >=d800 */ 1.520 + } else { 1.521 + /* BMP code point - may be surrogate code point - make <d800 */ 1.522 + c1-=0x2800; 1.523 + } 1.524 + 1.525 + if( 1.526 + (c2<=0xdbff && s2!=limit2 && U16_IS_TRAIL(*s2)) || 1.527 + (U16_IS_TRAIL(c2) && start2!=(s2-1) && U16_IS_LEAD(*(s2-2))) 1.528 + ) { 1.529 + /* part of a surrogate pair, leave >=d800 */ 1.530 + } else { 1.531 + /* BMP code point - may be surrogate code point - make <d800 */ 1.532 + c2-=0x2800; 1.533 + } 1.534 + } 1.535 + 1.536 + return c1-c2; 1.537 + } 1.538 +} 1.539 + 1.540 +static 1.541 +UBool _normalize(const Normalizer2 *n2, const UChar *s, int32_t length, 1.542 + UnicodeString &normalized, UErrorCode *pErrorCode) { 1.543 + UnicodeString str(length<0, s, length); 1.544 + 1.545 + // check if s fulfill the conditions 1.546 + int32_t spanQCYes=n2->spanQuickCheckYes(str, *pErrorCode); 1.547 + if (U_FAILURE(*pErrorCode)) { 1.548 + return FALSE; 1.549 + } 1.550 + /* 1.551 + * ICU 2.4 had a further optimization: 1.552 + * If both strings were not in FCD, then they were both NFD'ed, 1.553 + * and the _COMPARE_EQUIV option was turned off. 
1.554 + * It is not entirely clear that this is valid with the current 1.555 + * definition of the canonical caseless match. 1.556 + * Therefore, ICU 2.6 removes that optimization. 1.557 + */ 1.558 + if(spanQCYes<str.length()) { 1.559 + UnicodeString unnormalized=str.tempSubString(spanQCYes); 1.560 + normalized.setTo(FALSE, str.getBuffer(), spanQCYes); 1.561 + n2->normalizeSecondAndAppend(normalized, unnormalized, *pErrorCode); 1.562 + if (U_SUCCESS(*pErrorCode)) { 1.563 + return TRUE; 1.564 + } 1.565 + } 1.566 + return FALSE; 1.567 +} 1.568 + 1.569 +U_CAPI int32_t U_EXPORT2 1.570 +unorm_compare(const UChar *s1, int32_t length1, 1.571 + const UChar *s2, int32_t length2, 1.572 + uint32_t options, 1.573 + UErrorCode *pErrorCode) { 1.574 + /* argument checking */ 1.575 + if(U_FAILURE(*pErrorCode)) { 1.576 + return 0; 1.577 + } 1.578 + if(s1==0 || length1<-1 || s2==0 || length2<-1) { 1.579 + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 1.580 + return 0; 1.581 + } 1.582 + 1.583 + UnicodeString fcd1, fcd2; 1.584 + int32_t normOptions=(int32_t)(options>>UNORM_COMPARE_NORM_OPTIONS_SHIFT); 1.585 + options|=_COMPARE_EQUIV; 1.586 + 1.587 + /* 1.588 + * UAX #21 Case Mappings, as fixed for Unicode version 4 1.589 + * (see Jitterbug 2021), defines a canonical caseless match as 1.590 + * 1.591 + * A string X is a canonical caseless match 1.592 + * for a string Y if and only if 1.593 + * NFD(toCasefold(NFD(X))) = NFD(toCasefold(NFD(Y))) 1.594 + * 1.595 + * For better performance, we check for FCD (or let the caller tell us that 1.596 + * both strings are in FCD) for the inner normalization. 1.597 + * BasicNormalizerTest::FindFoldFCDExceptions() makes sure that 1.598 + * case-folding preserves the FCD-ness of a string. 1.599 + * The outer normalization is then only performed by unorm_cmpEquivFold() 1.600 + * when there is a difference. 1.601 + * 1.602 + * Exception: When using the Turkic case-folding option, we do perform 1.603 + * full NFD first. 
This is because in the Turkic case precomposed characters 1.604 + * with 0049 capital I or 0069 small i fold differently whether they 1.605 + * are first decomposed or not, so an FCD check - a check only for 1.606 + * canonical order - is not sufficient. 1.607 + */ 1.608 + if(!(options&UNORM_INPUT_IS_FCD) || (options&U_FOLD_CASE_EXCLUDE_SPECIAL_I)) { 1.609 + const Normalizer2 *n2; 1.610 + if(options&U_FOLD_CASE_EXCLUDE_SPECIAL_I) { 1.611 + n2=Normalizer2Factory::getNFDInstance(*pErrorCode); 1.612 + } else { 1.613 + n2=Normalizer2Factory::getFCDInstance(*pErrorCode); 1.614 + } 1.615 + if (U_FAILURE(*pErrorCode)) { 1.616 + return 0; 1.617 + } 1.618 + 1.619 + if(normOptions&UNORM_UNICODE_3_2) { 1.620 + const UnicodeSet *uni32=uniset_getUnicode32Instance(*pErrorCode); 1.621 + FilteredNormalizer2 fn2(*n2, *uni32); 1.622 + if(_normalize(&fn2, s1, length1, fcd1, pErrorCode)) { 1.623 + s1=fcd1.getBuffer(); 1.624 + length1=fcd1.length(); 1.625 + } 1.626 + if(_normalize(&fn2, s2, length2, fcd2, pErrorCode)) { 1.627 + s2=fcd2.getBuffer(); 1.628 + length2=fcd2.length(); 1.629 + } 1.630 + } else { 1.631 + if(_normalize(n2, s1, length1, fcd1, pErrorCode)) { 1.632 + s1=fcd1.getBuffer(); 1.633 + length1=fcd1.length(); 1.634 + } 1.635 + if(_normalize(n2, s2, length2, fcd2, pErrorCode)) { 1.636 + s2=fcd2.getBuffer(); 1.637 + length2=fcd2.length(); 1.638 + } 1.639 + } 1.640 + } 1.641 + 1.642 + if(U_SUCCESS(*pErrorCode)) { 1.643 + return unorm_cmpEquivFold(s1, length1, s2, length2, options, pErrorCode); 1.644 + } else { 1.645 + return 0; 1.646 + } 1.647 +} 1.648 + 1.649 +#endif /* #if !UCONFIG_NO_NORMALIZATION */