1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/i18n/usearch.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,4842 @@ 1.4 +/* 1.5 +********************************************************************** 1.6 +* Copyright (C) 2001-2011 IBM and others. All rights reserved. 1.7 +********************************************************************** 1.8 +* Date Name Description 1.9 +* 07/02/2001 synwee Creation. 1.10 +********************************************************************** 1.11 +*/ 1.12 + 1.13 +#include "unicode/utypes.h" 1.14 + 1.15 +#if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION 1.16 + 1.17 +#include "unicode/usearch.h" 1.18 +#include "unicode/ustring.h" 1.19 +#include "unicode/uchar.h" 1.20 +#include "unicode/utf16.h" 1.21 +#include "normalizer2impl.h" 1.22 +#include "ucol_imp.h" 1.23 +#include "usrchimp.h" 1.24 +#include "cmemory.h" 1.25 +#include "ucln_in.h" 1.26 +#include "uassert.h" 1.27 +#include "ustr_imp.h" 1.28 + 1.29 +U_NAMESPACE_USE 1.30 + 1.31 +// don't use Boyer-Moore 1.32 +// (and if we decide to turn this on again there are several new TODOs that will need to be addressed) 1.33 +#define BOYER_MOORE 0 1.34 + 1.35 +#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 1.36 + 1.37 +// internal definition --------------------------------------------------- 1.38 + 1.39 +#define LAST_BYTE_MASK_ 0xFF 1.40 +#define SECOND_LAST_BYTE_SHIFT_ 8 1.41 +#define SUPPLEMENTARY_MIN_VALUE_ 0x10000 1.42 + 1.43 +static const Normalizer2Impl *g_nfcImpl = NULL; 1.44 + 1.45 +// internal methods ------------------------------------------------- 1.46 + 1.47 +/** 1.48 +* Fast collation element iterator setOffset. 1.49 +* This function does not check for bounds. 1.50 +* @param coleiter collation element iterator 1.51 +* @param offset to set 1.52 +*/ 1.53 +static 1.54 +inline void setColEIterOffset(UCollationElements *elems, 1.55 + int32_t offset) 1.56 +{ 1.57 + collIterate *ci = &(elems->iteratordata_); 1.58 + ci->pos = ci->string + offset; 1.59 + ci->CEpos = ci->toReturn = ci->extendCEs ? ci->extendCEs : ci->CEs; 1.60 + if (ci->flags & UCOL_ITER_INNORMBUF) { 1.61 + ci->flags = ci->origFlags; 1.62 + } 1.63 + ci->fcdPosition = NULL; 1.64 + 1.65 + ci->offsetReturn = NULL; 1.66 + ci->offsetStore = ci->offsetBuffer; 1.67 + ci->offsetRepeatCount = ci->offsetRepeatValue = 0; 1.68 +} 1.69 + 1.70 +/** 1.71 +* Getting the mask for collation strength 1.72 +* @param strength collation strength 1.73 +* @return collation element mask 1.74 +*/ 1.75 +static 1.76 +inline uint32_t getMask(UCollationStrength strength) 1.77 +{ 1.78 + switch (strength) 1.79 + { 1.80 + case UCOL_PRIMARY: 1.81 + return UCOL_PRIMARYORDERMASK; 1.82 + case UCOL_SECONDARY: 1.83 + return UCOL_SECONDARYORDERMASK | UCOL_PRIMARYORDERMASK; 1.84 + default: 1.85 + return UCOL_TERTIARYORDERMASK | UCOL_SECONDARYORDERMASK | 1.86 + UCOL_PRIMARYORDERMASK; 1.87 + } 1.88 +} 1.89 + 1.90 +/** 1.91 +* This is to squeeze the 21bit ces into a 256 table 1.92 +* @param ce collation element 1.93 +* @return collapsed version of the collation element 1.94 +*/ 1.95 +static 1.96 +inline int hash(uint32_t ce) 1.97 +{ 1.98 + // the old value UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_ does not work 1.99 + // well with the new collation where most of the latin 1 characters 1.100 + // are of the value xx000xxx. their hashes will most of the time be 0 1.101 + // to be discussed on the hash algo. 1.102 + return UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_; 1.103 +} 1.104 + 1.105 +U_CDECL_BEGIN 1.106 +static UBool U_CALLCONV 1.107 +usearch_cleanup(void) { 1.108 + g_nfcImpl = NULL; 1.109 + return TRUE; 1.110 +} 1.111 +U_CDECL_END 1.112 + 1.113 +/** 1.114 +* Initializing the fcd tables. 1.115 +* Internal method, status assumed to be a success. 1.116 +* @param status output error if any, caller to check status before calling 1.117 +* method, status assumed to be success when passed in. 1.118 +*/ 1.119 +static 1.120 +inline void initializeFCD(UErrorCode *status) 1.121 +{ 1.122 + if (g_nfcImpl == NULL) { 1.123 + g_nfcImpl = Normalizer2Factory::getNFCImpl(*status); 1.124 + ucln_i18n_registerCleanup(UCLN_I18N_USEARCH, usearch_cleanup); 1.125 + } 1.126 +} 1.127 + 1.128 +/** 1.129 +* Gets the fcd value for a character at the argument index. 1.130 +* This method takes into accounts of the supplementary characters. 1.131 +* @param str UTF16 string where character for fcd retrieval resides 1.132 +* @param offset position of the character whose fcd is to be retrieved, to be 1.133 +* overwritten with the next character position, taking 1.134 +* surrogate characters into consideration. 1.135 +* @param strlength length of the argument string 1.136 +* @return fcd value 1.137 +*/ 1.138 +static 1.139 +uint16_t getFCD(const UChar *str, int32_t *offset, 1.140 + int32_t strlength) 1.141 +{ 1.142 + const UChar *temp = str + *offset; 1.143 + uint16_t result = g_nfcImpl->nextFCD16(temp, str + strlength); 1.144 + *offset = (int32_t)(temp - str); 1.145 + return result; 1.146 +} 1.147 + 1.148 +/** 1.149 +* Getting the modified collation elements taking into account the collation 1.150 +* attributes 1.151 +* @param strsrch string search data 1.152 +* @param sourcece 1.153 +* @return the modified collation element 1.154 +*/ 1.155 +static 1.156 +inline int32_t getCE(const UStringSearch *strsrch, uint32_t sourcece) 1.157 +{ 1.158 + // note for tertiary we can't use the collator->tertiaryMask, that 1.159 + // is a preprocessed mask that takes into account case options. since 1.160 + // we are only concerned with exact matches, we don't need that. 1.161 + sourcece &= strsrch->ceMask; 1.162 + 1.163 + if (strsrch->toShift) { 1.164 + // alternate handling here, since only the 16 most significant digits 1.165 + // is only used, we can safely do a compare without masking 1.166 + // if the ce is a variable, we mask and get only the primary values 1.167 + // no shifting to quartenary is required since all primary values 1.168 + // less than variabletop will need to be masked off anyway. 1.169 + if (strsrch->variableTop > sourcece) { 1.170 + if (strsrch->strength >= UCOL_QUATERNARY) { 1.171 + sourcece &= UCOL_PRIMARYORDERMASK; 1.172 + } 1.173 + else { 1.174 + sourcece = UCOL_IGNORABLE; 1.175 + } 1.176 + } 1.177 + } else if (strsrch->strength >= UCOL_QUATERNARY && sourcece == UCOL_IGNORABLE) { 1.178 + sourcece = 0xFFFF; 1.179 + } 1.180 + 1.181 + return sourcece; 1.182 +} 1.183 + 1.184 +/** 1.185 +* Allocate a memory and returns NULL if it failed. 1.186 +* Internal method, status assumed to be a success. 1.187 +* @param size to allocate 1.188 +* @param status output error if any, caller to check status before calling 1.189 +* method, status assumed to be success when passed in. 1.190 +* @return newly allocated array, NULL otherwise 1.191 +*/ 1.192 +static 1.193 +inline void * allocateMemory(uint32_t size, UErrorCode *status) 1.194 +{ 1.195 + uint32_t *result = (uint32_t *)uprv_malloc(size); 1.196 + if (result == NULL) { 1.197 + *status = U_MEMORY_ALLOCATION_ERROR; 1.198 + } 1.199 + return result; 1.200 +} 1.201 + 1.202 +/** 1.203 +* Adds a uint32_t value to a destination array. 1.204 +* Creates a new array if we run out of space. The caller will have to 1.205 +* manually deallocate the newly allocated array. 1.206 +* Internal method, status assumed to be success, caller has to check status 1.207 +* before calling this method. destination not to be NULL and has at least 1.208 +* size destinationlength. 1.209 +* @param destination target array 1.210 +* @param offset destination offset to add value 1.211 +* @param destinationlength target array size, return value for the new size 1.212 +* @param value to be added 1.213 +* @param increments incremental size expected 1.214 +* @param status output error if any, caller to check status before calling 1.215 +* method, status assumed to be success when passed in. 1.216 +* @return new destination array, destination if there was no new allocation 1.217 +*/ 1.218 +static 1.219 +inline int32_t * addTouint32_tArray(int32_t *destination, 1.220 + uint32_t offset, 1.221 + uint32_t *destinationlength, 1.222 + uint32_t value, 1.223 + uint32_t increments, 1.224 + UErrorCode *status) 1.225 +{ 1.226 + uint32_t newlength = *destinationlength; 1.227 + if (offset + 1 == newlength) { 1.228 + newlength += increments; 1.229 + int32_t *temp = (int32_t *)allocateMemory( 1.230 + sizeof(int32_t) * newlength, status); 1.231 + if (U_FAILURE(*status)) { 1.232 + return NULL; 1.233 + } 1.234 + uprv_memcpy(temp, destination, sizeof(int32_t) * offset); 1.235 + *destinationlength = newlength; 1.236 + destination = temp; 1.237 + } 1.238 + destination[offset] = value; 1.239 + return destination; 1.240 +} 1.241 + 1.242 +/** 1.243 +* Adds a uint64_t value to a destination array. 1.244 +* Creates a new array if we run out of space. The caller will have to 1.245 +* manually deallocate the newly allocated array. 1.246 +* Internal method, status assumed to be success, caller has to check status 1.247 +* before calling this method. destination not to be NULL and has at least 1.248 +* size destinationlength. 1.249 +* @param destination target array 1.250 +* @param offset destination offset to add value 1.251 +* @param destinationlength target array size, return value for the new size 1.252 +* @param value to be added 1.253 +* @param increments incremental size expected 1.254 +* @param status output error if any, caller to check status before calling 1.255 +* method, status assumed to be success when passed in. 1.256 +* @return new destination array, destination if there was no new allocation 1.257 +*/ 1.258 +static 1.259 +inline int64_t * addTouint64_tArray(int64_t *destination, 1.260 + uint32_t offset, 1.261 + uint32_t *destinationlength, 1.262 + uint64_t value, 1.263 + uint32_t increments, 1.264 + UErrorCode *status) 1.265 +{ 1.266 + uint32_t newlength = *destinationlength; 1.267 + if (offset + 1 == newlength) { 1.268 + newlength += increments; 1.269 + int64_t *temp = (int64_t *)allocateMemory( 1.270 + sizeof(int64_t) * newlength, status); 1.271 + 1.272 + if (U_FAILURE(*status)) { 1.273 + return NULL; 1.274 + } 1.275 + 1.276 + uprv_memcpy(temp, destination, sizeof(int64_t) * offset); 1.277 + *destinationlength = newlength; 1.278 + destination = temp; 1.279 + } 1.280 + 1.281 + destination[offset] = value; 1.282 + 1.283 + return destination; 1.284 +} 1.285 + 1.286 +/** 1.287 +* Initializing the ce table for a pattern. 1.288 +* Stores non-ignorable collation keys. 1.289 +* Table size will be estimated by the size of the pattern text. Table 1.290 +* expansion will be perform as we go along. Adding 1 to ensure that the table 1.291 +* size definitely increases. 1.292 +* Internal method, status assumed to be a success. 1.293 +* @param strsrch string search data 1.294 +* @param status output error if any, caller to check status before calling 1.295 +* method, status assumed to be success when passed in. 1.296 +* @return total number of expansions 1.297 +*/ 1.298 +static 1.299 +inline uint16_t initializePatternCETable(UStringSearch *strsrch, 1.300 + UErrorCode *status) 1.301 +{ 1.302 + UPattern *pattern = &(strsrch->pattern); 1.303 + uint32_t cetablesize = INITIAL_ARRAY_SIZE_; 1.304 + int32_t *cetable = pattern->CEBuffer; 1.305 + uint32_t patternlength = pattern->textLength; 1.306 + UCollationElements *coleiter = strsrch->utilIter; 1.307 + 1.308 + if (coleiter == NULL) { 1.309 + coleiter = ucol_openElements(strsrch->collator, pattern->text, 1.310 + patternlength, status); 1.311 + // status will be checked in ucol_next(..) later and if it is an 1.312 + // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be 1.313 + // returned. 1.314 + strsrch->utilIter = coleiter; 1.315 + } 1.316 + else { 1.317 + uprv_init_collIterate(strsrch->collator, pattern->text, 1.318 + pattern->textLength, 1.319 + &coleiter->iteratordata_, 1.320 + status); 1.321 + } 1.322 + if(U_FAILURE(*status)) { 1.323 + return 0; 1.324 + } 1.325 + 1.326 + if (pattern->CE != cetable && pattern->CE) { 1.327 + uprv_free(pattern->CE); 1.328 + } 1.329 + 1.330 + uint16_t offset = 0; 1.331 + uint16_t result = 0; 1.332 + int32_t ce; 1.333 + 1.334 + while ((ce = ucol_next(coleiter, status)) != UCOL_NULLORDER && 1.335 + U_SUCCESS(*status)) { 1.336 + uint32_t newce = getCE(strsrch, ce); 1.337 + if (newce) { 1.338 + int32_t *temp = addTouint32_tArray(cetable, offset, &cetablesize, 1.339 + newce, 1.340 + patternlength - ucol_getOffset(coleiter) + 1, 1.341 + status); 1.342 + if (U_FAILURE(*status)) { 1.343 + return 0; 1.344 + } 1.345 + offset ++; 1.346 + if (cetable != temp && cetable != pattern->CEBuffer) { 1.347 + uprv_free(cetable); 1.348 + } 1.349 + cetable = temp; 1.350 + } 1.351 + result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1); 1.352 + } 1.353 + 1.354 + cetable[offset] = 0; 1.355 + pattern->CE = cetable; 1.356 + pattern->CELength = offset; 1.357 + 1.358 + return result; 1.359 +} 1.360 + 1.361 +/** 1.362 +* Initializing the pce table for a pattern. 1.363 +* Stores non-ignorable collation keys. 1.364 +* Table size will be estimated by the size of the pattern text. Table 1.365 +* expansion will be perform as we go along. Adding 1 to ensure that the table 1.366 +* size definitely increases. 1.367 +* Internal method, status assumed to be a success. 1.368 +* @param strsrch string search data 1.369 +* @param status output error if any, caller to check status before calling 1.370 +* method, status assumed to be success when passed in. 1.371 +* @return total number of expansions 1.372 +*/ 1.373 +static 1.374 +inline uint16_t initializePatternPCETable(UStringSearch *strsrch, 1.375 + UErrorCode *status) 1.376 +{ 1.377 + UPattern *pattern = &(strsrch->pattern); 1.378 + uint32_t pcetablesize = INITIAL_ARRAY_SIZE_; 1.379 + int64_t *pcetable = pattern->PCEBuffer; 1.380 + uint32_t patternlength = pattern->textLength; 1.381 + UCollationElements *coleiter = strsrch->utilIter; 1.382 + 1.383 + if (coleiter == NULL) { 1.384 + coleiter = ucol_openElements(strsrch->collator, pattern->text, 1.385 + patternlength, status); 1.386 + // status will be checked in ucol_next(..) later and if it is an 1.387 + // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be 1.388 + // returned. 1.389 + strsrch->utilIter = coleiter; 1.390 + } else { 1.391 + uprv_init_collIterate(strsrch->collator, pattern->text, 1.392 + pattern->textLength, 1.393 + &coleiter->iteratordata_, 1.394 + status); 1.395 + } 1.396 + if(U_FAILURE(*status)) { 1.397 + return 0; 1.398 + } 1.399 + 1.400 + if (pattern->PCE != pcetable && pattern->PCE != NULL) { 1.401 + uprv_free(pattern->PCE); 1.402 + } 1.403 + 1.404 + uint16_t offset = 0; 1.405 + uint16_t result = 0; 1.406 + int64_t pce; 1.407 + 1.408 + uprv_init_pce(coleiter); 1.409 + 1.410 + // ** Should processed CEs be signed or unsigned? 1.411 + // ** (the rest of the code in this file seems to play fast-and-loose with 1.412 + // ** whether a CE is signed or unsigned. For example, look at routine above this one.) 1.413 + while ((pce = ucol_nextProcessed(coleiter, NULL, NULL, status)) != UCOL_PROCESSED_NULLORDER && 1.414 + U_SUCCESS(*status)) { 1.415 + int64_t *temp = addTouint64_tArray(pcetable, offset, &pcetablesize, 1.416 + pce, 1.417 + patternlength - ucol_getOffset(coleiter) + 1, 1.418 + status); 1.419 + 1.420 + if (U_FAILURE(*status)) { 1.421 + return 0; 1.422 + } 1.423 + 1.424 + offset += 1; 1.425 + 1.426 + if (pcetable != temp && pcetable != pattern->PCEBuffer) { 1.427 + uprv_free(pcetable); 1.428 + } 1.429 + 1.430 + pcetable = temp; 1.431 + //result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1); 1.432 + } 1.433 + 1.434 + pcetable[offset] = 0; 1.435 + pattern->PCE = pcetable; 1.436 + pattern->PCELength = offset; 1.437 + 1.438 + return result; 1.439 +} 1.440 + 1.441 +/** 1.442 +* Initializes the pattern struct. 1.443 +* Internal method, status assumed to be success. 1.444 +* @param strsrch UStringSearch data storage 1.445 +* @param status output error if any, caller to check status before calling 1.446 +* method, status assumed to be success when passed in. 1.447 +* @return expansionsize the total expansion size of the pattern 1.448 +*/ 1.449 +static 1.450 +inline int16_t initializePattern(UStringSearch *strsrch, UErrorCode *status) 1.451 +{ 1.452 + UPattern *pattern = &(strsrch->pattern); 1.453 + const UChar *patterntext = pattern->text; 1.454 + int32_t length = pattern->textLength; 1.455 + int32_t index = 0; 1.456 + 1.457 + // Since the strength is primary, accents are ignored in the pattern. 1.458 + if (strsrch->strength == UCOL_PRIMARY) { 1.459 + pattern->hasPrefixAccents = 0; 1.460 + pattern->hasSuffixAccents = 0; 1.461 + } else { 1.462 + pattern->hasPrefixAccents = getFCD(patterntext, &index, length) >> 1.463 + SECOND_LAST_BYTE_SHIFT_; 1.464 + index = length; 1.465 + U16_BACK_1(patterntext, 0, index); 1.466 + pattern->hasSuffixAccents = getFCD(patterntext, &index, length) & 1.467 + LAST_BYTE_MASK_; 1.468 + } 1.469 + 1.470 + // ** HACK ** 1.471 + if (strsrch->pattern.PCE != NULL) { 1.472 + if (strsrch->pattern.PCE != strsrch->pattern.PCEBuffer) { 1.473 + uprv_free(strsrch->pattern.PCE); 1.474 + } 1.475 + 1.476 + strsrch->pattern.PCE = NULL; 1.477 + } 1.478 + 1.479 + // since intializePattern is an internal method status is a success. 1.480 + return initializePatternCETable(strsrch, status); 1.481 +} 1.482 + 1.483 +/** 1.484 +* Initializing shift tables, with the default values. 1.485 +* If a corresponding default value is 0, the shift table is not set. 1.486 +* @param shift table for forwards shift 1.487 +* @param backshift table for backwards shift 1.488 +* @param cetable table containing pattern ce 1.489 +* @param cesize size of the pattern ces 1.490 +* @param expansionsize total size of the expansions 1.491 +* @param defaultforward the default forward value 1.492 +* @param defaultbackward the default backward value 1.493 +*/ 1.494 +static 1.495 +inline void setShiftTable(int16_t shift[], int16_t backshift[], 1.496 + int32_t *cetable, int32_t cesize, 1.497 + int16_t expansionsize, 1.498 + int16_t defaultforward, 1.499 + int16_t defaultbackward) 1.500 +{ 1.501 + // estimate the value to shift. to do that we estimate the smallest 1.502 + // number of characters to give the relevant ces, ie approximately 1.503 + // the number of ces minus their expansion, since expansions can come 1.504 + // from a character. 1.505 + int32_t count; 1.506 + for (count = 0; count < MAX_TABLE_SIZE_; count ++) { 1.507 + shift[count] = defaultforward; 1.508 + } 1.509 + cesize --; // down to the last index 1.510 + for (count = 0; count < cesize; count ++) { 1.511 + // number of ces from right of array to the count 1.512 + int temp = defaultforward - count - 1; 1.513 + shift[hash(cetable[count])] = temp > 1 ? temp : 1; 1.514 + } 1.515 + shift[hash(cetable[cesize])] = 1; 1.516 + // for ignorables we just shift by one. see test examples. 1.517 + shift[hash(0)] = 1; 1.518 + 1.519 + for (count = 0; count < MAX_TABLE_SIZE_; count ++) { 1.520 + backshift[count] = defaultbackward; 1.521 + } 1.522 + for (count = cesize; count > 0; count --) { 1.523 + // the original value count does not seem to work 1.524 + backshift[hash(cetable[count])] = count > expansionsize ? 1.525 + (int16_t)(count - expansionsize) : 1; 1.526 + } 1.527 + backshift[hash(cetable[0])] = 1; 1.528 + backshift[hash(0)] = 1; 1.529 +} 1.530 + 1.531 +/** 1.532 +* Building of the pattern collation element list and the boyer moore strsrch 1.533 +* table. 1.534 +* The canonical match will only be performed after the default match fails. 1.535 +* For both cases we need to remember the size of the composed and decomposed 1.536 +* versions of the string. Since the Boyer-Moore shift calculations shifts by 1.537 +* a number of characters in the text and tries to match the pattern from that 1.538 +* offset, the shift value can not be too large in case we miss some 1.539 +* characters. To choose a right shift size, we estimate the NFC form of the 1.540 +* and use its size as a shift guide. The NFC form should be the small 1.541 +* possible representation of the pattern. Anyways, we'll err on the smaller 1.542 +* shift size. Hence the calculation for minlength. 1.543 +* Canonical match will be performed slightly differently. We'll split the 1.544 +* pattern into 3 parts, the prefix accents (PA), the middle string bounded by 1.545 +* the first and last base character (MS), the ending accents (EA). Matches 1.546 +* will be done on MS first, and only when we match MS then some processing 1.547 +* will be required for the prefix and end accents in order to determine if 1.548 +* they match PA and EA. Hence the default shift values 1.549 +* for the canonical match will take the size of either end's accent into 1.550 +* consideration. Forwards search will take the end accents into consideration 1.551 +* for the default shift values and the backwards search will take the prefix 1.552 +* accents into consideration. 1.553 +* If pattern has no non-ignorable ce, we return a illegal argument error. 1.554 +* Internal method, status assumed to be success. 1.555 +* @param strsrch UStringSearch data storage 1.556 +* @param status for output errors if it occurs, status is assumed to be a 1.557 +* success when it is passed in. 1.558 +*/ 1.559 +static 1.560 +inline void initialize(UStringSearch *strsrch, UErrorCode *status) 1.561 +{ 1.562 + int16_t expandlength = initializePattern(strsrch, status); 1.563 + if (U_SUCCESS(*status) && strsrch->pattern.CELength > 0) { 1.564 + UPattern *pattern = &strsrch->pattern; 1.565 + int32_t cesize = pattern->CELength; 1.566 + 1.567 + int16_t minlength = cesize > expandlength 1.568 + ? (int16_t)cesize - expandlength : 1; 1.569 + pattern->defaultShiftSize = minlength; 1.570 + setShiftTable(pattern->shift, pattern->backShift, pattern->CE, 1.571 + cesize, expandlength, minlength, minlength); 1.572 + return; 1.573 + } 1.574 + strsrch->pattern.defaultShiftSize = 0; 1.575 +} 1.576 + 1.577 +#if BOYER_MOORE 1.578 +/** 1.579 +* Check to make sure that the match length is at the end of the character by 1.580 +* using the breakiterator. 1.581 +* @param strsrch string search data 1.582 +* @param start target text start offset 1.583 +* @param end target text end offset 1.584 +*/ 1.585 +static 1.586 +void checkBreakBoundary(const UStringSearch *strsrch, int32_t * /*start*/, 1.587 + int32_t *end) 1.588 +{ 1.589 +#if !UCONFIG_NO_BREAK_ITERATION 1.590 + UBreakIterator *breakiterator = strsrch->search->internalBreakIter; 1.591 + if (breakiterator) { 1.592 + int32_t matchend = *end; 1.593 + //int32_t matchstart = *start; 1.594 + 1.595 + if (!ubrk_isBoundary(breakiterator, matchend)) { 1.596 + *end = ubrk_following(breakiterator, matchend); 1.597 + } 1.598 + 1.599 + /* Check the start of the matched text to make sure it doesn't have any accents 1.600 + * before it. This code may not be necessary and so it is commented out */ 1.601 + /*if (!ubrk_isBoundary(breakiterator, matchstart) && !ubrk_isBoundary(breakiterator, matchstart-1)) { 1.602 + *start = ubrk_preceding(breakiterator, matchstart); 1.603 + }*/ 1.604 + } 1.605 +#endif 1.606 +} 1.607 + 1.608 +/** 1.609 +* Determine whether the target text in UStringSearch bounded by the offset 1.610 +* start and end is one or more whole units of text as 1.611 +* determined by the breakiterator in UStringSearch. 1.612 +* @param strsrch string search data 1.613 +* @param start target text start offset 1.614 +* @param end target text end offset 1.615 +*/ 1.616 +static 1.617 +UBool isBreakUnit(const UStringSearch *strsrch, int32_t start, 1.618 + int32_t end) 1.619 +{ 1.620 +#if !UCONFIG_NO_BREAK_ITERATION 1.621 + UBreakIterator *breakiterator = strsrch->search->breakIter; 1.622 + //TODO: Add here. 1.623 + if (breakiterator) { 1.624 + int32_t startindex = ubrk_first(breakiterator); 1.625 + int32_t endindex = ubrk_last(breakiterator); 1.626 + 1.627 + // out-of-range indexes are never boundary positions 1.628 + if (start < startindex || start > endindex || 1.629 + end < startindex || end > endindex) { 1.630 + return FALSE; 1.631 + } 1.632 + // otherwise, we can use following() on the position before the 1.633 + // specified one and return true of the position we get back is the 1.634 + // one the user specified 1.635 + UBool result = (start == startindex || 1.636 + ubrk_following(breakiterator, start - 1) == start) && 1.637 + (end == endindex || 1.638 + ubrk_following(breakiterator, end - 1) == end); 1.639 + if (result) { 1.640 + // iterates the individual ces 1.641 + UCollationElements *coleiter = strsrch->utilIter; 1.642 + const UChar *text = strsrch->search->text + 1.643 + start; 1.644 + UErrorCode status = U_ZERO_ERROR; 1.645 + ucol_setText(coleiter, text, end - start, &status); 1.646 + for (int32_t count = 0; count < strsrch->pattern.CELength; 1.647 + count ++) { 1.648 + int32_t ce = getCE(strsrch, ucol_next(coleiter, &status)); 1.649 + if (ce == UCOL_IGNORABLE) { 1.650 + count --; 1.651 + continue; 1.652 + } 1.653 + if (U_FAILURE(status) || ce != strsrch->pattern.CE[count]) { 1.654 + return FALSE; 1.655 + } 1.656 + } 1.657 + int32_t nextce = ucol_next(coleiter, &status); 1.658 + while (ucol_getOffset(coleiter) == (end - start) 1.659 + && getCE(strsrch, nextce) == UCOL_IGNORABLE) { 1.660 + nextce = ucol_next(coleiter, &status); 1.661 + } 1.662 + if (ucol_getOffset(coleiter) == (end - start) 1.663 + && nextce != UCOL_NULLORDER) { 1.664 + // extra collation elements at the end of the match 1.665 + return FALSE; 1.666 + } 1.667 + } 1.668 + return result; 1.669 + } 1.670 +#endif 1.671 + return TRUE; 1.672 +} 1.673 + 1.674 +/** 1.675 +* Getting the next base character offset if current offset is an accent, 1.676 +* or the current offset if the current character contains a base character. 1.677 +* accents the following base character will be returned 1.678 +* @param text string 1.679 +* @param textoffset current offset 1.680 +* @param textlength length of text string 1.681 +* @return the next base character or the current offset 1.682 +* if the current character is contains a base character. 1.683 +*/ 1.684 +static 1.685 +inline int32_t getNextBaseOffset(const UChar *text, 1.686 + int32_t textoffset, 1.687 + int32_t textlength) 1.688 +{ 1.689 + if (textoffset < textlength) { 1.690 + int32_t temp = textoffset; 1.691 + if (getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) { 1.692 + while (temp < textlength) { 1.693 + int32_t result = temp; 1.694 + if ((getFCD(text, &temp, textlength) >> 1.695 + SECOND_LAST_BYTE_SHIFT_) == 0) { 1.696 + return result; 1.697 + } 1.698 + } 1.699 + return textlength; 1.700 + } 1.701 + } 1.702 + return textoffset; 1.703 +} 1.704 + 1.705 +/** 1.706 +* Gets the next base character offset depending on the string search pattern 1.707 +* data 1.708 +* @param strsrch string search data 1.709 +* @param textoffset current offset, one offset away from the last character 1.710 +* to search for. 1.711 +* @return start index of the next base character or the current offset 1.712 +* if the current character is contains a base character. 1.713 +*/ 1.714 +static 1.715 +inline int32_t getNextUStringSearchBaseOffset(UStringSearch *strsrch, 1.716 + int32_t textoffset) 1.717 +{ 1.718 + int32_t textlength = strsrch->search->textLength; 1.719 + if (strsrch->pattern.hasSuffixAccents && 1.720 + textoffset < textlength) { 1.721 + int32_t temp = textoffset; 1.722 + const UChar *text = strsrch->search->text; 1.723 + U16_BACK_1(text, 0, temp); 1.724 + if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) { 1.725 + return getNextBaseOffset(text, textoffset, textlength); 1.726 + } 1.727 + } 1.728 + return textoffset; 1.729 +} 1.730 + 1.731 +/** 1.732 +* Shifting the collation element iterator position forward to prepare for 1.733 +* a following match. If the last character is a unsafe character, we'll only 1.734 +* shift by 1 to capture contractions, normalization etc. 1.735 +* Internal method, status assumed to be success. 1.736 +* @param text strsrch string search data 1.737 +* @param textoffset start text position to do search 1.738 +* @param ce the text ce which failed the match. 1.739 +* @param patternceindex index of the ce within the pattern ce buffer which 1.740 +* failed the match 1.741 +* @return final offset 1.742 +*/ 1.743 +static 1.744 +inline int32_t shiftForward(UStringSearch *strsrch, 1.745 + int32_t textoffset, 1.746 + int32_t ce, 1.747 + int32_t patternceindex) 1.748 +{ 1.749 + UPattern *pattern = &(strsrch->pattern); 1.750 + if (ce != UCOL_NULLORDER) { 1.751 + int32_t shift = pattern->shift[hash(ce)]; 1.752 + // this is to adjust for characters in the middle of the 1.753 + // substring for matching that failed. 1.754 + int32_t adjust = pattern->CELength - patternceindex; 1.755 + if (adjust > 1 && shift >= adjust) { 1.756 + shift -= adjust - 1; 1.757 + } 1.758 + textoffset += shift; 1.759 + } 1.760 + else { 1.761 + textoffset += pattern->defaultShiftSize; 1.762 + } 1.763 + 1.764 + textoffset = getNextUStringSearchBaseOffset(strsrch, textoffset); 1.765 + // check for unsafe characters 1.766 + // * if it is the start or middle of a contraction: to be done after 1.767 + // a initial match is found 1.768 + // * thai or lao base consonant character: similar to contraction 1.769 + // * high surrogate character: similar to contraction 1.770 + // * next character is a accent: shift to the next base character 1.771 + return textoffset; 1.772 +} 1.773 +#endif // #if BOYER_MOORE 1.774 + 1.775 +/** 1.776 +* sets match not found 1.777 +* @param strsrch string search data 1.778 +*/ 1.779 +static 1.780 +inline void setMatchNotFound(UStringSearch *strsrch) 1.781 +{ 1.782 + // this method resets the match result regardless of the error status. 1.783 + strsrch->search->matchedIndex = USEARCH_DONE; 1.784 + strsrch->search->matchedLength = 0; 1.785 + if (strsrch->search->isForwardSearching) { 1.786 + setColEIterOffset(strsrch->textIter, strsrch->search->textLength); 1.787 + } 1.788 + else { 1.789 + setColEIterOffset(strsrch->textIter, 0); 1.790 + } 1.791 +} 1.792 + 1.793 +#if BOYER_MOORE 1.794 +/** 1.795 +* Gets the offset to the next safe point in text. 1.796 +* ie. not the middle of a contraction, swappable characters or supplementary 1.797 +* characters. 1.798 +* @param collator collation sata 1.799 +* @param text string to work with 1.800 +* @param textoffset offset in string 1.801 +* @param textlength length of text string 1.802 +* @return offset to the next safe character 1.803 +*/ 1.804 +static 1.805 +inline int32_t getNextSafeOffset(const UCollator *collator, 1.806 + const UChar *text, 1.807 + int32_t textoffset, 1.808 + int32_t textlength) 1.809 +{ 1.810 + int32_t result = textoffset; // first contraction character 1.811 + while (result != textlength && ucol_unsafeCP(text[result], collator)) { 1.812 + result ++; 1.813 + } 1.814 + return result; 1.815 +} 1.816 + 1.817 +/** 1.818 +* This checks for accents in the potential match started with a . 1.819 +* composite character. 1.820 +* This is really painful... we have to check that composite character do not 1.821 +* have any extra accents. We have to normalize the potential match and find 1.822 +* the immediate decomposed character before the match. 1.823 +* The first composite character would have been taken care of by the fcd 1.824 +* checks in checkForwardExactMatch. 1.825 +* This is the slow path after the fcd of the first character and 1.826 +* the last character has been checked by checkForwardExactMatch and we 1.827 +* determine that the potential match has extra non-ignorable preceding 1.828 +* ces. 1.829 +* E.g. looking for \u0301 acute in \u01FA A ring above and acute, 1.830 +* checkExtraMatchAccent should fail since there is a middle ring in \u01FA 1.831 +* Note here that accents checking are slow and cautioned in the API docs. 1.832 +* Internal method, status assumed to be a success, caller should check status 1.833 +* before calling this method 1.834 +* @param strsrch string search data 1.835 +* @param start index of the potential unfriendly composite character 1.836 +* @param end index of the potential unfriendly composite character 1.837 +* @param status output error status if any. 1.838 +* @return TRUE if there is non-ignorable accents before at the beginning 1.839 +* of the match, FALSE otherwise. 1.840 +*/ 1.841 + 1.842 +static 1.843 +UBool checkExtraMatchAccents(const UStringSearch *strsrch, int32_t start, 1.844 + int32_t end, 1.845 + UErrorCode *status) 1.846 +{ 1.847 + UBool result = FALSE; 1.848 + if (strsrch->pattern.hasPrefixAccents) { 1.849 + int32_t length = end - start; 1.850 + int32_t offset = 0; 1.851 + const UChar *text = strsrch->search->text + start; 1.852 + 1.853 + U16_FWD_1(text, offset, length); 1.854 + // we are only concerned with the first composite character 1.855 + if (unorm_quickCheck(text, offset, UNORM_NFD, status) == UNORM_NO) { 1.856 + int32_t safeoffset = getNextSafeOffset(strsrch->collator, 1.857 + text, 0, length); 1.858 + if (safeoffset != length) { 1.859 + safeoffset ++; 1.860 + } 1.861 + UChar *norm = NULL; 1.862 + UChar buffer[INITIAL_ARRAY_SIZE_]; 1.863 + int32_t size = unorm_normalize(text, safeoffset, UNORM_NFD, 0, 1.864 + buffer, INITIAL_ARRAY_SIZE_, 1.865 + status); 1.866 + if (U_FAILURE(*status)) { 1.867 + return FALSE; 1.868 + } 1.869 + if (size >= INITIAL_ARRAY_SIZE_) { 1.870 + norm = (UChar *)allocateMemory((size + 1) * sizeof(UChar), 1.871 + status); 1.872 + // if allocation failed, status will be set to 1.873 + // U_MEMORY_ALLOCATION_ERROR and unorm_normalize internally 1.874 + // checks for it. 1.875 + size = unorm_normalize(text, safeoffset, UNORM_NFD, 0, norm, 1.876 + size, status); 1.877 + if (U_FAILURE(*status) && norm != NULL) { 1.878 + uprv_free(norm); 1.879 + return FALSE; 1.880 + } 1.881 + } 1.882 + else { 1.883 + norm = buffer; 1.884 + } 1.885 + 1.886 + UCollationElements *coleiter = strsrch->utilIter; 1.887 + ucol_setText(coleiter, norm, size, status); 1.888 + uint32_t firstce = strsrch->pattern.CE[0]; 1.889 + UBool ignorable = TRUE; 1.890 + uint32_t ce = UCOL_IGNORABLE; 1.891 + while (U_SUCCESS(*status) && ce != firstce && ce != (uint32_t)UCOL_NULLORDER) { 1.892 + offset = ucol_getOffset(coleiter); 1.893 + if (ce != firstce && ce != UCOL_IGNORABLE) { 1.894 + ignorable = FALSE; 1.895 + } 1.896 + ce = ucol_next(coleiter, status); 1.897 + } 1.898 + UChar32 codepoint; 1.899 + U16_PREV(norm, 0, offset, codepoint); 1.900 + result = !ignorable && (u_getCombiningClass(codepoint) != 0); 1.901 + 1.902 + if (norm != buffer) { 1.903 + uprv_free(norm); 1.904 + } 1.905 + } 1.906 + } 1.907 + 1.908 + return result; 1.909 +} 1.910 + 1.911 +/** 1.912 +* Used by exact matches, checks if there are accents before the match. 1.913 +* This is really painful... we have to check that composite characters at 1.914 +* the start of the matches have to not have any extra accents. 1.915 +* We check the FCD of the character first, if it starts with an accent and 1.916 +* the first pattern ce does not match the first ce of the character, we bail. 1.917 +* Otherwise we try normalizing the first composite 1.918 +* character and find the immediate decomposed character before the match to 1.919 +* see if it is an non-ignorable accent. 1.920 +* Now normalizing the first composite character is enough because we ensure 1.921 +* that when the match is passed in here with extra beginning ces, the 1.922 +* first or last ce that match has to occur within the first character. 1.923 +* E.g. looking for \u0301 acute in \u01FA A ring above and acute, 1.924 +* checkExtraMatchAccent should fail since there is a middle ring in \u01FA 1.925 +* Note here that accents checking are slow and cautioned in the API docs. 1.926 +* @param strsrch string search data 1.927 +* @param start offset 1.928 +* @param end offset 1.929 +* @return TRUE if there are accents on either side of the match, 1.930 +* FALSE otherwise 1.931 +*/ 1.932 +static 1.933 +UBool hasAccentsBeforeMatch(const UStringSearch *strsrch, int32_t start, 1.934 + int32_t end) 1.935 +{ 1.936 + if (strsrch->pattern.hasPrefixAccents) { 1.937 + UCollationElements *coleiter = strsrch->textIter; 1.938 + UErrorCode status = U_ZERO_ERROR; 1.939 + // we have been iterating forwards previously 1.940 + uint32_t ignorable = TRUE; 1.941 + int32_t firstce = strsrch->pattern.CE[0]; 1.942 + 1.943 + setColEIterOffset(coleiter, start); 1.944 + int32_t ce = getCE(strsrch, ucol_next(coleiter, &status)); 1.945 + if (U_FAILURE(status)) { 1.946 + return TRUE; 1.947 + } 1.948 + while (ce != firstce) { 1.949 + if (ce != UCOL_IGNORABLE) { 1.950 + ignorable = FALSE; 1.951 + } 1.952 + ce = getCE(strsrch, ucol_next(coleiter, &status)); 1.953 + if (U_FAILURE(status) || ce == UCOL_NULLORDER) { 1.954 + return TRUE; 1.955 + } 1.956 + } 1.957 + if (!ignorable && inNormBuf(coleiter)) { 1.958 + // within normalization buffer, discontiguous handled here 1.959 + return TRUE; 1.960 + } 1.961 + 1.962 + // within text 1.963 + int32_t temp = start; 1.964 + // original code 1.965 + // accent = (getFCD(strsrch->search->text, &temp, 1.966 + // strsrch->search->textLength) 1.967 + // >> SECOND_LAST_BYTE_SHIFT_); 1.968 + // however this code does not work well with VC7 .net in release mode. 1.969 + // maybe the inlines for getFCD combined with shifting has bugs in 1.970 + // VC7. anyways this is a work around. 1.971 + UBool accent = getFCD(strsrch->search->text, &temp, 1.972 + strsrch->search->textLength) > 0xFF; 1.973 + if (!accent) { 1.974 + return checkExtraMatchAccents(strsrch, start, end, &status); 1.975 + } 1.976 + if (!ignorable) { 1.977 + return TRUE; 1.978 + } 1.979 + if (start > 0) { 1.980 + temp = start; 1.981 + U16_BACK_1(strsrch->search->text, 0, temp); 1.982 + if (getFCD(strsrch->search->text, &temp, 1.983 + strsrch->search->textLength) & LAST_BYTE_MASK_) { 1.984 + setColEIterOffset(coleiter, start); 1.985 + ce = ucol_previous(coleiter, &status); 1.986 + if (U_FAILURE(status) || 1.987 + (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE)) { 1.988 + return TRUE; 1.989 + } 1.990 + } 1.991 + } 1.992 + } 1.993 + 1.994 + return FALSE; 1.995 +} 1.996 + 1.997 +/** 1.998 +* Used by exact matches, checks if there are accents bounding the match. 1.999 +* Note this is the initial boundary check. If the potential match 1.1000 +* starts or ends with composite characters, the accents in those 1.1001 +* characters will be determined later. 1.1002 +* Not doing backwards iteration here, since discontiguos contraction for 1.1003 +* backwards collation element iterator, use up too many characters. 1.1004 +* E.g. looking for \u030A ring in \u01FA A ring above and acute, 1.1005 +* should fail since there is a acute at the end of \u01FA 1.1006 +* Note here that accents checking are slow and cautioned in the API docs. 1.1007 +* @param strsrch string search data 1.1008 +* @param start offset of match 1.1009 +* @param end end offset of the match 1.1010 +* @return TRUE if there are accents on either side of the match, 1.1011 +* FALSE otherwise 1.1012 +*/ 1.1013 +static 1.1014 +UBool hasAccentsAfterMatch(const UStringSearch *strsrch, int32_t start, 1.1015 + int32_t end) 1.1016 +{ 1.1017 + if (strsrch->pattern.hasSuffixAccents) { 1.1018 + const UChar *text = strsrch->search->text; 1.1019 + int32_t temp = end; 1.1020 + int32_t textlength = strsrch->search->textLength; 1.1021 + U16_BACK_1(text, 0, temp); 1.1022 + if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) { 1.1023 + int32_t firstce = strsrch->pattern.CE[0]; 1.1024 + UCollationElements *coleiter = strsrch->textIter; 1.1025 + UErrorCode status = U_ZERO_ERROR; 1.1026 + int32_t ce; 1.1027 + setColEIterOffset(coleiter, start); 1.1028 + while ((ce = getCE(strsrch, ucol_next(coleiter, &status))) != firstce) { 1.1029 + if (U_FAILURE(status) || ce == UCOL_NULLORDER) { 1.1030 + return TRUE; 1.1031 + } 1.1032 + } 1.1033 + int32_t count = 1; 1.1034 + while (count < strsrch->pattern.CELength) { 1.1035 + if (getCE(strsrch, ucol_next(coleiter, &status)) 1.1036 + == UCOL_IGNORABLE) { 1.1037 + // Thai can give an ignorable here. 1.1038 + count --; 1.1039 + } 1.1040 + if (U_FAILURE(status)) { 1.1041 + return TRUE; 1.1042 + } 1.1043 + count ++; 1.1044 + } 1.1045 + 1.1046 + ce = ucol_next(coleiter, &status); 1.1047 + if (U_FAILURE(status)) { 1.1048 + return TRUE; 1.1049 + } 1.1050 + if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) { 1.1051 + ce = getCE(strsrch, ce); 1.1052 + } 1.1053 + if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) { 1.1054 + if (ucol_getOffset(coleiter) <= end) { 1.1055 + return TRUE; 1.1056 + } 1.1057 + if (getFCD(text, &end, textlength) >> SECOND_LAST_BYTE_SHIFT_) { 1.1058 + return TRUE; 1.1059 + } 1.1060 + } 1.1061 + } 1.1062 + } 1.1063 + return FALSE; 1.1064 +} 1.1065 +#endif // #if BOYER_MOORE 1.1066 + 1.1067 +/** 1.1068 +* Checks if the offset runs out of the text string 1.1069 +* @param offset 1.1070 +* @param textlength of the text string 1.1071 +* @return TRUE if offset is out of bounds, FALSE otherwise 1.1072 +*/ 1.1073 +static 1.1074 +inline UBool isOutOfBounds(int32_t textlength, int32_t offset) 1.1075 +{ 1.1076 + return offset < 0 || offset > textlength; 1.1077 +} 1.1078 + 1.1079 +/** 1.1080 +* Checks for identical match 1.1081 +* @param strsrch string search data 1.1082 +* @param start offset of possible match 1.1083 +* @param end offset of possible match 1.1084 +* @return TRUE if identical match is found 1.1085 +*/ 1.1086 +static 1.1087 +inline UBool checkIdentical(const UStringSearch *strsrch, int32_t start, 1.1088 + int32_t end) 1.1089 +{ 1.1090 + if (strsrch->strength != UCOL_IDENTICAL) { 1.1091 + return TRUE; 1.1092 + } 1.1093 + 1.1094 + // Note: We could use Normalizer::compare() or similar, but for short strings 1.1095 + // which may not be in FCD it might be faster to just NFD them. 1.1096 + UErrorCode status = U_ZERO_ERROR; 1.1097 + UnicodeString t2, p2; 1.1098 + strsrch->nfd->normalize( 1.1099 + UnicodeString(FALSE, strsrch->search->text + start, end - start), t2, status); 1.1100 + strsrch->nfd->normalize( 1.1101 + UnicodeString(FALSE, strsrch->pattern.text, strsrch->pattern.textLength), p2, status); 1.1102 + // return FALSE if NFD failed 1.1103 + return U_SUCCESS(status) && t2 == p2; 1.1104 +} 1.1105 + 1.1106 +#if BOYER_MOORE 1.1107 +/** 1.1108 +* Checks to see if the match is repeated 1.1109 +* @param strsrch string search data 1.1110 +* @param start new match start index 1.1111 +* @param end new match end index 1.1112 +* @return TRUE if the the match is repeated, FALSE otherwise 1.1113 +*/ 1.1114 +static 1.1115 +inline UBool checkRepeatedMatch(UStringSearch *strsrch, 1.1116 + int32_t start, 1.1117 + int32_t end) 1.1118 +{ 1.1119 + int32_t lastmatchindex = strsrch->search->matchedIndex; 1.1120 + UBool result; 1.1121 + if (lastmatchindex == USEARCH_DONE) { 1.1122 + return FALSE; 1.1123 + } 1.1124 + if (strsrch->search->isForwardSearching) { 1.1125 + result = start <= lastmatchindex; 1.1126 + } 1.1127 + else { 1.1128 + result = start >= lastmatchindex; 1.1129 + } 1.1130 + if (!result && !strsrch->search->isOverlap) { 1.1131 + if (strsrch->search->isForwardSearching) { 1.1132 + result = start < lastmatchindex + strsrch->search->matchedLength; 1.1133 + } 1.1134 + else { 1.1135 + result = end > lastmatchindex; 1.1136 + } 1.1137 + } 1.1138 + return result; 1.1139 +} 1.1140 + 1.1141 +/** 1.1142 +* Gets the collation element iterator's current offset. 1.1143 +* @param coleiter collation element iterator 1.1144 +* @param forwards flag TRUE if we are moving in th forwards direction 1.1145 +* @return current offset 1.1146 +*/ 1.1147 +static 1.1148 +inline int32_t getColElemIterOffset(const UCollationElements *coleiter, 1.1149 + UBool forwards) 1.1150 +{ 1.1151 + int32_t result = ucol_getOffset(coleiter); 1.1152 + // intricacies of the the backwards collation element iterator 1.1153 + if (FALSE && !forwards && inNormBuf(coleiter) && !isFCDPointerNull(coleiter)) { 1.1154 + result ++; 1.1155 + } 1.1156 + return result; 1.1157 +} 1.1158 + 1.1159 +/** 1.1160 +* Checks match for contraction. 1.1161 +* If the match ends with a partial contraction we fail. 1.1162 +* If the match starts too far off (because of backwards iteration) we try to 1.1163 +* chip off the extra characters depending on whether a breakiterator has 1.1164 +* been used. 1.1165 +* Internal method, error assumed to be success, caller has to check status 1.1166 +* before calling this method. 1.1167 +* @param strsrch string search data 1.1168 +* @param start offset of potential match, to be modified if necessary 1.1169 +* @param end offset of potential match, to be modified if necessary 1.1170 +* @param status output error status if any 1.1171 +* @return TRUE if match passes the contraction test, FALSE otherwise 1.1172 +*/ 1.1173 + 1.1174 +static 1.1175 +UBool checkNextExactContractionMatch(UStringSearch *strsrch, 1.1176 + int32_t *start, 1.1177 + int32_t *end, UErrorCode *status) 1.1178 +{ 1.1179 + UCollationElements *coleiter = strsrch->textIter; 1.1180 + int32_t textlength = strsrch->search->textLength; 1.1181 + int32_t temp = *start; 1.1182 + const UCollator *collator = strsrch->collator; 1.1183 + const UChar *text = strsrch->search->text; 1.1184 + // This part checks if either ends of the match contains potential 1.1185 + // contraction. If so we'll have to iterate through them 1.1186 + // The start contraction needs to be checked since ucol_previous dumps 1.1187 + // all characters till the first safe character into the buffer. 1.1188 + // *start + 1 is used to test for the unsafe characters instead of *start 1.1189 + // because ucol_prev takes all unsafe characters till the first safe 1.1190 + // character ie *start. so by testing *start + 1, we can estimate if 1.1191 + // excess prefix characters has been included in the potential search 1.1192 + // results. 1.1193 + if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) || 1.1194 + (*start + 1 < textlength 1.1195 + && ucol_unsafeCP(text[*start + 1], collator))) { 1.1196 + int32_t expansion = getExpansionPrefix(coleiter); 1.1197 + UBool expandflag = expansion > 0; 1.1198 + setColEIterOffset(coleiter, *start); 1.1199 + while (expansion > 0) { 1.1200 + // getting rid of the redundant ce, caused by setOffset. 1.1201 + // since backward contraction/expansion may have extra ces if we 1.1202 + // are in the normalization buffer, hasAccentsBeforeMatch would 1.1203 + // have taken care of it. 1.1204 + // E.g. the character \u01FA will have an expansion of 3, but if 1.1205 + // we are only looking for acute and ring \u030A and \u0301, we'll 1.1206 + // have to skip the first ce in the expansion buffer. 1.1207 + ucol_next(coleiter, status); 1.1208 + if (U_FAILURE(*status)) { 1.1209 + return FALSE; 1.1210 + } 1.1211 + if (ucol_getOffset(coleiter) != temp) { 1.1212 + *start = temp; 1.1213 + temp = ucol_getOffset(coleiter); 1.1214 + } 1.1215 + expansion --; 1.1216 + } 1.1217 + 1.1218 + int32_t *patternce = strsrch->pattern.CE; 1.1219 + int32_t patterncelength = strsrch->pattern.CELength; 1.1220 + int32_t count = 0; 1.1221 + while (count < patterncelength) { 1.1222 + int32_t ce = getCE(strsrch, ucol_next(coleiter, status)); 1.1223 + if (ce == UCOL_IGNORABLE) { 1.1224 + continue; 1.1225 + } 1.1226 + if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) { 1.1227 + *start = temp; 1.1228 + temp = ucol_getOffset(coleiter); 1.1229 + } 1.1230 + if (U_FAILURE(*status) || ce != patternce[count]) { 1.1231 + (*end) ++; 1.1232 + *end = getNextUStringSearchBaseOffset(strsrch, *end); 1.1233 + return FALSE; 1.1234 + } 1.1235 + count ++; 1.1236 + } 1.1237 + } 1.1238 + return TRUE; 1.1239 +} 1.1240 + 1.1241 +/** 1.1242 +* Checks and sets the match information if found. 1.1243 +* Checks 1.1244 +* <ul> 1.1245 +* <li> the potential match does not repeat the previous match 1.1246 +* <li> boundaries are correct 1.1247 +* <li> exact matches has no extra accents 1.1248 +* <li> identical matchesb 1.1249 +* <li> potential match does not end in the middle of a contraction 1.1250 +* <\ul> 1.1251 +* Otherwise the offset will be shifted to the next character. 1.1252 +* Internal method, status assumed to be success, caller has to check status 1.1253 +* before calling this method. 1.1254 +* @param strsrch string search data 1.1255 +* @param textoffset offset in the collation element text. the returned value 1.1256 +* will be the truncated end offset of the match or the new start 1.1257 +* search offset. 1.1258 +* @param status output error status if any 1.1259 +* @return TRUE if the match is valid, FALSE otherwise 1.1260 +*/ 1.1261 +static 1.1262 +inline UBool checkNextExactMatch(UStringSearch *strsrch, 1.1263 + int32_t *textoffset, UErrorCode *status) 1.1264 +{ 1.1265 + UCollationElements *coleiter = strsrch->textIter; 1.1266 + int32_t start = getColElemIterOffset(coleiter, FALSE); 1.1267 + 1.1268 + if (!checkNextExactContractionMatch(strsrch, &start, textoffset, status)) { 1.1269 + return FALSE; 1.1270 + } 1.1271 + 1.1272 + // this totally matches, however we need to check if it is repeating 1.1273 + if (!isBreakUnit(strsrch, start, *textoffset) || 1.1274 + checkRepeatedMatch(strsrch, start, *textoffset) || 1.1275 + hasAccentsBeforeMatch(strsrch, start, *textoffset) || 1.1276 + !checkIdentical(strsrch, start, *textoffset) || 1.1277 + hasAccentsAfterMatch(strsrch, start, *textoffset)) { 1.1278 + 1.1279 + (*textoffset) ++; 1.1280 + *textoffset = getNextUStringSearchBaseOffset(strsrch, *textoffset); 1.1281 + return FALSE; 1.1282 + } 1.1283 + 1.1284 + //Add breakiterator boundary check for primary strength search. 1.1285 + if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) { 1.1286 + checkBreakBoundary(strsrch, &start, textoffset); 1.1287 + } 1.1288 + 1.1289 + // totally match, we will get rid of the ending ignorables. 1.1290 + strsrch->search->matchedIndex = start; 1.1291 + strsrch->search->matchedLength = *textoffset - start; 1.1292 + return TRUE; 1.1293 +} 1.1294 + 1.1295 +/** 1.1296 +* Getting the previous base character offset, or the current offset if the 1.1297 +* current character is a base character 1.1298 +* @param text string 1.1299 +* @param textoffset one offset after the current character 1.1300 +* @return the offset of the next character after the base character or the first 1.1301 +* composed character with accents 1.1302 +*/ 1.1303 +static 1.1304 +inline int32_t getPreviousBaseOffset(const UChar *text, 1.1305 + int32_t textoffset) 1.1306 +{ 1.1307 + if (textoffset > 0) { 1.1308 + for (;;) { 1.1309 + int32_t result = textoffset; 1.1310 + U16_BACK_1(text, 0, textoffset); 1.1311 + int32_t temp = textoffset; 1.1312 + uint16_t fcd = getFCD(text, &temp, result); 1.1313 + if ((fcd >> SECOND_LAST_BYTE_SHIFT_) == 0) { 1.1314 + if (fcd & LAST_BYTE_MASK_) { 1.1315 + return textoffset; 1.1316 + } 1.1317 + return result; 1.1318 + } 1.1319 + if (textoffset == 0) { 1.1320 + return 0; 1.1321 + } 1.1322 + } 1.1323 + } 1.1324 + return textoffset; 1.1325 +} 1.1326 + 1.1327 +/** 1.1328 +* Getting the indexes of the accents that are not blocked in the argument 1.1329 +* accent array 1.1330 +* @param accents array of accents in nfd terminated by a 0. 1.1331 +* @param accentsindex array of indexes of the accents that are not blocked 1.1332 +*/ 1.1333 +static 1.1334 +inline int getUnblockedAccentIndex(UChar *accents, int32_t *accentsindex) 1.1335 +{ 1.1336 + int32_t index = 0; 1.1337 + int32_t length = u_strlen(accents); 1.1338 + UChar32 codepoint = 0; 1.1339 + int cclass = 0; 1.1340 + int result = 0; 1.1341 + int32_t temp; 1.1342 + while (index < length) { 1.1343 + temp = index; 1.1344 + U16_NEXT(accents, index, length, codepoint); 1.1345 + if (u_getCombiningClass(codepoint) != cclass) { 1.1346 + cclass = u_getCombiningClass(codepoint); 1.1347 + accentsindex[result] = temp; 1.1348 + result ++; 1.1349 + } 1.1350 + } 1.1351 + accentsindex[result] = length; 1.1352 + return result; 1.1353 +} 1.1354 + 1.1355 +/** 1.1356 +* Appends 3 UChar arrays to a destination array. 1.1357 +* Creates a new array if we run out of space. The caller will have to 1.1358 +* manually deallocate the newly allocated array. 1.1359 +* Internal method, status assumed to be success, caller has to check status 1.1360 +* before calling this method. destination not to be NULL and has at least 1.1361 +* size destinationlength. 1.1362 +* @param destination target array 1.1363 +* @param destinationlength target array size, returning the appended length 1.1364 +* @param source1 null-terminated first array 1.1365 +* @param source2 second array 1.1366 +* @param source2length length of seond array 1.1367 +* @param source3 null-terminated third array 1.1368 +* @param status error status if any 1.1369 +* @return new destination array, destination if there was no new allocation 1.1370 +*/ 1.1371 +static 1.1372 +inline UChar * addToUCharArray( UChar *destination, 1.1373 + int32_t *destinationlength, 1.1374 + const UChar *source1, 1.1375 + const UChar *source2, 1.1376 + int32_t source2length, 1.1377 + const UChar *source3, 1.1378 + UErrorCode *status) 1.1379 +{ 1.1380 + int32_t source1length = source1 ? u_strlen(source1) : 0; 1.1381 + int32_t source3length = source3 ? u_strlen(source3) : 0; 1.1382 + if (*destinationlength < source1length + source2length + source3length + 1.1383 + 1) 1.1384 + { 1.1385 + destination = (UChar *)allocateMemory( 1.1386 + (source1length + source2length + source3length + 1) * sizeof(UChar), 1.1387 + status); 1.1388 + // if error allocating memory, status will be 1.1389 + // U_MEMORY_ALLOCATION_ERROR 1.1390 + if (U_FAILURE(*status)) { 1.1391 + *destinationlength = 0; 1.1392 + return NULL; 1.1393 + } 1.1394 + } 1.1395 + if (source1length != 0) { 1.1396 + uprv_memcpy(destination, source1, sizeof(UChar) * source1length); 1.1397 + } 1.1398 + if (source2length != 0) { 1.1399 + uprv_memcpy(destination + source1length, source2, 1.1400 + sizeof(UChar) * source2length); 1.1401 + } 1.1402 + if (source3length != 0) { 1.1403 + uprv_memcpy(destination + source1length + source2length, source3, 1.1404 + sizeof(UChar) * source3length); 1.1405 + } 1.1406 + *destinationlength = source1length + source2length + source3length; 1.1407 + return destination; 1.1408 +} 1.1409 + 1.1410 +/** 1.1411 +* Running through a collation element iterator to see if the contents matches 1.1412 +* pattern in string search data 1.1413 +* @param strsrch string search data 1.1414 +* @param coleiter collation element iterator 1.1415 +* @return TRUE if a match if found, FALSE otherwise 1.1416 +*/ 1.1417 +static 1.1418 +inline UBool checkCollationMatch(const UStringSearch *strsrch, 1.1419 + UCollationElements *coleiter) 1.1420 +{ 1.1421 + int patternceindex = strsrch->pattern.CELength; 1.1422 + int32_t *patternce = strsrch->pattern.CE; 1.1423 + UErrorCode status = U_ZERO_ERROR; 1.1424 + while (patternceindex > 0) { 1.1425 + int32_t ce = getCE(strsrch, ucol_next(coleiter, &status)); 1.1426 + if (ce == UCOL_IGNORABLE) { 1.1427 + continue; 1.1428 + } 1.1429 + if (U_FAILURE(status) || ce != *patternce) { 1.1430 + return FALSE; 1.1431 + } 1.1432 + patternce ++; 1.1433 + patternceindex --; 1.1434 + } 1.1435 + return TRUE; 1.1436 +} 1.1437 + 1.1438 +/** 1.1439 +* Rearranges the front accents to try matching. 1.1440 +* Prefix accents in the text will be grouped according to their combining 1.1441 +* class and the groups will be mixed and matched to try find the perfect 1.1442 +* match with the pattern. 1.1443 +* So for instance looking for "\u0301" in "\u030A\u0301\u0325" 1.1444 +* step 1: split "\u030A\u0301" into 6 other type of potential accent substrings 1.1445 +* "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325", 1.1446 +* "\u0301\u0325". 1.1447 +* step 2: check if any of the generated substrings matches the pattern. 1.1448 +* Internal method, status is assumed to be success, caller has to check status 1.1449 +* before calling this method. 1.1450 +* @param strsrch string search match 1.1451 +* @param start first offset of the accents to start searching 1.1452 +* @param end start of the last accent set 1.1453 +* @param status output error status if any 1.1454 +* @return USEARCH_DONE if a match is not found, otherwise return the starting 1.1455 +* offset of the match. Note this start includes all preceding accents. 1.1456 +*/ 1.1457 +static 1.1458 +int32_t doNextCanonicalPrefixMatch(UStringSearch *strsrch, 1.1459 + int32_t start, 1.1460 + int32_t end, 1.1461 + UErrorCode *status) 1.1462 +{ 1.1463 + const UChar *text = strsrch->search->text; 1.1464 + int32_t textlength = strsrch->search->textLength; 1.1465 + int32_t tempstart = start; 1.1466 + 1.1467 + if ((getFCD(text, &tempstart, textlength) & LAST_BYTE_MASK_) == 0) { 1.1468 + // die... failed at a base character 1.1469 + return USEARCH_DONE; 1.1470 + } 1.1471 + 1.1472 + int32_t offset = getNextBaseOffset(text, tempstart, textlength); 1.1473 + start = getPreviousBaseOffset(text, tempstart); 1.1474 + 1.1475 + UChar accents[INITIAL_ARRAY_SIZE_]; 1.1476 + // normalizing the offensive string 1.1477 + unorm_normalize(text + start, offset - start, UNORM_NFD, 0, accents, 1.1478 + INITIAL_ARRAY_SIZE_, status); 1.1479 + if (U_FAILURE(*status)) { 1.1480 + return USEARCH_DONE; 1.1481 + } 1.1482 + 1.1483 + int32_t accentsindex[INITIAL_ARRAY_SIZE_]; 1.1484 + int32_t accentsize = getUnblockedAccentIndex(accents, 1.1485 + accentsindex); 1.1486 + int32_t count = (2 << (accentsize - 1)) - 1; 1.1487 + UChar buffer[INITIAL_ARRAY_SIZE_]; 1.1488 + UCollationElements *coleiter = strsrch->utilIter; 1.1489 + while (U_SUCCESS(*status) && count > 0) { 1.1490 + UChar *rearrange = strsrch->canonicalPrefixAccents; 1.1491 + // copy the base characters 1.1492 + for (int k = 0; k < accentsindex[0]; k ++) { 1.1493 + *rearrange ++ = accents[k]; 1.1494 + } 1.1495 + // forming all possible canonical rearrangement by dropping 1.1496 + // sets of accents 1.1497 + for (int i = 0; i <= accentsize - 1; i ++) { 1.1498 + int32_t mask = 1 << (accentsize - i - 1); 1.1499 + if (count & mask) { 1.1500 + for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) { 1.1501 + *rearrange ++ = accents[j]; 1.1502 + } 1.1503 + } 1.1504 + } 1.1505 + *rearrange = 0; 1.1506 + int32_t matchsize = INITIAL_ARRAY_SIZE_; 1.1507 + UChar *match = addToUCharArray(buffer, &matchsize, 1.1508 + strsrch->canonicalPrefixAccents, 1.1509 + strsrch->search->text + offset, 1.1510 + end - offset, 1.1511 + strsrch->canonicalSuffixAccents, 1.1512 + status); 1.1513 + 1.1514 + // if status is a failure, ucol_setText does nothing. 1.1515 + // run the collator iterator through this match 1.1516 + ucol_setText(coleiter, match, matchsize, status); 1.1517 + if (U_SUCCESS(*status)) { 1.1518 + if (checkCollationMatch(strsrch, coleiter)) { 1.1519 + if (match != buffer) { 1.1520 + uprv_free(match); 1.1521 + } 1.1522 + return start; 1.1523 + } 1.1524 + } 1.1525 + count --; 1.1526 + } 1.1527 + return USEARCH_DONE; 1.1528 +} 1.1529 + 1.1530 +/** 1.1531 +* Gets the offset to the safe point in text before textoffset. 1.1532 +* ie. not the middle of a contraction, swappable characters or supplementary 1.1533 +* characters. 1.1534 +* @param collator collation sata 1.1535 +* @param text string to work with 1.1536 +* @param textoffset offset in string 1.1537 +* @param textlength length of text string 1.1538 +* @return offset to the previous safe character 1.1539 +*/ 1.1540 +static 1.1541 +inline uint32_t getPreviousSafeOffset(const UCollator *collator, 1.1542 + const UChar *text, 1.1543 + int32_t textoffset) 1.1544 +{ 1.1545 + int32_t result = textoffset; // first contraction character 1.1546 + while (result != 0 && ucol_unsafeCP(text[result - 1], collator)) { 1.1547 + result --; 1.1548 + } 1.1549 + if (result != 0) { 1.1550 + // the first contraction character is consider unsafe here 1.1551 + result --; 1.1552 + } 1.1553 + return result; 1.1554 +} 1.1555 + 1.1556 +/** 1.1557 +* Cleaning up after we passed the safe zone 1.1558 +* @param strsrch string search data 1.1559 +* @param safetext safe text array 1.1560 +* @param safebuffer safe text buffer 1.1561 +* @param coleiter collation element iterator for safe text 1.1562 +*/ 1.1563 +static 1.1564 +inline void cleanUpSafeText(const UStringSearch *strsrch, UChar *safetext, 1.1565 + UChar *safebuffer) 1.1566 +{ 1.1567 + if (safetext != safebuffer && safetext != strsrch->canonicalSuffixAccents) 1.1568 + { 1.1569 + uprv_free(safetext); 1.1570 + } 1.1571 +} 1.1572 + 1.1573 +/** 1.1574 +* Take the rearranged end accents and tries matching. If match failed at 1.1575 +* a seperate preceding set of accents (seperated from the rearranged on by 1.1576 +* at least a base character) then we rearrange the preceding accents and 1.1577 +* tries matching again. 1.1578 +* We allow skipping of the ends of the accent set if the ces do not match. 1.1579 +* However if the failure is found before the accent set, it fails. 1.1580 +* Internal method, status assumed to be success, caller has to check status 1.1581 +* before calling this method. 1.1582 +* @param strsrch string search data 1.1583 +* @param textoffset of the start of the rearranged accent 1.1584 +* @param status output error status if any 1.1585 +* @return USEARCH_DONE if a match is not found, otherwise return the starting 1.1586 +* offset of the match. Note this start includes all preceding accents. 1.1587 +*/ 1.1588 +static 1.1589 +int32_t doNextCanonicalSuffixMatch(UStringSearch *strsrch, 1.1590 + int32_t textoffset, 1.1591 + UErrorCode *status) 1.1592 +{ 1.1593 + const UChar *text = strsrch->search->text; 1.1594 + const UCollator *collator = strsrch->collator; 1.1595 + int32_t safelength = 0; 1.1596 + UChar *safetext; 1.1597 + int32_t safetextlength; 1.1598 + UChar safebuffer[INITIAL_ARRAY_SIZE_]; 1.1599 + UCollationElements *coleiter = strsrch->utilIter; 1.1600 + int32_t safeoffset = textoffset; 1.1601 + 1.1602 + if (textoffset != 0 && ucol_unsafeCP(strsrch->canonicalSuffixAccents[0], 1.1603 + collator)) { 1.1604 + safeoffset = getPreviousSafeOffset(collator, text, textoffset); 1.1605 + safelength = textoffset - safeoffset; 1.1606 + safetextlength = INITIAL_ARRAY_SIZE_; 1.1607 + safetext = addToUCharArray(safebuffer, &safetextlength, NULL, 1.1608 + text + safeoffset, safelength, 1.1609 + strsrch->canonicalSuffixAccents, 1.1610 + status); 1.1611 + } 1.1612 + else { 1.1613 + safetextlength = u_strlen(strsrch->canonicalSuffixAccents); 1.1614 + safetext = strsrch->canonicalSuffixAccents; 1.1615 + } 1.1616 + 1.1617 + // if status is a failure, ucol_setText does nothing 1.1618 + ucol_setText(coleiter, safetext, safetextlength, status); 1.1619 + // status checked in loop below 1.1620 + 1.1621 + int32_t *ce = strsrch->pattern.CE; 1.1622 + int32_t celength = strsrch->pattern.CELength; 1.1623 + int ceindex = celength - 1; 1.1624 + UBool isSafe = TRUE; // indication flag for position in safe zone 1.1625 + 1.1626 + while (ceindex >= 0) { 1.1627 + int32_t textce = ucol_previous(coleiter, status); 1.1628 + if (U_FAILURE(*status)) { 1.1629 + if (isSafe) { 1.1630 + cleanUpSafeText(strsrch, safetext, safebuffer); 1.1631 + } 1.1632 + return USEARCH_DONE; 1.1633 + } 1.1634 + if (textce == UCOL_NULLORDER) { 1.1635 + // check if we have passed the safe buffer 1.1636 + if (coleiter == strsrch->textIter) { 1.1637 + cleanUpSafeText(strsrch, safetext, safebuffer); 1.1638 + return USEARCH_DONE; 1.1639 + } 1.1640 + cleanUpSafeText(strsrch, safetext, safebuffer); 1.1641 + safetext = safebuffer; 1.1642 + coleiter = strsrch->textIter; 1.1643 + setColEIterOffset(coleiter, safeoffset); 1.1644 + // status checked at the start of the loop 1.1645 + isSafe = FALSE; 1.1646 + continue; 1.1647 + } 1.1648 + textce = getCE(strsrch, textce); 1.1649 + if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) { 1.1650 + // do the beginning stuff 1.1651 + int32_t failedoffset = getColElemIterOffset(coleiter, FALSE); 1.1652 + if (isSafe && failedoffset >= safelength) { 1.1653 + // alas... no hope. failed at rearranged accent set 1.1654 + cleanUpSafeText(strsrch, safetext, safebuffer); 1.1655 + return USEARCH_DONE; 1.1656 + } 1.1657 + else { 1.1658 + if (isSafe) { 1.1659 + failedoffset += safeoffset; 1.1660 + cleanUpSafeText(strsrch, safetext, safebuffer); 1.1661 + } 1.1662 + 1.1663 + // try rearranging the front accents 1.1664 + int32_t result = doNextCanonicalPrefixMatch(strsrch, 1.1665 + failedoffset, textoffset, status); 1.1666 + if (result != USEARCH_DONE) { 1.1667 + // if status is a failure, ucol_setOffset does nothing 1.1668 + setColEIterOffset(strsrch->textIter, result); 1.1669 + } 1.1670 + if (U_FAILURE(*status)) { 1.1671 + return USEARCH_DONE; 1.1672 + } 1.1673 + return result; 1.1674 + } 1.1675 + } 1.1676 + if (textce == ce[ceindex]) { 1.1677 + ceindex --; 1.1678 + } 1.1679 + } 1.1680 + // set offset here 1.1681 + if (isSafe) { 1.1682 + int32_t result = getColElemIterOffset(coleiter, FALSE); 1.1683 + // sets the text iterator here with the correct expansion and offset 1.1684 + int32_t leftoverces = getExpansionPrefix(coleiter); 1.1685 + cleanUpSafeText(strsrch, safetext, safebuffer); 1.1686 + if (result >= safelength) { 1.1687 + result = textoffset; 1.1688 + } 1.1689 + else { 1.1690 + result += safeoffset; 1.1691 + } 1.1692 + setColEIterOffset(strsrch->textIter, result); 1.1693 + strsrch->textIter->iteratordata_.toReturn = 1.1694 + setExpansionPrefix(strsrch->textIter, leftoverces); 1.1695 + return result; 1.1696 + } 1.1697 + 1.1698 + return ucol_getOffset(coleiter); 1.1699 +} 1.1700 + 1.1701 +/** 1.1702 +* Trying out the substring and sees if it can be a canonical match. 1.1703 +* This will try normalizing the end accents and arranging them into canonical 1.1704 +* equivalents and check their corresponding ces with the pattern ce. 1.1705 +* Suffix accents in the text will be grouped according to their combining 1.1706 +* class and the groups will be mixed and matched to try find the perfect 1.1707 +* match with the pattern. 1.1708 +* So for instance looking for "\u0301" in "\u030A\u0301\u0325" 1.1709 +* step 1: split "\u030A\u0301" into 6 other type of potential accent substrings 1.1710 +* "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325", 1.1711 +* "\u0301\u0325". 1.1712 +* step 2: check if any of the generated substrings matches the pattern. 1.1713 +* Internal method, status assumed to be success, caller has to check status 1.1714 +* before calling this method. 1.1715 +* @param strsrch string search data 1.1716 +* @param textoffset end offset in the collation element text that ends with 1.1717 +* the accents to be rearranged 1.1718 +* @param status error status if any 1.1719 +* @return TRUE if the match is valid, FALSE otherwise 1.1720 +*/ 1.1721 +static 1.1722 +UBool doNextCanonicalMatch(UStringSearch *strsrch, 1.1723 + int32_t textoffset, 1.1724 + UErrorCode *status) 1.1725 +{ 1.1726 + const UChar *text = strsrch->search->text; 1.1727 + int32_t temp = textoffset; 1.1728 + U16_BACK_1(text, 0, temp); 1.1729 + if ((getFCD(text, &temp, textoffset) & LAST_BYTE_MASK_) == 0) { 1.1730 + UCollationElements *coleiter = strsrch->textIter; 1.1731 + int32_t offset = getColElemIterOffset(coleiter, FALSE); 1.1732 + if (strsrch->pattern.hasPrefixAccents) { 1.1733 + offset = doNextCanonicalPrefixMatch(strsrch, offset, textoffset, 1.1734 + status); 1.1735 + if (U_SUCCESS(*status) && offset != USEARCH_DONE) { 1.1736 + setColEIterOffset(coleiter, offset); 1.1737 + return TRUE; 1.1738 + } 1.1739 + } 1.1740 + return FALSE; 1.1741 + } 1.1742 + 1.1743 + if (!strsrch->pattern.hasSuffixAccents) { 1.1744 + return FALSE; 1.1745 + } 1.1746 + 1.1747 + UChar accents[INITIAL_ARRAY_SIZE_]; 1.1748 + // offset to the last base character in substring to search 1.1749 + int32_t baseoffset = getPreviousBaseOffset(text, textoffset); 1.1750 + // normalizing the offensive string 1.1751 + unorm_normalize(text + baseoffset, textoffset - baseoffset, UNORM_NFD, 1.1752 + 0, accents, INITIAL_ARRAY_SIZE_, status); 1.1753 + // status checked in loop below 1.1754 + 1.1755 + int32_t accentsindex[INITIAL_ARRAY_SIZE_]; 1.1756 + int32_t size = getUnblockedAccentIndex(accents, accentsindex); 1.1757 + 1.1758 + // 2 power n - 1 plus the full set of accents 1.1759 + int32_t count = (2 << (size - 1)) - 1; 1.1760 + while (U_SUCCESS(*status) && count > 0) { 1.1761 + UChar *rearrange = strsrch->canonicalSuffixAccents; 1.1762 + // copy the base characters 1.1763 + for (int k = 0; k < accentsindex[0]; k ++) { 1.1764 + *rearrange ++ = accents[k]; 1.1765 + } 1.1766 + // forming all possible canonical rearrangement by dropping 1.1767 + // sets of accents 1.1768 + for (int i = 0; i <= size - 1; i ++) { 1.1769 + int32_t mask = 1 << (size - i - 1); 1.1770 + if (count & mask) { 1.1771 + for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) { 1.1772 + *rearrange ++ = accents[j]; 1.1773 + } 1.1774 + } 1.1775 + } 1.1776 + *rearrange = 0; 1.1777 + int32_t offset = doNextCanonicalSuffixMatch(strsrch, baseoffset, 1.1778 + status); 1.1779 + if (offset != USEARCH_DONE) { 1.1780 + return TRUE; // match found 1.1781 + } 1.1782 + count --; 1.1783 + } 1.1784 + return FALSE; 1.1785 +} 1.1786 + 1.1787 +/** 1.1788 +* Gets the previous base character offset depending on the string search 1.1789 +* pattern data 1.1790 +* @param strsrch string search data 1.1791 +* @param textoffset current offset, current character 1.1792 +* @return the offset of the next character after this base character or itself 1.1793 +* if it is a composed character with accents 1.1794 +*/ 1.1795 +static 1.1796 +inline int32_t getPreviousUStringSearchBaseOffset(UStringSearch *strsrch, 1.1797 + int32_t textoffset) 1.1798 +{ 1.1799 + if (strsrch->pattern.hasPrefixAccents && textoffset > 0) { 1.1800 + const UChar *text = strsrch->search->text; 1.1801 + int32_t offset = textoffset; 1.1802 + if (getFCD(text, &offset, strsrch->search->textLength) >> 1.1803 + SECOND_LAST_BYTE_SHIFT_) { 1.1804 + return getPreviousBaseOffset(text, textoffset); 1.1805 + } 1.1806 + } 1.1807 + return textoffset; 1.1808 +} 1.1809 + 1.1810 +/** 1.1811 +* Checks match for contraction. 1.1812 +* If the match ends with a partial contraction we fail. 1.1813 +* If the match starts too far off (because of backwards iteration) we try to 1.1814 +* chip off the extra characters 1.1815 +* Internal method, status assumed to be success, caller has to check status 1.1816 +* before calling this method. 1.1817 +* @param strsrch string search data 1.1818 +* @param start offset of potential match, to be modified if necessary 1.1819 +* @param end offset of potential match, to be modified if necessary 1.1820 +* @param status output error status if any 1.1821 +* @return TRUE if match passes the contraction test, FALSE otherwise 1.1822 +*/ 1.1823 +static 1.1824 +UBool checkNextCanonicalContractionMatch(UStringSearch *strsrch, 1.1825 + int32_t *start, 1.1826 + int32_t *end, 1.1827 + UErrorCode *status) 1.1828 +{ 1.1829 + UCollationElements *coleiter = strsrch->textIter; 1.1830 + int32_t textlength = strsrch->search->textLength; 1.1831 + int32_t temp = *start; 1.1832 + const UCollator *collator = strsrch->collator; 1.1833 + const UChar *text = strsrch->search->text; 1.1834 + // This part checks if either ends of the match contains potential 1.1835 + // contraction. If so we'll have to iterate through them 1.1836 + if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) || 1.1837 + (*start + 1 < textlength 1.1838 + && ucol_unsafeCP(text[*start + 1], collator))) { 1.1839 + int32_t expansion = getExpansionPrefix(coleiter); 1.1840 + UBool expandflag = expansion > 0; 1.1841 + setColEIterOffset(coleiter, *start); 1.1842 + while (expansion > 0) { 1.1843 + // getting rid of the redundant ce, caused by setOffset. 1.1844 + // since backward contraction/expansion may have extra ces if we 1.1845 + // are in the normalization buffer, hasAccentsBeforeMatch would 1.1846 + // have taken care of it. 1.1847 + // E.g. the character \u01FA will have an expansion of 3, but if 1.1848 + // we are only looking for acute and ring \u030A and \u0301, we'll 1.1849 + // have to skip the first ce in the expansion buffer. 1.1850 + ucol_next(coleiter, status); 1.1851 + if (U_FAILURE(*status)) { 1.1852 + return FALSE; 1.1853 + } 1.1854 + if (ucol_getOffset(coleiter) != temp) { 1.1855 + *start = temp; 1.1856 + temp = ucol_getOffset(coleiter); 1.1857 + } 1.1858 + expansion --; 1.1859 + } 1.1860 + 1.1861 + int32_t *patternce = strsrch->pattern.CE; 1.1862 + int32_t patterncelength = strsrch->pattern.CELength; 1.1863 + int32_t count = 0; 1.1864 + int32_t textlength = strsrch->search->textLength; 1.1865 + while (count < patterncelength) { 1.1866 + int32_t ce = getCE(strsrch, ucol_next(coleiter, status)); 1.1867 + // status checked below, note that if status is a failure 1.1868 + // ucol_next returns UCOL_NULLORDER 1.1869 + if (ce == UCOL_IGNORABLE) { 1.1870 + continue; 1.1871 + } 1.1872 + if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) { 1.1873 + *start = temp; 1.1874 + temp = ucol_getOffset(coleiter); 1.1875 + } 1.1876 + 1.1877 + if (count == 0 && ce != patternce[0]) { 1.1878 + // accents may have extra starting ces, this occurs when a 1.1879 + // pure accent pattern is matched without rearrangement 1.1880 + // text \u0325\u0300 and looking for \u0300 1.1881 + int32_t expected = patternce[0]; 1.1882 + if (getFCD(text, start, textlength) & LAST_BYTE_MASK_) { 1.1883 + ce = getCE(strsrch, ucol_next(coleiter, status)); 1.1884 + while (U_SUCCESS(*status) && ce != expected && 1.1885 + ce != UCOL_NULLORDER && 1.1886 + ucol_getOffset(coleiter) <= *end) { 1.1887 + ce = getCE(strsrch, ucol_next(coleiter, status)); 1.1888 + } 1.1889 + } 1.1890 + } 1.1891 + if (U_FAILURE(*status) || ce != patternce[count]) { 1.1892 + (*end) ++; 1.1893 + *end = getNextUStringSearchBaseOffset(strsrch, *end); 1.1894 + return FALSE; 1.1895 + } 1.1896 + count ++; 1.1897 + } 1.1898 + } 1.1899 + return TRUE; 1.1900 +} 1.1901 + 1.1902 +/** 1.1903 +* Checks and sets the match information if found. 1.1904 +* Checks 1.1905 +* <ul> 1.1906 +* <li> the potential match does not repeat the previous match 1.1907 +* <li> boundaries are correct 1.1908 +* <li> potential match does not end in the middle of a contraction 1.1909 +* <li> identical matches 1.1910 +* <\ul> 1.1911 +* Otherwise the offset will be shifted to the next character. 1.1912 +* Internal method, status assumed to be success, caller has to check the 1.1913 +* status before calling this method. 1.1914 +* @param strsrch string search data 1.1915 +* @param textoffset offset in the collation element text. the returned value 1.1916 +* will be the truncated end offset of the match or the new start 1.1917 +* search offset. 1.1918 +* @param status output error status if any 1.1919 +* @return TRUE if the match is valid, FALSE otherwise 1.1920 +*/ 1.1921 +static 1.1922 +inline UBool checkNextCanonicalMatch(UStringSearch *strsrch, 1.1923 + int32_t *textoffset, 1.1924 + UErrorCode *status) 1.1925 +{ 1.1926 + // to ensure that the start and ends are not composite characters 1.1927 + UCollationElements *coleiter = strsrch->textIter; 1.1928 + // if we have a canonical accent match 1.1929 + if ((strsrch->pattern.hasSuffixAccents && 1.1930 + strsrch->canonicalSuffixAccents[0]) || 1.1931 + (strsrch->pattern.hasPrefixAccents && 1.1932 + strsrch->canonicalPrefixAccents[0])) { 1.1933 + strsrch->search->matchedIndex = getPreviousUStringSearchBaseOffset( 1.1934 + strsrch, 1.1935 + ucol_getOffset(coleiter)); 1.1936 + strsrch->search->matchedLength = *textoffset - 1.1937 + strsrch->search->matchedIndex; 1.1938 + return TRUE; 1.1939 + } 1.1940 + 1.1941 + int32_t start = getColElemIterOffset(coleiter, FALSE); 1.1942 + if (!checkNextCanonicalContractionMatch(strsrch, &start, textoffset, 1.1943 + status) || U_FAILURE(*status)) { 1.1944 + return FALSE; 1.1945 + } 1.1946 + 1.1947 + start = getPreviousUStringSearchBaseOffset(strsrch, start); 1.1948 + // this totally matches, however we need to check if it is repeating 1.1949 + if (checkRepeatedMatch(strsrch, start, *textoffset) || 1.1950 + !isBreakUnit(strsrch, start, *textoffset) || 1.1951 + !checkIdentical(strsrch, start, *textoffset)) { 1.1952 + (*textoffset) ++; 1.1953 + *textoffset = getNextBaseOffset(strsrch->search->text, *textoffset, 1.1954 + strsrch->search->textLength); 1.1955 + return FALSE; 1.1956 + } 1.1957 + 1.1958 + strsrch->search->matchedIndex = start; 1.1959 + strsrch->search->matchedLength = *textoffset - start; 1.1960 + return TRUE; 1.1961 +} 1.1962 + 1.1963 +/** 1.1964 +* Shifting the collation element iterator position forward to prepare for 1.1965 +* a preceding match. If the first character is a unsafe character, we'll only 1.1966 +* shift by 1 to capture contractions, normalization etc. 1.1967 +* Internal method, status assumed to be success, caller has to check status 1.1968 +* before calling this method. 1.1969 +* @param text strsrch string search data 1.1970 +* @param textoffset start text position to do search 1.1971 +* @param ce the text ce which failed the match. 1.1972 +* @param patternceindex index of the ce within the pattern ce buffer which 1.1973 +* failed the match 1.1974 +* @return final offset 1.1975 +*/ 1.1976 +static 1.1977 +inline int32_t reverseShift(UStringSearch *strsrch, 1.1978 + int32_t textoffset, 1.1979 + int32_t ce, 1.1980 + int32_t patternceindex) 1.1981 +{ 1.1982 + if (strsrch->search->isOverlap) { 1.1983 + if (textoffset != strsrch->search->textLength) { 1.1984 + textoffset --; 1.1985 + } 1.1986 + else { 1.1987 + textoffset -= strsrch->pattern.defaultShiftSize; 1.1988 + } 1.1989 + } 1.1990 + else { 1.1991 + if (ce != UCOL_NULLORDER) { 1.1992 + int32_t shift = strsrch->pattern.backShift[hash(ce)]; 1.1993 + 1.1994 + // this is to adjust for characters in the middle of the substring 1.1995 + // for matching that failed. 1.1996 + int32_t adjust = patternceindex; 1.1997 + if (adjust > 1 && shift > adjust) { 1.1998 + shift -= adjust - 1; 1.1999 + } 1.2000 + textoffset -= shift; 1.2001 + } 1.2002 + else { 1.2003 + textoffset -= strsrch->pattern.defaultShiftSize; 1.2004 + } 1.2005 + } 1.2006 + textoffset = getPreviousUStringSearchBaseOffset(strsrch, textoffset); 1.2007 + return textoffset; 1.2008 +} 1.2009 + 1.2010 +/** 1.2011 +* Checks match for contraction. 1.2012 +* If the match starts with a partial contraction we fail. 1.2013 +* Internal method, status assumed to be success, caller has to check status 1.2014 +* before calling this method. 1.2015 +* @param strsrch string search data 1.2016 +* @param start offset of potential match, to be modified if necessary 1.2017 +* @param end offset of potential match, to be modified if necessary 1.2018 +* @param status output error status if any 1.2019 +* @return TRUE if match passes the contraction test, FALSE otherwise 1.2020 +*/ 1.2021 +static 1.2022 +UBool checkPreviousExactContractionMatch(UStringSearch *strsrch, 1.2023 + int32_t *start, 1.2024 + int32_t *end, UErrorCode *status) 1.2025 +{ 1.2026 + UCollationElements *coleiter = strsrch->textIter; 1.2027 + int32_t textlength = strsrch->search->textLength; 1.2028 + int32_t temp = *end; 1.2029 + const UCollator *collator = strsrch->collator; 1.2030 + const UChar *text = strsrch->search->text; 1.2031 + // This part checks if either if the start of the match contains potential 1.2032 + // contraction. If so we'll have to iterate through them 1.2033 + // Since we used ucol_next while previously looking for the potential 1.2034 + // match, this guarantees that our end will not be a partial contraction, 1.2035 + // or a partial supplementary character. 1.2036 + if (*start < textlength && ucol_unsafeCP(text[*start], collator)) { 1.2037 + int32_t expansion = getExpansionSuffix(coleiter); 1.2038 + UBool expandflag = expansion > 0; 1.2039 + setColEIterOffset(coleiter, *end); 1.2040 + while (U_SUCCESS(*status) && expansion > 0) { 1.2041 + // getting rid of the redundant ce 1.2042 + // since forward contraction/expansion may have extra ces 1.2043 + // if we are in the normalization buffer, hasAccentsBeforeMatch 1.2044 + // would have taken care of it. 1.2045 + // E.g. the character \u01FA will have an expansion of 3, but if 1.2046 + // we are only looking for A ring A\u030A, we'll have to skip the 1.2047 + // last ce in the expansion buffer 1.2048 + ucol_previous(coleiter, status); 1.2049 + if (U_FAILURE(*status)) { 1.2050 + return FALSE; 1.2051 + } 1.2052 + if (ucol_getOffset(coleiter) != temp) { 1.2053 + *end = temp; 1.2054 + temp = ucol_getOffset(coleiter); 1.2055 + } 1.2056 + expansion --; 1.2057 + } 1.2058 + 1.2059 + int32_t *patternce = strsrch->pattern.CE; 1.2060 + int32_t patterncelength = strsrch->pattern.CELength; 1.2061 + int32_t count = patterncelength; 1.2062 + while (count > 0) { 1.2063 + int32_t ce = getCE(strsrch, ucol_previous(coleiter, status)); 1.2064 + // status checked below, note that if status is a failure 1.2065 + // ucol_previous returns UCOL_NULLORDER 1.2066 + if (ce == UCOL_IGNORABLE) { 1.2067 + continue; 1.2068 + } 1.2069 + if (expandflag && count == 0 && 1.2070 + getColElemIterOffset(coleiter, FALSE) != temp) { 1.2071 + *end = temp; 1.2072 + temp = ucol_getOffset(coleiter); 1.2073 + } 1.2074 + if (U_FAILURE(*status) || ce != patternce[count - 1]) { 1.2075 + (*start) --; 1.2076 + *start = getPreviousBaseOffset(text, *start); 1.2077 + return FALSE; 1.2078 + } 1.2079 + count --; 1.2080 + } 1.2081 + } 1.2082 + return TRUE; 1.2083 +} 1.2084 + 1.2085 +/** 1.2086 +* Checks and sets the match information if found. 1.2087 +* Checks 1.2088 +* <ul> 1.2089 +* <li> the current match does not repeat the last match 1.2090 +* <li> boundaries are correct 1.2091 +* <li> exact matches has no extra accents 1.2092 +* <li> identical matches 1.2093 +* <\ul> 1.2094 +* Otherwise the offset will be shifted to the preceding character. 1.2095 +* Internal method, status assumed to be success, caller has to check status 1.2096 +* before calling this method. 1.2097 +* @param strsrch string search data 1.2098 +* @param collator 1.2099 +* @param coleiter collation element iterator 1.2100 +* @param text string 1.2101 +* @param textoffset offset in the collation element text. the returned value 1.2102 +* will be the truncated start offset of the match or the new start 1.2103 +* search offset. 1.2104 +* @param status output error status if any 1.2105 +* @return TRUE if the match is valid, FALSE otherwise 1.2106 +*/ 1.2107 +static 1.2108 +inline UBool checkPreviousExactMatch(UStringSearch *strsrch, 1.2109 + int32_t *textoffset, 1.2110 + UErrorCode *status) 1.2111 +{ 1.2112 + // to ensure that the start and ends are not composite characters 1.2113 + int32_t end = ucol_getOffset(strsrch->textIter); 1.2114 + if (!checkPreviousExactContractionMatch(strsrch, textoffset, &end, status) 1.2115 + || U_FAILURE(*status)) { 1.2116 + return FALSE; 1.2117 + } 1.2118 + 1.2119 + // this totally matches, however we need to check if it is repeating 1.2120 + // the old match 1.2121 + if (checkRepeatedMatch(strsrch, *textoffset, end) || 1.2122 + !isBreakUnit(strsrch, *textoffset, end) || 1.2123 + hasAccentsBeforeMatch(strsrch, *textoffset, end) || 1.2124 + !checkIdentical(strsrch, *textoffset, end) || 1.2125 + hasAccentsAfterMatch(strsrch, *textoffset, end)) { 1.2126 + (*textoffset) --; 1.2127 + *textoffset = getPreviousBaseOffset(strsrch->search->text, 1.2128 + *textoffset); 1.2129 + return FALSE; 1.2130 + } 1.2131 + 1.2132 + //Add breakiterator boundary check for primary strength search. 1.2133 + if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) { 1.2134 + checkBreakBoundary(strsrch, textoffset, &end); 1.2135 + } 1.2136 + 1.2137 + strsrch->search->matchedIndex = *textoffset; 1.2138 + strsrch->search->matchedLength = end - *textoffset; 1.2139 + return TRUE; 1.2140 +} 1.2141 + 1.2142 +/** 1.2143 +* Rearranges the end accents to try matching. 1.2144 +* Suffix accents in the text will be grouped according to their combining 1.2145 +* class and the groups will be mixed and matched to try find the perfect 1.2146 +* match with the pattern. 1.2147 +* So for instance looking for "\u0301" in "\u030A\u0301\u0325" 1.2148 +* step 1: split "\u030A\u0301" into 6 other type of potential accent substrings 1.2149 +* "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325", 1.2150 +* "\u0301\u0325". 1.2151 +* step 2: check if any of the generated substrings matches the pattern. 1.2152 +* Internal method, status assumed to be success, user has to check status 1.2153 +* before calling this method. 1.2154 +* @param strsrch string search match 1.2155 +* @param start offset of the first base character 1.2156 +* @param end start of the last accent set 1.2157 +* @param status only error status if any 1.2158 +* @return USEARCH_DONE if a match is not found, otherwise return the ending 1.2159 +* offset of the match. Note this start includes all following accents. 1.2160 +*/ 1.2161 +static 1.2162 +int32_t doPreviousCanonicalSuffixMatch(UStringSearch *strsrch, 1.2163 + int32_t start, 1.2164 + int32_t end, 1.2165 + UErrorCode *status) 1.2166 +{ 1.2167 + const UChar *text = strsrch->search->text; 1.2168 + int32_t tempend = end; 1.2169 + 1.2170 + U16_BACK_1(text, 0, tempend); 1.2171 + if (!(getFCD(text, &tempend, strsrch->search->textLength) & 1.2172 + LAST_BYTE_MASK_)) { 1.2173 + // die... failed at a base character 1.2174 + return USEARCH_DONE; 1.2175 + } 1.2176 + end = getNextBaseOffset(text, end, strsrch->search->textLength); 1.2177 + 1.2178 + if (U_SUCCESS(*status)) { 1.2179 + UChar accents[INITIAL_ARRAY_SIZE_]; 1.2180 + int32_t offset = getPreviousBaseOffset(text, end); 1.2181 + // normalizing the offensive string 1.2182 + unorm_normalize(text + offset, end - offset, UNORM_NFD, 0, accents, 1.2183 + INITIAL_ARRAY_SIZE_, status); 1.2184 + 1.2185 + int32_t accentsindex[INITIAL_ARRAY_SIZE_]; 1.2186 + int32_t accentsize = getUnblockedAccentIndex(accents, 1.2187 + accentsindex); 1.2188 + int32_t count = (2 << (accentsize - 1)) - 1; 1.2189 + UChar buffer[INITIAL_ARRAY_SIZE_]; 1.2190 + UCollationElements *coleiter = strsrch->utilIter; 1.2191 + while (U_SUCCESS(*status) && count > 0) { 1.2192 + UChar *rearrange = strsrch->canonicalSuffixAccents; 1.2193 + // copy the base characters 1.2194 + for (int k = 0; k < accentsindex[0]; k ++) { 1.2195 + *rearrange ++ = accents[k]; 1.2196 + } 1.2197 + // forming all possible canonical rearrangement by dropping 1.2198 + // sets of accents 1.2199 + for (int i = 0; i <= accentsize - 1; i ++) { 1.2200 + int32_t mask = 1 << (accentsize - i - 1); 1.2201 + if (count & mask) { 1.2202 + for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) { 1.2203 + *rearrange ++ = accents[j]; 1.2204 + } 1.2205 + } 1.2206 + } 1.2207 + *rearrange = 0; 1.2208 + int32_t matchsize = INITIAL_ARRAY_SIZE_; 1.2209 + UChar *match = addToUCharArray(buffer, &matchsize, 1.2210 + strsrch->canonicalPrefixAccents, 1.2211 + strsrch->search->text + start, 1.2212 + offset - start, 1.2213 + strsrch->canonicalSuffixAccents, 1.2214 + status); 1.2215 + 1.2216 + // run the collator iterator through this match 1.2217 + // if status is a failure ucol_setText does nothing 1.2218 + ucol_setText(coleiter, match, matchsize, status); 1.2219 + if (U_SUCCESS(*status)) { 1.2220 + if (checkCollationMatch(strsrch, coleiter)) { 1.2221 + if (match != buffer) { 1.2222 + uprv_free(match); 1.2223 + } 1.2224 + return end; 1.2225 + } 1.2226 + } 1.2227 + count --; 1.2228 + } 1.2229 + } 1.2230 + return USEARCH_DONE; 1.2231 +} 1.2232 + 1.2233 +/** 1.2234 +* Take the rearranged start accents and tries matching. If match failed at 1.2235 +* a seperate following set of accents (seperated from the rearranged on by 1.2236 +* at least a base character) then we rearrange the preceding accents and 1.2237 +* tries matching again. 1.2238 +* We allow skipping of the ends of the accent set if the ces do not match. 1.2239 +* However if the failure is found before the accent set, it fails. 1.2240 +* Internal method, status assumed to be success, caller has to check status 1.2241 +* before calling this method. 1.2242 +* @param strsrch string search data 1.2243 +* @param textoffset of the ends of the rearranged accent 1.2244 +* @param status output error status if any 1.2245 +* @return USEARCH_DONE if a match is not found, otherwise return the ending 1.2246 +* offset of the match. Note this start includes all following accents. 1.2247 +*/ 1.2248 +static 1.2249 +int32_t doPreviousCanonicalPrefixMatch(UStringSearch *strsrch, 1.2250 + int32_t textoffset, 1.2251 + UErrorCode *status) 1.2252 +{ 1.2253 + const UChar *text = strsrch->search->text; 1.2254 + const UCollator *collator = strsrch->collator; 1.2255 + int32_t safelength = 0; 1.2256 + UChar *safetext; 1.2257 + int32_t safetextlength; 1.2258 + UChar safebuffer[INITIAL_ARRAY_SIZE_]; 1.2259 + int32_t safeoffset = textoffset; 1.2260 + 1.2261 + if (textoffset && 1.2262 + ucol_unsafeCP(strsrch->canonicalPrefixAccents[ 1.2263 + u_strlen(strsrch->canonicalPrefixAccents) - 1 1.2264 + ], collator)) { 1.2265 + safeoffset = getNextSafeOffset(collator, text, textoffset, 1.2266 + strsrch->search->textLength); 1.2267 + safelength = safeoffset - textoffset; 1.2268 + safetextlength = INITIAL_ARRAY_SIZE_; 1.2269 + safetext = addToUCharArray(safebuffer, &safetextlength, 1.2270 + strsrch->canonicalPrefixAccents, 1.2271 + text + textoffset, safelength, 1.2272 + NULL, status); 1.2273 + } 1.2274 + else { 1.2275 + safetextlength = u_strlen(strsrch->canonicalPrefixAccents); 1.2276 + safetext = strsrch->canonicalPrefixAccents; 1.2277 + } 1.2278 + 1.2279 + UCollationElements *coleiter = strsrch->utilIter; 1.2280 + // if status is a failure, ucol_setText does nothing 1.2281 + ucol_setText(coleiter, safetext, safetextlength, status); 1.2282 + // status checked in loop below 1.2283 + 1.2284 + int32_t *ce = strsrch->pattern.CE; 1.2285 + int32_t celength = strsrch->pattern.CELength; 1.2286 + int ceindex = 0; 1.2287 + UBool isSafe = TRUE; // safe zone indication flag for position 1.2288 + int32_t prefixlength = u_strlen(strsrch->canonicalPrefixAccents); 1.2289 + 1.2290 + while (ceindex < celength) { 1.2291 + int32_t textce = ucol_next(coleiter, status); 1.2292 + if (U_FAILURE(*status)) { 1.2293 + if (isSafe) { 1.2294 + cleanUpSafeText(strsrch, safetext, safebuffer); 1.2295 + } 1.2296 + return USEARCH_DONE; 1.2297 + } 1.2298 + if (textce == UCOL_NULLORDER) { 1.2299 + // check if we have passed the safe buffer 1.2300 + if (coleiter == strsrch->textIter) { 1.2301 + cleanUpSafeText(strsrch, safetext, safebuffer); 1.2302 + return USEARCH_DONE; 1.2303 + } 1.2304 + cleanUpSafeText(strsrch, safetext, safebuffer); 1.2305 + safetext = safebuffer; 1.2306 + coleiter = strsrch->textIter; 1.2307 + setColEIterOffset(coleiter, safeoffset); 1.2308 + // status checked at the start of the loop 1.2309 + isSafe = FALSE; 1.2310 + continue; 1.2311 + } 1.2312 + textce = getCE(strsrch, textce); 1.2313 + if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) { 1.2314 + // do the beginning stuff 1.2315 + int32_t failedoffset = ucol_getOffset(coleiter); 1.2316 + if (isSafe && failedoffset <= prefixlength) { 1.2317 + // alas... no hope. failed at rearranged accent set 1.2318 + cleanUpSafeText(strsrch, safetext, safebuffer); 1.2319 + return USEARCH_DONE; 1.2320 + } 1.2321 + else { 1.2322 + if (isSafe) { 1.2323 + failedoffset = safeoffset - failedoffset; 1.2324 + cleanUpSafeText(strsrch, safetext, safebuffer); 1.2325 + } 1.2326 + 1.2327 + // try rearranging the end accents 1.2328 + int32_t result = doPreviousCanonicalSuffixMatch(strsrch, 1.2329 + textoffset, failedoffset, status); 1.2330 + if (result != USEARCH_DONE) { 1.2331 + // if status is a failure, ucol_setOffset does nothing 1.2332 + setColEIterOffset(strsrch->textIter, result); 1.2333 + } 1.2334 + if (U_FAILURE(*status)) { 1.2335 + return USEARCH_DONE; 1.2336 + } 1.2337 + return result; 1.2338 + } 1.2339 + } 1.2340 + if (textce == ce[ceindex]) { 1.2341 + ceindex ++; 1.2342 + } 1.2343 + } 1.2344 + // set offset here 1.2345 + if (isSafe) { 1.2346 + int32_t result = ucol_getOffset(coleiter); 1.2347 + // sets the text iterator here with the correct expansion and offset 1.2348 + int32_t leftoverces = getExpansionSuffix(coleiter); 1.2349 + cleanUpSafeText(strsrch, safetext, safebuffer); 1.2350 + if (result <= prefixlength) { 1.2351 + result = textoffset; 1.2352 + } 1.2353 + else { 1.2354 + result = textoffset + (safeoffset - result); 1.2355 + } 1.2356 + setColEIterOffset(strsrch->textIter, result); 1.2357 + setExpansionSuffix(strsrch->textIter, leftoverces); 1.2358 + return result; 1.2359 + } 1.2360 + 1.2361 + return ucol_getOffset(coleiter); 1.2362 +} 1.2363 + 1.2364 +/** 1.2365 +* Trying out the substring and sees if it can be a canonical match. 1.2366 +* This will try normalizing the starting accents and arranging them into 1.2367 +* canonical equivalents and check their corresponding ces with the pattern ce. 1.2368 +* Prefix accents in the text will be grouped according to their combining 1.2369 +* class and the groups will be mixed and matched to try find the perfect 1.2370 +* match with the pattern. 1.2371 +* So for instance looking for "\u0301" in "\u030A\u0301\u0325" 1.2372 +* step 1: split "\u030A\u0301" into 6 other type of potential accent substrings 1.2373 +* "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325", 1.2374 +* "\u0301\u0325". 1.2375 +* step 2: check if any of the generated substrings matches the pattern. 1.2376 +* Internal method, status assumed to be success, caller has to check status 1.2377 +* before calling this method. 1.2378 +* @param strsrch string search data 1.2379 +* @param textoffset start offset in the collation element text that starts 1.2380 +* with the accents to be rearranged 1.2381 +* @param status output error status if any 1.2382 +* @return TRUE if the match is valid, FALSE otherwise 1.2383 +*/ 1.2384 +static 1.2385 +UBool doPreviousCanonicalMatch(UStringSearch *strsrch, 1.2386 + int32_t textoffset, 1.2387 + UErrorCode *status) 1.2388 +{ 1.2389 + const UChar *text = strsrch->search->text; 1.2390 + int32_t temp = textoffset; 1.2391 + int32_t textlength = strsrch->search->textLength; 1.2392 + if ((getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) == 0) { 1.2393 + UCollationElements *coleiter = strsrch->textIter; 1.2394 + int32_t offset = ucol_getOffset(coleiter); 1.2395 + if (strsrch->pattern.hasSuffixAccents) { 1.2396 + offset = doPreviousCanonicalSuffixMatch(strsrch, textoffset, 1.2397 + offset, status); 1.2398 + if (U_SUCCESS(*status) && offset != USEARCH_DONE) { 1.2399 + setColEIterOffset(coleiter, offset); 1.2400 + return TRUE; 1.2401 + } 1.2402 + } 1.2403 + return FALSE; 1.2404 + } 1.2405 + 1.2406 + if (!strsrch->pattern.hasPrefixAccents) { 1.2407 + return FALSE; 1.2408 + } 1.2409 + 1.2410 + UChar accents[INITIAL_ARRAY_SIZE_]; 1.2411 + // offset to the last base character in substring to search 1.2412 + int32_t baseoffset = getNextBaseOffset(text, textoffset, textlength); 1.2413 + // normalizing the offensive string 1.2414 + unorm_normalize(text + textoffset, baseoffset - textoffset, UNORM_NFD, 1.2415 + 0, accents, INITIAL_ARRAY_SIZE_, status); 1.2416 + // status checked in loop 1.2417 + 1.2418 + int32_t accentsindex[INITIAL_ARRAY_SIZE_]; 1.2419 + int32_t size = getUnblockedAccentIndex(accents, accentsindex); 1.2420 + 1.2421 + // 2 power n - 1 plus the full set of accents 1.2422 + int32_t count = (2 << (size - 1)) - 1; 1.2423 + while (U_SUCCESS(*status) && count > 0) { 1.2424 + UChar *rearrange = strsrch->canonicalPrefixAccents; 1.2425 + // copy the base characters 1.2426 + for (int k = 0; k < accentsindex[0]; k ++) { 1.2427 + *rearrange ++ = accents[k]; 1.2428 + } 1.2429 + // forming all possible canonical rearrangement by dropping 1.2430 + // sets of accents 1.2431 + for (int i = 0; i <= size - 1; i ++) { 1.2432 + int32_t mask = 1 << (size - i - 1); 1.2433 + if (count & mask) { 1.2434 + for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) { 1.2435 + *rearrange ++ = accents[j]; 1.2436 + } 1.2437 + } 1.2438 + } 1.2439 + *rearrange = 0; 1.2440 + int32_t offset = doPreviousCanonicalPrefixMatch(strsrch, 1.2441 + baseoffset, status); 1.2442 + if (offset != USEARCH_DONE) { 1.2443 + return TRUE; // match found 1.2444 + } 1.2445 + count --; 1.2446 + } 1.2447 + return FALSE; 1.2448 +} 1.2449 + 1.2450 +/** 1.2451 +* Checks match for contraction. 1.2452 +* If the match starts with a partial contraction we fail. 1.2453 +* Internal method, status assumed to be success, caller has to check status 1.2454 +* before calling this method. 1.2455 +* @param strsrch string search data 1.2456 +* @param start offset of potential match, to be modified if necessary 1.2457 +* @param end offset of potential match, to be modified if necessary 1.2458 +* @param status only error status if any 1.2459 +* @return TRUE if match passes the contraction test, FALSE otherwise 1.2460 +*/ 1.2461 +static 1.2462 +UBool checkPreviousCanonicalContractionMatch(UStringSearch *strsrch, 1.2463 + int32_t *start, 1.2464 + int32_t *end, UErrorCode *status) 1.2465 +{ 1.2466 + UCollationElements *coleiter = strsrch->textIter; 1.2467 + int32_t textlength = strsrch->search->textLength; 1.2468 + int32_t temp = *end; 1.2469 + const UCollator *collator = strsrch->collator; 1.2470 + const UChar *text = strsrch->search->text; 1.2471 + // This part checks if either if the start of the match contains potential 1.2472 + // contraction. If so we'll have to iterate through them 1.2473 + // Since we used ucol_next while previously looking for the potential 1.2474 + // match, this guarantees that our end will not be a partial contraction, 1.2475 + // or a partial supplementary character. 1.2476 + if (*start < textlength && ucol_unsafeCP(text[*start], collator)) { 1.2477 + int32_t expansion = getExpansionSuffix(coleiter); 1.2478 + UBool expandflag = expansion > 0; 1.2479 + setColEIterOffset(coleiter, *end); 1.2480 + while (expansion > 0) { 1.2481 + // getting rid of the redundant ce 1.2482 + // since forward contraction/expansion may have extra ces 1.2483 + // if we are in the normalization buffer, hasAccentsBeforeMatch 1.2484 + // would have taken care of it. 1.2485 + // E.g. the character \u01FA will have an expansion of 3, but if 1.2486 + // we are only looking for A ring A\u030A, we'll have to skip the 1.2487 + // last ce in the expansion buffer 1.2488 + ucol_previous(coleiter, status); 1.2489 + if (U_FAILURE(*status)) { 1.2490 + return FALSE; 1.2491 + } 1.2492 + if (ucol_getOffset(coleiter) != temp) { 1.2493 + *end = temp; 1.2494 + temp = ucol_getOffset(coleiter); 1.2495 + } 1.2496 + expansion --; 1.2497 + } 1.2498 + 1.2499 + int32_t *patternce = strsrch->pattern.CE; 1.2500 + int32_t patterncelength = strsrch->pattern.CELength; 1.2501 + int32_t count = patterncelength; 1.2502 + while (count > 0) { 1.2503 + int32_t ce = getCE(strsrch, ucol_previous(coleiter, status)); 1.2504 + // status checked below, note that if status is a failure 1.2505 + // ucol_previous returns UCOL_NULLORDER 1.2506 + if (ce == UCOL_IGNORABLE) { 1.2507 + continue; 1.2508 + } 1.2509 + if (expandflag && count == 0 && 1.2510 + getColElemIterOffset(coleiter, FALSE) != temp) { 1.2511 + *end = temp; 1.2512 + temp = ucol_getOffset(coleiter); 1.2513 + } 1.2514 + if (count == patterncelength && 1.2515 + ce != patternce[patterncelength - 1]) { 1.2516 + // accents may have extra starting ces, this occurs when a 1.2517 + // pure accent pattern is matched without rearrangement 1.2518 + int32_t expected = patternce[patterncelength - 1]; 1.2519 + U16_BACK_1(text, 0, *end); 1.2520 + if (getFCD(text, end, textlength) & LAST_BYTE_MASK_) { 1.2521 + ce = getCE(strsrch, ucol_previous(coleiter, status)); 1.2522 + while (U_SUCCESS(*status) && ce != expected && 1.2523 + ce != UCOL_NULLORDER && 1.2524 + ucol_getOffset(coleiter) <= *start) { 1.2525 + ce = getCE(strsrch, ucol_previous(coleiter, status)); 1.2526 + } 1.2527 + } 1.2528 + } 1.2529 + if (U_FAILURE(*status) || ce != patternce[count - 1]) { 1.2530 + (*start) --; 1.2531 + *start = getPreviousBaseOffset(text, *start); 1.2532 + return FALSE; 1.2533 + } 1.2534 + count --; 1.2535 + } 1.2536 + } 1.2537 + return TRUE; 1.2538 +} 1.2539 + 1.2540 +/** 1.2541 +* Checks and sets the match information if found. 1.2542 +* Checks 1.2543 +* <ul> 1.2544 +* <li> the potential match does not repeat the previous match 1.2545 +* <li> boundaries are correct 1.2546 +* <li> potential match does not end in the middle of a contraction 1.2547 +* <li> identical matches 1.2548 +* <\ul> 1.2549 +* Otherwise the offset will be shifted to the next character. 1.2550 +* Internal method, status assumed to be success, caller has to check status 1.2551 +* before calling this method. 1.2552 +* @param strsrch string search data 1.2553 +* @param textoffset offset in the collation element text. the returned value 1.2554 +* will be the truncated start offset of the match or the new start 1.2555 +* search offset. 1.2556 +* @param status only error status if any 1.2557 +* @return TRUE if the match is valid, FALSE otherwise 1.2558 +*/ 1.2559 +static 1.2560 +inline UBool checkPreviousCanonicalMatch(UStringSearch *strsrch, 1.2561 + int32_t *textoffset, 1.2562 + UErrorCode *status) 1.2563 +{ 1.2564 + // to ensure that the start and ends are not composite characters 1.2565 + UCollationElements *coleiter = strsrch->textIter; 1.2566 + // if we have a canonical accent match 1.2567 + if ((strsrch->pattern.hasSuffixAccents && 1.2568 + strsrch->canonicalSuffixAccents[0]) || 1.2569 + (strsrch->pattern.hasPrefixAccents && 1.2570 + strsrch->canonicalPrefixAccents[0])) { 1.2571 + strsrch->search->matchedIndex = *textoffset; 1.2572 + strsrch->search->matchedLength = 1.2573 + getNextUStringSearchBaseOffset(strsrch, 1.2574 + getColElemIterOffset(coleiter, FALSE)) 1.2575 + - *textoffset; 1.2576 + return TRUE; 1.2577 + } 1.2578 + 1.2579 + int32_t end = ucol_getOffset(coleiter); 1.2580 + if (!checkPreviousCanonicalContractionMatch(strsrch, textoffset, &end, 1.2581 + status) || 1.2582 + U_FAILURE(*status)) { 1.2583 + return FALSE; 1.2584 + } 1.2585 + 1.2586 + end = getNextUStringSearchBaseOffset(strsrch, end); 1.2587 + // this totally matches, however we need to check if it is repeating 1.2588 + if (checkRepeatedMatch(strsrch, *textoffset, end) || 1.2589 + !isBreakUnit(strsrch, *textoffset, end) || 1.2590 + !checkIdentical(strsrch, *textoffset, end)) { 1.2591 + (*textoffset) --; 1.2592 + *textoffset = getPreviousBaseOffset(strsrch->search->text, 1.2593 + *textoffset); 1.2594 + return FALSE; 1.2595 + } 1.2596 + 1.2597 + strsrch->search->matchedIndex = *textoffset; 1.2598 + strsrch->search->matchedLength = end - *textoffset; 1.2599 + return TRUE; 1.2600 +} 1.2601 +#endif // #if BOYER_MOORE 1.2602 + 1.2603 +// constructors and destructor ------------------------------------------- 1.2604 + 1.2605 +U_CAPI UStringSearch * U_EXPORT2 usearch_open(const UChar *pattern, 1.2606 + int32_t patternlength, 1.2607 + const UChar *text, 1.2608 + int32_t textlength, 1.2609 + const char *locale, 1.2610 + UBreakIterator *breakiter, 1.2611 + UErrorCode *status) 1.2612 +{ 1.2613 + if (U_FAILURE(*status)) { 1.2614 + return NULL; 1.2615 + } 1.2616 +#if UCONFIG_NO_BREAK_ITERATION 1.2617 + if (breakiter != NULL) { 1.2618 + *status = U_UNSUPPORTED_ERROR; 1.2619 + return NULL; 1.2620 + } 1.2621 +#endif 1.2622 + if (locale) { 1.2623 + // ucol_open internally checks for status 1.2624 + UCollator *collator = ucol_open(locale, status); 1.2625 + // pattern, text checks are done in usearch_openFromCollator 1.2626 + UStringSearch *result = usearch_openFromCollator(pattern, 1.2627 + patternlength, text, textlength, 1.2628 + collator, breakiter, status); 1.2629 + 1.2630 + if (result == NULL || U_FAILURE(*status)) { 1.2631 + if (collator) { 1.2632 + ucol_close(collator); 1.2633 + } 1.2634 + return NULL; 1.2635 + } 1.2636 + else { 1.2637 + result->ownCollator = TRUE; 1.2638 + } 1.2639 + return result; 1.2640 + } 1.2641 + *status = U_ILLEGAL_ARGUMENT_ERROR; 1.2642 + return NULL; 1.2643 +} 1.2644 + 1.2645 +U_CAPI UStringSearch * U_EXPORT2 usearch_openFromCollator( 1.2646 + const UChar *pattern, 1.2647 + int32_t patternlength, 1.2648 + const UChar *text, 1.2649 + int32_t textlength, 1.2650 + const UCollator *collator, 1.2651 + UBreakIterator *breakiter, 1.2652 + UErrorCode *status) 1.2653 +{ 1.2654 + if (U_FAILURE(*status)) { 1.2655 + return NULL; 1.2656 + } 1.2657 +#if UCONFIG_NO_BREAK_ITERATION 1.2658 + if (breakiter != NULL) { 1.2659 + *status = U_UNSUPPORTED_ERROR; 1.2660 + return NULL; 1.2661 + } 1.2662 +#endif 1.2663 + if (pattern == NULL || text == NULL || collator == NULL) { 1.2664 + *status = U_ILLEGAL_ARGUMENT_ERROR; 1.2665 + return NULL; 1.2666 + } 1.2667 + 1.2668 + // string search does not really work when numeric collation is turned on 1.2669 + if(ucol_getAttribute(collator, UCOL_NUMERIC_COLLATION, status) == UCOL_ON) { 1.2670 + *status = U_UNSUPPORTED_ERROR; 1.2671 + return NULL; 1.2672 + } 1.2673 + 1.2674 + if (U_SUCCESS(*status)) { 1.2675 + initializeFCD(status); 1.2676 + if (U_FAILURE(*status)) { 1.2677 + return NULL; 1.2678 + } 1.2679 + 1.2680 + UStringSearch *result; 1.2681 + if (textlength == -1) { 1.2682 + textlength = u_strlen(text); 1.2683 + } 1.2684 + if (patternlength == -1) { 1.2685 + patternlength = u_strlen(pattern); 1.2686 + } 1.2687 + if (textlength <= 0 || patternlength <= 0) { 1.2688 + *status = U_ILLEGAL_ARGUMENT_ERROR; 1.2689 + return NULL; 1.2690 + } 1.2691 + 1.2692 + result = (UStringSearch *)uprv_malloc(sizeof(UStringSearch)); 1.2693 + if (result == NULL) { 1.2694 + *status = U_MEMORY_ALLOCATION_ERROR; 1.2695 + return NULL; 1.2696 + } 1.2697 + 1.2698 + result->collator = collator; 1.2699 + result->strength = ucol_getStrength(collator); 1.2700 + result->ceMask = getMask(result->strength); 1.2701 + result->toShift = 1.2702 + ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) == 1.2703 + UCOL_SHIFTED; 1.2704 + result->variableTop = ucol_getVariableTop(collator, status); 1.2705 + 1.2706 + result->nfd = Normalizer2Factory::getNFDInstance(*status); 1.2707 + 1.2708 + if (U_FAILURE(*status)) { 1.2709 + uprv_free(result); 1.2710 + return NULL; 1.2711 + } 1.2712 + 1.2713 + result->search = (USearch *)uprv_malloc(sizeof(USearch)); 1.2714 + if (result->search == NULL) { 1.2715 + *status = U_MEMORY_ALLOCATION_ERROR; 1.2716 + uprv_free(result); 1.2717 + return NULL; 1.2718 + } 1.2719 + 1.2720 + result->search->text = text; 1.2721 + result->search->textLength = textlength; 1.2722 + 1.2723 + result->pattern.text = pattern; 1.2724 + result->pattern.textLength = patternlength; 1.2725 + result->pattern.CE = NULL; 1.2726 + result->pattern.PCE = NULL; 1.2727 + 1.2728 + result->search->breakIter = breakiter; 1.2729 +#if !UCONFIG_NO_BREAK_ITERATION 1.2730 + result->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(result->collator, ULOC_VALID_LOCALE, status), text, textlength, status); 1.2731 + if (breakiter) { 1.2732 + ubrk_setText(breakiter, text, textlength, status); 1.2733 + } 1.2734 +#endif 1.2735 + 1.2736 + result->ownCollator = FALSE; 1.2737 + result->search->matchedLength = 0; 1.2738 + result->search->matchedIndex = USEARCH_DONE; 1.2739 + result->utilIter = NULL; 1.2740 + result->textIter = ucol_openElements(collator, text, 1.2741 + textlength, status); 1.2742 + if (U_FAILURE(*status)) { 1.2743 + usearch_close(result); 1.2744 + return NULL; 1.2745 + } 1.2746 + 1.2747 + result->search->isOverlap = FALSE; 1.2748 + result->search->isCanonicalMatch = FALSE; 1.2749 + result->search->elementComparisonType = 0; 1.2750 + result->search->isForwardSearching = TRUE; 1.2751 + result->search->reset = TRUE; 1.2752 + 1.2753 + initialize(result, status); 1.2754 + 1.2755 + if (U_FAILURE(*status)) { 1.2756 + usearch_close(result); 1.2757 + return NULL; 1.2758 + } 1.2759 + 1.2760 + return result; 1.2761 + } 1.2762 + return NULL; 1.2763 +} 1.2764 + 1.2765 +U_CAPI void U_EXPORT2 usearch_close(UStringSearch *strsrch) 1.2766 +{ 1.2767 + if (strsrch) { 1.2768 + if (strsrch->pattern.CE != strsrch->pattern.CEBuffer && 1.2769 + strsrch->pattern.CE) { 1.2770 + uprv_free(strsrch->pattern.CE); 1.2771 + } 1.2772 + 1.2773 + if (strsrch->pattern.PCE != NULL && 1.2774 + strsrch->pattern.PCE != strsrch->pattern.PCEBuffer) { 1.2775 + uprv_free(strsrch->pattern.PCE); 1.2776 + } 1.2777 + 1.2778 + ucol_closeElements(strsrch->textIter); 1.2779 + ucol_closeElements(strsrch->utilIter); 1.2780 + 1.2781 + if (strsrch->ownCollator && strsrch->collator) { 1.2782 + ucol_close((UCollator *)strsrch->collator); 1.2783 + } 1.2784 + 1.2785 +#if !UCONFIG_NO_BREAK_ITERATION 1.2786 + if (strsrch->search->internalBreakIter) { 1.2787 + ubrk_close(strsrch->search->internalBreakIter); 1.2788 + } 1.2789 +#endif 1.2790 + 1.2791 + uprv_free(strsrch->search); 1.2792 + uprv_free(strsrch); 1.2793 + } 1.2794 +} 1.2795 + 1.2796 +// set and get methods -------------------------------------------------- 1.2797 + 1.2798 +U_CAPI void U_EXPORT2 usearch_setOffset(UStringSearch *strsrch, 1.2799 + int32_t position, 1.2800 + UErrorCode *status) 1.2801 +{ 1.2802 + if (U_SUCCESS(*status) && strsrch) { 1.2803 + if (isOutOfBounds(strsrch->search->textLength, position)) { 1.2804 + *status = U_INDEX_OUTOFBOUNDS_ERROR; 1.2805 + } 1.2806 + else { 1.2807 + setColEIterOffset(strsrch->textIter, position); 1.2808 + } 1.2809 + strsrch->search->matchedIndex = USEARCH_DONE; 1.2810 + strsrch->search->matchedLength = 0; 1.2811 + strsrch->search->reset = FALSE; 1.2812 + } 1.2813 +} 1.2814 + 1.2815 +U_CAPI int32_t U_EXPORT2 usearch_getOffset(const UStringSearch *strsrch) 1.2816 +{ 1.2817 + if (strsrch) { 1.2818 + int32_t result = ucol_getOffset(strsrch->textIter); 1.2819 + if (isOutOfBounds(strsrch->search->textLength, result)) { 1.2820 + return USEARCH_DONE; 1.2821 + } 1.2822 + return result; 1.2823 + } 1.2824 + return USEARCH_DONE; 1.2825 +} 1.2826 + 1.2827 +U_CAPI void U_EXPORT2 usearch_setAttribute(UStringSearch *strsrch, 1.2828 + USearchAttribute attribute, 1.2829 + USearchAttributeValue value, 1.2830 + UErrorCode *status) 1.2831 +{ 1.2832 + if (U_SUCCESS(*status) && strsrch) { 1.2833 + switch (attribute) 1.2834 + { 1.2835 + case USEARCH_OVERLAP : 1.2836 + strsrch->search->isOverlap = (value == USEARCH_ON ? TRUE : FALSE); 1.2837 + break; 1.2838 + case USEARCH_CANONICAL_MATCH : 1.2839 + strsrch->search->isCanonicalMatch = (value == USEARCH_ON ? TRUE : 1.2840 + FALSE); 1.2841 + break; 1.2842 + case USEARCH_ELEMENT_COMPARISON : 1.2843 + if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) { 1.2844 + strsrch->search->elementComparisonType = (int16_t)value; 1.2845 + } else { 1.2846 + strsrch->search->elementComparisonType = 0; 1.2847 + } 1.2848 + break; 1.2849 + case USEARCH_ATTRIBUTE_COUNT : 1.2850 + default: 1.2851 + *status = U_ILLEGAL_ARGUMENT_ERROR; 1.2852 + } 1.2853 + } 1.2854 + if (value == USEARCH_ATTRIBUTE_VALUE_COUNT) { 1.2855 + *status = U_ILLEGAL_ARGUMENT_ERROR; 1.2856 + } 1.2857 +} 1.2858 + 1.2859 +U_CAPI USearchAttributeValue U_EXPORT2 usearch_getAttribute( 1.2860 + const UStringSearch *strsrch, 1.2861 + USearchAttribute attribute) 1.2862 +{ 1.2863 + if (strsrch) { 1.2864 + switch (attribute) { 1.2865 + case USEARCH_OVERLAP : 1.2866 + return (strsrch->search->isOverlap == TRUE ? USEARCH_ON : 1.2867 + USEARCH_OFF); 1.2868 + case USEARCH_CANONICAL_MATCH : 1.2869 + return (strsrch->search->isCanonicalMatch == TRUE ? USEARCH_ON : 1.2870 + USEARCH_OFF); 1.2871 + case USEARCH_ELEMENT_COMPARISON : 1.2872 + { 1.2873 + int16_t value = strsrch->search->elementComparisonType; 1.2874 + if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) { 1.2875 + return (USearchAttributeValue)value; 1.2876 + } else { 1.2877 + return USEARCH_STANDARD_ELEMENT_COMPARISON; 1.2878 + } 1.2879 + } 1.2880 + case USEARCH_ATTRIBUTE_COUNT : 1.2881 + return USEARCH_DEFAULT; 1.2882 + } 1.2883 + } 1.2884 + return USEARCH_DEFAULT; 1.2885 +} 1.2886 + 1.2887 +U_CAPI int32_t U_EXPORT2 usearch_getMatchedStart( 1.2888 + const UStringSearch *strsrch) 1.2889 +{ 1.2890 + if (strsrch == NULL) { 1.2891 + return USEARCH_DONE; 1.2892 + } 1.2893 + return strsrch->search->matchedIndex; 1.2894 +} 1.2895 + 1.2896 + 1.2897 +U_CAPI int32_t U_EXPORT2 usearch_getMatchedText(const UStringSearch *strsrch, 1.2898 + UChar *result, 1.2899 + int32_t resultCapacity, 1.2900 + UErrorCode *status) 1.2901 +{ 1.2902 + if (U_FAILURE(*status)) { 1.2903 + return USEARCH_DONE; 1.2904 + } 1.2905 + if (strsrch == NULL || resultCapacity < 0 || (resultCapacity > 0 && 1.2906 + result == NULL)) { 1.2907 + *status = U_ILLEGAL_ARGUMENT_ERROR; 1.2908 + return USEARCH_DONE; 1.2909 + } 1.2910 + 1.2911 + int32_t copylength = strsrch->search->matchedLength; 1.2912 + int32_t copyindex = strsrch->search->matchedIndex; 1.2913 + if (copyindex == USEARCH_DONE) { 1.2914 + u_terminateUChars(result, resultCapacity, 0, status); 1.2915 + return USEARCH_DONE; 1.2916 + } 1.2917 + 1.2918 + if (resultCapacity < copylength) { 1.2919 + copylength = resultCapacity; 1.2920 + } 1.2921 + if (copylength > 0) { 1.2922 + uprv_memcpy(result, strsrch->search->text + copyindex, 1.2923 + copylength * sizeof(UChar)); 1.2924 + } 1.2925 + return u_terminateUChars(result, resultCapacity, 1.2926 + strsrch->search->matchedLength, status); 1.2927 +} 1.2928 + 1.2929 +U_CAPI int32_t U_EXPORT2 usearch_getMatchedLength( 1.2930 + const UStringSearch *strsrch) 1.2931 +{ 1.2932 + if (strsrch) { 1.2933 + return strsrch->search->matchedLength; 1.2934 + } 1.2935 + return USEARCH_DONE; 1.2936 +} 1.2937 + 1.2938 +#if !UCONFIG_NO_BREAK_ITERATION 1.2939 + 1.2940 +U_CAPI void U_EXPORT2 usearch_setBreakIterator(UStringSearch *strsrch, 1.2941 + UBreakIterator *breakiter, 1.2942 + UErrorCode *status) 1.2943 +{ 1.2944 + if (U_SUCCESS(*status) && strsrch) { 1.2945 + strsrch->search->breakIter = breakiter; 1.2946 + if (breakiter) { 1.2947 + ubrk_setText(breakiter, strsrch->search->text, 1.2948 + strsrch->search->textLength, status); 1.2949 + } 1.2950 + } 1.2951 +} 1.2952 + 1.2953 +U_CAPI const UBreakIterator* U_EXPORT2 1.2954 +usearch_getBreakIterator(const UStringSearch *strsrch) 1.2955 +{ 1.2956 + if (strsrch) { 1.2957 + return strsrch->search->breakIter; 1.2958 + } 1.2959 + return NULL; 1.2960 +} 1.2961 + 1.2962 +#endif 1.2963 + 1.2964 +U_CAPI void U_EXPORT2 usearch_setText( UStringSearch *strsrch, 1.2965 + const UChar *text, 1.2966 + int32_t textlength, 1.2967 + UErrorCode *status) 1.2968 +{ 1.2969 + if (U_SUCCESS(*status)) { 1.2970 + if (strsrch == NULL || text == NULL || textlength < -1 || 1.2971 + textlength == 0) { 1.2972 + *status = U_ILLEGAL_ARGUMENT_ERROR; 1.2973 + } 1.2974 + else { 1.2975 + if (textlength == -1) { 1.2976 + textlength = u_strlen(text); 1.2977 + } 1.2978 + strsrch->search->text = text; 1.2979 + strsrch->search->textLength = textlength; 1.2980 + ucol_setText(strsrch->textIter, text, textlength, status); 1.2981 + strsrch->search->matchedIndex = USEARCH_DONE; 1.2982 + strsrch->search->matchedLength = 0; 1.2983 + strsrch->search->reset = TRUE; 1.2984 +#if !UCONFIG_NO_BREAK_ITERATION 1.2985 + if (strsrch->search->breakIter != NULL) { 1.2986 + ubrk_setText(strsrch->search->breakIter, text, 1.2987 + textlength, status); 1.2988 + } 1.2989 + ubrk_setText(strsrch->search->internalBreakIter, text, textlength, status); 1.2990 +#endif 1.2991 + } 1.2992 + } 1.2993 +} 1.2994 + 1.2995 +U_CAPI const UChar * U_EXPORT2 usearch_getText(const UStringSearch *strsrch, 1.2996 + int32_t *length) 1.2997 +{ 1.2998 + if (strsrch) { 1.2999 + *length = strsrch->search->textLength; 1.3000 + return strsrch->search->text; 1.3001 + } 1.3002 + return NULL; 1.3003 +} 1.3004 + 1.3005 +U_CAPI void U_EXPORT2 usearch_setCollator( UStringSearch *strsrch, 1.3006 + const UCollator *collator, 1.3007 + UErrorCode *status) 1.3008 +{ 1.3009 + if (U_SUCCESS(*status)) { 1.3010 + if (collator == NULL) { 1.3011 + *status = U_ILLEGAL_ARGUMENT_ERROR; 1.3012 + return; 1.3013 + } 1.3014 + 1.3015 + if (strsrch) { 1.3016 + if (strsrch->ownCollator && (strsrch->collator != collator)) { 1.3017 + ucol_close((UCollator *)strsrch->collator); 1.3018 + strsrch->ownCollator = FALSE; 1.3019 + } 1.3020 + strsrch->collator = collator; 1.3021 + strsrch->strength = ucol_getStrength(collator); 1.3022 + strsrch->ceMask = getMask(strsrch->strength); 1.3023 +#if !UCONFIG_NO_BREAK_ITERATION 1.3024 + ubrk_close(strsrch->search->internalBreakIter); 1.3025 + strsrch->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(collator, ULOC_VALID_LOCALE, status), 1.3026 + strsrch->search->text, strsrch->search->textLength, status); 1.3027 +#endif 1.3028 + // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT 1.3029 + strsrch->toShift = 1.3030 + ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) == 1.3031 + UCOL_SHIFTED; 1.3032 + // if status is a failure, ucol_getVariableTop returns 0 1.3033 + strsrch->variableTop = ucol_getVariableTop(collator, status); 1.3034 + if (U_SUCCESS(*status)) { 1.3035 + initialize(strsrch, status); 1.3036 + if (U_SUCCESS(*status)) { 1.3037 + /* free offset buffer to avoid memory leak before initializing. */ 1.3038 + ucol_freeOffsetBuffer(&(strsrch->textIter->iteratordata_)); 1.3039 + uprv_init_collIterate(collator, strsrch->search->text, 1.3040 + strsrch->search->textLength, 1.3041 + &(strsrch->textIter->iteratordata_), 1.3042 + status); 1.3043 + strsrch->utilIter->iteratordata_.coll = collator; 1.3044 + } 1.3045 + } 1.3046 + } 1.3047 + 1.3048 + // **** are these calls needed? 1.3049 + // **** we call uprv_init_pce in initializePatternPCETable 1.3050 + // **** and the CEBuffer constructor... 1.3051 +#if 0 1.3052 + uprv_init_pce(strsrch->textIter); 1.3053 + uprv_init_pce(strsrch->utilIter); 1.3054 +#endif 1.3055 + } 1.3056 +} 1.3057 + 1.3058 +U_CAPI UCollator * U_EXPORT2 usearch_getCollator(const UStringSearch *strsrch) 1.3059 +{ 1.3060 + if (strsrch) { 1.3061 + return (UCollator *)strsrch->collator; 1.3062 + } 1.3063 + return NULL; 1.3064 +} 1.3065 + 1.3066 +U_CAPI void U_EXPORT2 usearch_setPattern( UStringSearch *strsrch, 1.3067 + const UChar *pattern, 1.3068 + int32_t patternlength, 1.3069 + UErrorCode *status) 1.3070 +{ 1.3071 + if (U_SUCCESS(*status)) { 1.3072 + if (strsrch == NULL || pattern == NULL) { 1.3073 + *status = U_ILLEGAL_ARGUMENT_ERROR; 1.3074 + } 1.3075 + else { 1.3076 + if (patternlength == -1) { 1.3077 + patternlength = u_strlen(pattern); 1.3078 + } 1.3079 + if (patternlength == 0) { 1.3080 + *status = U_ILLEGAL_ARGUMENT_ERROR; 1.3081 + return; 1.3082 + } 1.3083 + strsrch->pattern.text = pattern; 1.3084 + strsrch->pattern.textLength = patternlength; 1.3085 + initialize(strsrch, status); 1.3086 + } 1.3087 + } 1.3088 +} 1.3089 + 1.3090 +U_CAPI const UChar* U_EXPORT2 1.3091 +usearch_getPattern(const UStringSearch *strsrch, 1.3092 + int32_t *length) 1.3093 +{ 1.3094 + if (strsrch) { 1.3095 + *length = strsrch->pattern.textLength; 1.3096 + return strsrch->pattern.text; 1.3097 + } 1.3098 + return NULL; 1.3099 +} 1.3100 + 1.3101 +// miscellanous methods -------------------------------------------------- 1.3102 + 1.3103 +U_CAPI int32_t U_EXPORT2 usearch_first(UStringSearch *strsrch, 1.3104 + UErrorCode *status) 1.3105 +{ 1.3106 + if (strsrch && U_SUCCESS(*status)) { 1.3107 + strsrch->search->isForwardSearching = TRUE; 1.3108 + usearch_setOffset(strsrch, 0, status); 1.3109 + if (U_SUCCESS(*status)) { 1.3110 + return usearch_next(strsrch, status); 1.3111 + } 1.3112 + } 1.3113 + return USEARCH_DONE; 1.3114 +} 1.3115 + 1.3116 +U_CAPI int32_t U_EXPORT2 usearch_following(UStringSearch *strsrch, 1.3117 + int32_t position, 1.3118 + UErrorCode *status) 1.3119 +{ 1.3120 + if (strsrch && U_SUCCESS(*status)) { 1.3121 + strsrch->search->isForwardSearching = TRUE; 1.3122 + // position checked in usearch_setOffset 1.3123 + usearch_setOffset(strsrch, position, status); 1.3124 + if (U_SUCCESS(*status)) { 1.3125 + return usearch_next(strsrch, status); 1.3126 + } 1.3127 + } 1.3128 + return USEARCH_DONE; 1.3129 +} 1.3130 + 1.3131 +U_CAPI int32_t U_EXPORT2 usearch_last(UStringSearch *strsrch, 1.3132 + UErrorCode *status) 1.3133 +{ 1.3134 + if (strsrch && U_SUCCESS(*status)) { 1.3135 + strsrch->search->isForwardSearching = FALSE; 1.3136 + usearch_setOffset(strsrch, strsrch->search->textLength, status); 1.3137 + if (U_SUCCESS(*status)) { 1.3138 + return usearch_previous(strsrch, status); 1.3139 + } 1.3140 + } 1.3141 + return USEARCH_DONE; 1.3142 +} 1.3143 + 1.3144 +U_CAPI int32_t U_EXPORT2 usearch_preceding(UStringSearch *strsrch, 1.3145 + int32_t position, 1.3146 + UErrorCode *status) 1.3147 +{ 1.3148 + if (strsrch && U_SUCCESS(*status)) { 1.3149 + strsrch->search->isForwardSearching = FALSE; 1.3150 + // position checked in usearch_setOffset 1.3151 + usearch_setOffset(strsrch, position, status); 1.3152 + if (U_SUCCESS(*status)) { 1.3153 + return usearch_previous(strsrch, status); 1.3154 + } 1.3155 + } 1.3156 + return USEARCH_DONE; 1.3157 +} 1.3158 + 1.3159 +/** 1.3160 +* If a direction switch is required, we'll count the number of ces till the 1.3161 +* beginning of the collation element iterator and iterate forwards that 1.3162 +* number of times. This is so that we get to the correct point within the 1.3163 +* string to continue the search in. Imagine when we are in the middle of the 1.3164 +* normalization buffer when the change in direction is request. arrrgghh.... 1.3165 +* After searching the offset within the collation element iterator will be 1.3166 +* shifted to the start of the match. If a match is not found, the offset would 1.3167 +* have been set to the end of the text string in the collation element 1.3168 +* iterator. 1.3169 +* Okay, here's my take on normalization buffer. The only time when there can 1.3170 +* be 2 matches within the same normalization is when the pattern is consists 1.3171 +* of all accents. But since the offset returned is from the text string, we 1.3172 +* should not confuse the caller by returning the second match within the 1.3173 +* same normalization buffer. If we do, the 2 results will have the same match 1.3174 +* offsets, and that'll be confusing. I'll return the next match that doesn't 1.3175 +* fall within the same normalization buffer. Note this does not affect the 1.3176 +* results of matches spanning the text and the normalization buffer. 1.3177 +* The position to start searching is taken from the collation element 1.3178 +* iterator. Callers of this API would have to set the offset in the collation 1.3179 +* element iterator before using this method. 1.3180 +*/ 1.3181 +U_CAPI int32_t U_EXPORT2 usearch_next(UStringSearch *strsrch, 1.3182 + UErrorCode *status) 1.3183 +{ 1.3184 + if (U_SUCCESS(*status) && strsrch) { 1.3185 + // note offset is either equivalent to the start of the previous match 1.3186 + // or is set by the user 1.3187 + int32_t offset = usearch_getOffset(strsrch); 1.3188 + USearch *search = strsrch->search; 1.3189 + search->reset = FALSE; 1.3190 + int32_t textlength = search->textLength; 1.3191 + if (search->isForwardSearching) { 1.3192 +#if BOYER_MOORE 1.3193 + if (offset == textlength 1.3194 + || (!search->isOverlap && 1.3195 + (offset + strsrch->pattern.defaultShiftSize > textlength || 1.3196 + (search->matchedIndex != USEARCH_DONE && 1.3197 + offset + search->matchedLength >= textlength)))) { 1.3198 + // not enough characters to match 1.3199 + setMatchNotFound(strsrch); 1.3200 + return USEARCH_DONE; 1.3201 + } 1.3202 +#else 1.3203 + if (offset == textlength || 1.3204 + (! search->isOverlap && 1.3205 + (search->matchedIndex != USEARCH_DONE && 1.3206 + offset + search->matchedLength > textlength))) { 1.3207 + // not enough characters to match 1.3208 + setMatchNotFound(strsrch); 1.3209 + return USEARCH_DONE; 1.3210 + } 1.3211 +#endif 1.3212 + } 1.3213 + else { 1.3214 + // switching direction. 1.3215 + // if matchedIndex == USEARCH_DONE, it means that either a 1.3216 + // setOffset has been called or that previous ran off the text 1.3217 + // string. the iterator would have been set to offset 0 if a 1.3218 + // match is not found. 1.3219 + search->isForwardSearching = TRUE; 1.3220 + if (search->matchedIndex != USEARCH_DONE) { 1.3221 + // there's no need to set the collation element iterator 1.3222 + // the next call to next will set the offset. 1.3223 + return search->matchedIndex; 1.3224 + } 1.3225 + } 1.3226 + 1.3227 + if (U_SUCCESS(*status)) { 1.3228 + if (strsrch->pattern.CELength == 0) { 1.3229 + if (search->matchedIndex == USEARCH_DONE) { 1.3230 + search->matchedIndex = offset; 1.3231 + } 1.3232 + else { // moves by codepoints 1.3233 + U16_FWD_1(search->text, search->matchedIndex, textlength); 1.3234 + } 1.3235 + 1.3236 + search->matchedLength = 0; 1.3237 + setColEIterOffset(strsrch->textIter, search->matchedIndex); 1.3238 + // status checked below 1.3239 + if (search->matchedIndex == textlength) { 1.3240 + search->matchedIndex = USEARCH_DONE; 1.3241 + } 1.3242 + } 1.3243 + else { 1.3244 + if (search->matchedLength > 0) { 1.3245 + // if matchlength is 0 we are at the start of the iteration 1.3246 + if (search->isOverlap) { 1.3247 + ucol_setOffset(strsrch->textIter, offset + 1, status); 1.3248 + } 1.3249 + else { 1.3250 + ucol_setOffset(strsrch->textIter, 1.3251 + offset + search->matchedLength, status); 1.3252 + } 1.3253 + } 1.3254 + else { 1.3255 + // for boundary check purposes. this will ensure that the 1.3256 + // next match will not preceed the current offset 1.3257 + // note search->matchedIndex will always be set to something 1.3258 + // in the code 1.3259 + search->matchedIndex = offset - 1; 1.3260 + } 1.3261 + 1.3262 + if (search->isCanonicalMatch) { 1.3263 + // can't use exact here since extra accents are allowed. 1.3264 + usearch_handleNextCanonical(strsrch, status); 1.3265 + } 1.3266 + else { 1.3267 + usearch_handleNextExact(strsrch, status); 1.3268 + } 1.3269 + } 1.3270 + 1.3271 + if (U_FAILURE(*status)) { 1.3272 + return USEARCH_DONE; 1.3273 + } 1.3274 + 1.3275 +#if !BOYER_MOORE 1.3276 + if (search->matchedIndex == USEARCH_DONE) { 1.3277 + ucol_setOffset(strsrch->textIter, search->textLength, status); 1.3278 + } else { 1.3279 + ucol_setOffset(strsrch->textIter, search->matchedIndex, status); 1.3280 + } 1.3281 +#endif 1.3282 + 1.3283 + return search->matchedIndex; 1.3284 + } 1.3285 + } 1.3286 + return USEARCH_DONE; 1.3287 +} 1.3288 + 1.3289 +U_CAPI int32_t U_EXPORT2 usearch_previous(UStringSearch *strsrch, 1.3290 + UErrorCode *status) 1.3291 +{ 1.3292 + if (U_SUCCESS(*status) && strsrch) { 1.3293 + int32_t offset; 1.3294 + USearch *search = strsrch->search; 1.3295 + if (search->reset) { 1.3296 + offset = search->textLength; 1.3297 + search->isForwardSearching = FALSE; 1.3298 + search->reset = FALSE; 1.3299 + setColEIterOffset(strsrch->textIter, offset); 1.3300 + } 1.3301 + else { 1.3302 + offset = usearch_getOffset(strsrch); 1.3303 + } 1.3304 + 1.3305 + int32_t matchedindex = search->matchedIndex; 1.3306 + if (search->isForwardSearching == TRUE) { 1.3307 + // switching direction. 1.3308 + // if matchedIndex == USEARCH_DONE, it means that either a 1.3309 + // setOffset has been called or that next ran off the text 1.3310 + // string. the iterator would have been set to offset textLength if 1.3311 + // a match is not found. 1.3312 + search->isForwardSearching = FALSE; 1.3313 + if (matchedindex != USEARCH_DONE) { 1.3314 + return matchedindex; 1.3315 + } 1.3316 + } 1.3317 + else { 1.3318 +#if BOYER_MOORE 1.3319 + if (offset == 0 || matchedindex == 0 || 1.3320 + (!search->isOverlap && 1.3321 + (offset < strsrch->pattern.defaultShiftSize || 1.3322 + (matchedindex != USEARCH_DONE && 1.3323 + matchedindex < strsrch->pattern.defaultShiftSize)))) { 1.3324 + // not enough characters to match 1.3325 + setMatchNotFound(strsrch); 1.3326 + return USEARCH_DONE; 1.3327 + } 1.3328 +#else 1.3329 + // Could check pattern length, but the 1.3330 + // linear search will do the right thing 1.3331 + if (offset == 0 || matchedindex == 0) { 1.3332 + setMatchNotFound(strsrch); 1.3333 + return USEARCH_DONE; 1.3334 + } 1.3335 +#endif 1.3336 + } 1.3337 + 1.3338 + if (U_SUCCESS(*status)) { 1.3339 + if (strsrch->pattern.CELength == 0) { 1.3340 + search->matchedIndex = 1.3341 + (matchedindex == USEARCH_DONE ? offset : matchedindex); 1.3342 + if (search->matchedIndex == 0) { 1.3343 + setMatchNotFound(strsrch); 1.3344 + // status checked below 1.3345 + } 1.3346 + else { // move by codepoints 1.3347 + U16_BACK_1(search->text, 0, search->matchedIndex); 1.3348 + setColEIterOffset(strsrch->textIter, search->matchedIndex); 1.3349 + // status checked below 1.3350 + search->matchedLength = 0; 1.3351 + } 1.3352 + } 1.3353 + else { 1.3354 + if (strsrch->search->isCanonicalMatch) { 1.3355 + // can't use exact here since extra accents are allowed. 1.3356 + usearch_handlePreviousCanonical(strsrch, status); 1.3357 + // status checked below 1.3358 + } 1.3359 + else { 1.3360 + usearch_handlePreviousExact(strsrch, status); 1.3361 + // status checked below 1.3362 + } 1.3363 + } 1.3364 + 1.3365 + if (U_FAILURE(*status)) { 1.3366 + return USEARCH_DONE; 1.3367 + } 1.3368 + 1.3369 + return search->matchedIndex; 1.3370 + } 1.3371 + } 1.3372 + return USEARCH_DONE; 1.3373 +} 1.3374 + 1.3375 + 1.3376 + 1.3377 +U_CAPI void U_EXPORT2 usearch_reset(UStringSearch *strsrch) 1.3378 +{ 1.3379 + /* 1.3380 + reset is setting the attributes that are already in 1.3381 + string search, hence all attributes in the collator should 1.3382 + be retrieved without any problems 1.3383 + */ 1.3384 + if (strsrch) { 1.3385 + UErrorCode status = U_ZERO_ERROR; 1.3386 + UBool sameCollAttribute = TRUE; 1.3387 + uint32_t ceMask; 1.3388 + UBool shift; 1.3389 + uint32_t varTop; 1.3390 + 1.3391 + // **** hack to deal w/ how processed CEs encode quaternary **** 1.3392 + UCollationStrength newStrength = ucol_getStrength(strsrch->collator); 1.3393 + if ((strsrch->strength < UCOL_QUATERNARY && newStrength >= UCOL_QUATERNARY) || 1.3394 + (strsrch->strength >= UCOL_QUATERNARY && newStrength < UCOL_QUATERNARY)) { 1.3395 + sameCollAttribute = FALSE; 1.3396 + } 1.3397 + 1.3398 + strsrch->strength = ucol_getStrength(strsrch->collator); 1.3399 + ceMask = getMask(strsrch->strength); 1.3400 + if (strsrch->ceMask != ceMask) { 1.3401 + strsrch->ceMask = ceMask; 1.3402 + sameCollAttribute = FALSE; 1.3403 + } 1.3404 + 1.3405 + // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT 1.3406 + shift = ucol_getAttribute(strsrch->collator, UCOL_ALTERNATE_HANDLING, 1.3407 + &status) == UCOL_SHIFTED; 1.3408 + if (strsrch->toShift != shift) { 1.3409 + strsrch->toShift = shift; 1.3410 + sameCollAttribute = FALSE; 1.3411 + } 1.3412 + 1.3413 + // if status is a failure, ucol_getVariableTop returns 0 1.3414 + varTop = ucol_getVariableTop(strsrch->collator, &status); 1.3415 + if (strsrch->variableTop != varTop) { 1.3416 + strsrch->variableTop = varTop; 1.3417 + sameCollAttribute = FALSE; 1.3418 + } 1.3419 + if (!sameCollAttribute) { 1.3420 + initialize(strsrch, &status); 1.3421 + } 1.3422 + /* free offset buffer to avoid memory leak before initializing. */ 1.3423 + ucol_freeOffsetBuffer(&(strsrch->textIter->iteratordata_)); 1.3424 + uprv_init_collIterate(strsrch->collator, strsrch->search->text, 1.3425 + strsrch->search->textLength, 1.3426 + &(strsrch->textIter->iteratordata_), 1.3427 + &status); 1.3428 + strsrch->search->matchedLength = 0; 1.3429 + strsrch->search->matchedIndex = USEARCH_DONE; 1.3430 + strsrch->search->isOverlap = FALSE; 1.3431 + strsrch->search->isCanonicalMatch = FALSE; 1.3432 + strsrch->search->elementComparisonType = 0; 1.3433 + strsrch->search->isForwardSearching = TRUE; 1.3434 + strsrch->search->reset = TRUE; 1.3435 + } 1.3436 +} 1.3437 + 1.3438 +// 1.3439 +// CEI Collation Element + source text index. 1.3440 +// These structs are kept in the circular buffer. 1.3441 +// 1.3442 +struct CEI { 1.3443 + int64_t ce; 1.3444 + int32_t lowIndex; 1.3445 + int32_t highIndex; 1.3446 +}; 1.3447 + 1.3448 +U_NAMESPACE_BEGIN 1.3449 + 1.3450 + 1.3451 +// 1.3452 +// CEBuffer A circular buffer of CEs from the text being searched. 1.3453 +// 1.3454 +#define DEFAULT_CEBUFFER_SIZE 96 1.3455 +#define CEBUFFER_EXTRA 32 1.3456 +// Some typical max values to make buffer size more reasonable for asymmetric search. 1.3457 +// #8694 is for a better long-term solution to allocation of this buffer. 1.3458 +#define MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L 8 1.3459 +#define MAX_TARGET_IGNORABLES_PER_PAT_OTHER 3 1.3460 +#define MIGHT_BE_JAMO_L(c) ((c >= 0x1100 && c <= 0x115E) || (c >= 0x3131 && c <= 0x314E) || (c >= 0x3165 && c <= 0x3186)) 1.3461 +struct CEBuffer { 1.3462 + CEI defBuf[DEFAULT_CEBUFFER_SIZE]; 1.3463 + CEI *buf; 1.3464 + int32_t bufSize; 1.3465 + int32_t firstIx; 1.3466 + int32_t limitIx; 1.3467 + UCollationElements *ceIter; 1.3468 + UStringSearch *strSearch; 1.3469 + 1.3470 + 1.3471 + 1.3472 + CEBuffer(UStringSearch *ss, UErrorCode *status); 1.3473 + ~CEBuffer(); 1.3474 + const CEI *get(int32_t index); 1.3475 + const CEI *getPrevious(int32_t index); 1.3476 +}; 1.3477 + 1.3478 + 1.3479 +CEBuffer::CEBuffer(UStringSearch *ss, UErrorCode *status) { 1.3480 + buf = defBuf; 1.3481 + strSearch = ss; 1.3482 + bufSize = ss->pattern.PCELength + CEBUFFER_EXTRA; 1.3483 + if (ss->search->elementComparisonType != 0) { 1.3484 + const UChar * patText = ss->pattern.text; 1.3485 + if (patText) { 1.3486 + const UChar * patTextLimit = patText + ss->pattern.textLength; 1.3487 + while ( patText < patTextLimit ) { 1.3488 + UChar c = *patText++; 1.3489 + if (MIGHT_BE_JAMO_L(c)) { 1.3490 + bufSize += MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L; 1.3491 + } else { 1.3492 + // No check for surrogates, we might allocate slightly more buffer than necessary. 1.3493 + bufSize += MAX_TARGET_IGNORABLES_PER_PAT_OTHER; 1.3494 + } 1.3495 + } 1.3496 + } 1.3497 + } 1.3498 + ceIter = ss->textIter; 1.3499 + firstIx = 0; 1.3500 + limitIx = 0; 1.3501 + 1.3502 + uprv_init_pce(ceIter); 1.3503 + 1.3504 + if (bufSize>DEFAULT_CEBUFFER_SIZE) { 1.3505 + buf = (CEI *)uprv_malloc(bufSize * sizeof(CEI)); 1.3506 + if (buf == NULL) { 1.3507 + *status = U_MEMORY_ALLOCATION_ERROR; 1.3508 + } 1.3509 + } 1.3510 +} 1.3511 + 1.3512 +// TODO: add a reset or init function so that allocated 1.3513 +// buffers can be retained & reused. 1.3514 + 1.3515 +CEBuffer::~CEBuffer() { 1.3516 + if (buf != defBuf) { 1.3517 + uprv_free(buf); 1.3518 + } 1.3519 +} 1.3520 + 1.3521 + 1.3522 +// Get the CE with the specified index. 1.3523 +// Index must be in the range 1.3524 +// n-history_size < index < n+1 1.3525 +// where n is the largest index to have been fetched by some previous call to this function. 1.3526 +// The CE value will be UCOL__PROCESSED_NULLORDER at end of input. 1.3527 +// 1.3528 +const CEI *CEBuffer::get(int32_t index) { 1.3529 + int i = index % bufSize; 1.3530 + 1.3531 + if (index>=firstIx && index<limitIx) { 1.3532 + // The request was for an entry already in our buffer. 1.3533 + // Just return it. 1.3534 + return &buf[i]; 1.3535 + } 1.3536 + 1.3537 + // Caller is requesting a new, never accessed before, CE. 1.3538 + // Verify that it is the next one in sequence, which is all 1.3539 + // that is allowed. 1.3540 + if (index != limitIx) { 1.3541 + U_ASSERT(FALSE); 1.3542 + 1.3543 + return NULL; 1.3544 + } 1.3545 + 1.3546 + // Manage the circular CE buffer indexing 1.3547 + limitIx++; 1.3548 + 1.3549 + if (limitIx - firstIx >= bufSize) { 1.3550 + // The buffer is full, knock out the lowest-indexed entry. 1.3551 + firstIx++; 1.3552 + } 1.3553 + 1.3554 + UErrorCode status = U_ZERO_ERROR; 1.3555 + 1.3556 + buf[i].ce = ucol_nextProcessed(ceIter, &buf[i].lowIndex, &buf[i].highIndex, &status); 1.3557 + 1.3558 + return &buf[i]; 1.3559 +} 1.3560 + 1.3561 +// Get the CE with the specified index. 1.3562 +// Index must be in the range 1.3563 +// n-history_size < index < n+1 1.3564 +// where n is the largest index to have been fetched by some previous call to this function. 1.3565 +// The CE value will be UCOL__PROCESSED_NULLORDER at end of input. 1.3566 +// 1.3567 +const CEI *CEBuffer::getPrevious(int32_t index) { 1.3568 + int i = index % bufSize; 1.3569 + 1.3570 + if (index>=firstIx && index<limitIx) { 1.3571 + // The request was for an entry already in our buffer. 1.3572 + // Just return it. 1.3573 + return &buf[i]; 1.3574 + } 1.3575 + 1.3576 + // Caller is requesting a new, never accessed before, CE. 1.3577 + // Verify that it is the next one in sequence, which is all 1.3578 + // that is allowed. 1.3579 + if (index != limitIx) { 1.3580 + U_ASSERT(FALSE); 1.3581 + 1.3582 + return NULL; 1.3583 + } 1.3584 + 1.3585 + // Manage the circular CE buffer indexing 1.3586 + limitIx++; 1.3587 + 1.3588 + if (limitIx - firstIx >= bufSize) { 1.3589 + // The buffer is full, knock out the lowest-indexed entry. 1.3590 + firstIx++; 1.3591 + } 1.3592 + 1.3593 + UErrorCode status = U_ZERO_ERROR; 1.3594 + 1.3595 + buf[i].ce = ucol_previousProcessed(ceIter, &buf[i].lowIndex, &buf[i].highIndex, &status); 1.3596 + 1.3597 + return &buf[i]; 1.3598 +} 1.3599 + 1.3600 +U_NAMESPACE_END 1.3601 + 1.3602 + 1.3603 +// #define USEARCH_DEBUG 1.3604 + 1.3605 +#ifdef USEARCH_DEBUG 1.3606 +#include <stdio.h> 1.3607 +#include <stdlib.h> 1.3608 +#endif 1.3609 + 1.3610 +/* 1.3611 + * Find the next break boundary after startIndex. If the UStringSearch object 1.3612 + * has an external break iterator, use that. Otherwise use the internal character 1.3613 + * break iterator. 1.3614 + */ 1.3615 +static int32_t nextBoundaryAfter(UStringSearch *strsrch, int32_t startIndex) { 1.3616 +#if 0 1.3617 + const UChar *text = strsrch->search->text; 1.3618 + int32_t textLen = strsrch->search->textLength; 1.3619 + 1.3620 + U_ASSERT(startIndex>=0); 1.3621 + U_ASSERT(startIndex<=textLen); 1.3622 + 1.3623 + if (startIndex >= textLen) { 1.3624 + return startIndex; 1.3625 + } 1.3626 + 1.3627 + UChar32 c; 1.3628 + int32_t i = startIndex; 1.3629 + U16_NEXT(text, i, textLen, c); 1.3630 + 1.3631 + // If we are on a control character, stop without looking for combining marks. 1.3632 + // Control characters do not combine. 1.3633 + int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK); 1.3634 + if (gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR) { 1.3635 + return i; 1.3636 + } 1.3637 + 1.3638 + // The initial character was not a control, and can thus accept trailing 1.3639 + // combining characters. Advance over however many of them there are. 1.3640 + int32_t indexOfLastCharChecked; 1.3641 + for (;;) { 1.3642 + indexOfLastCharChecked = i; 1.3643 + if (i>=textLen) { 1.3644 + break; 1.3645 + } 1.3646 + U16_NEXT(text, i, textLen, c); 1.3647 + gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK); 1.3648 + if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) { 1.3649 + break; 1.3650 + } 1.3651 + } 1.3652 + return indexOfLastCharChecked; 1.3653 +#elif !UCONFIG_NO_BREAK_ITERATION 1.3654 + UBreakIterator *breakiterator = strsrch->search->breakIter; 1.3655 + 1.3656 + if (breakiterator == NULL) { 1.3657 + breakiterator = strsrch->search->internalBreakIter; 1.3658 + } 1.3659 + 1.3660 + if (breakiterator != NULL) { 1.3661 + return ubrk_following(breakiterator, startIndex); 1.3662 + } 1.3663 + 1.3664 + return startIndex; 1.3665 +#else 1.3666 + // **** or should we use the original code? **** 1.3667 + return startIndex; 1.3668 +#endif 1.3669 + 1.3670 +} 1.3671 + 1.3672 +/* 1.3673 + * Returns TRUE if index is on a break boundary. If the UStringSearch 1.3674 + * has an external break iterator, test using that, otherwise test 1.3675 + * using the internal character break iterator. 1.3676 + */ 1.3677 +static UBool isBreakBoundary(UStringSearch *strsrch, int32_t index) { 1.3678 +#if 0 1.3679 + const UChar *text = strsrch->search->text; 1.3680 + int32_t textLen = strsrch->search->textLength; 1.3681 + 1.3682 + U_ASSERT(index>=0); 1.3683 + U_ASSERT(index<=textLen); 1.3684 + 1.3685 + if (index>=textLen || index<=0) { 1.3686 + return TRUE; 1.3687 + } 1.3688 + 1.3689 + // If the character at the current index is not a GRAPHEME_EXTEND 1.3690 + // then we can not be within a combining sequence. 1.3691 + UChar32 c; 1.3692 + U16_GET(text, 0, index, textLen, c); 1.3693 + int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK); 1.3694 + if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) { 1.3695 + return TRUE; 1.3696 + } 1.3697 + 1.3698 + // We are at a combining mark. If the preceding character is anything 1.3699 + // except a CONTROL, CR or LF, we are in a combining sequence. 1.3700 + U16_PREV(text, 0, index, c); 1.3701 + gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK); 1.3702 + UBool combining = !(gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR); 1.3703 + return !combining; 1.3704 +#elif !UCONFIG_NO_BREAK_ITERATION 1.3705 + UBreakIterator *breakiterator = strsrch->search->breakIter; 1.3706 + 1.3707 + if (breakiterator == NULL) { 1.3708 + breakiterator = strsrch->search->internalBreakIter; 1.3709 + } 1.3710 + 1.3711 + return (breakiterator != NULL && ubrk_isBoundary(breakiterator, index)); 1.3712 +#else 1.3713 + // **** or use the original code? **** 1.3714 + return TRUE; 1.3715 +#endif 1.3716 +} 1.3717 + 1.3718 +#if 0 1.3719 +static UBool onBreakBoundaries(const UStringSearch *strsrch, int32_t start, int32_t end) 1.3720 +{ 1.3721 +#if !UCONFIG_NO_BREAK_ITERATION 1.3722 + UBreakIterator *breakiterator = strsrch->search->breakIter; 1.3723 + 1.3724 + if (breakiterator != NULL) { 1.3725 + int32_t startindex = ubrk_first(breakiterator); 1.3726 + int32_t endindex = ubrk_last(breakiterator); 1.3727 + 1.3728 + // out-of-range indexes are never boundary positions 1.3729 + if (start < startindex || start > endindex || 1.3730 + end < startindex || end > endindex) { 1.3731 + return FALSE; 1.3732 + } 1.3733 + 1.3734 + return ubrk_isBoundary(breakiterator, start) && 1.3735 + ubrk_isBoundary(breakiterator, end); 1.3736 + } 1.3737 +#endif 1.3738 + 1.3739 + return TRUE; 1.3740 +} 1.3741 +#endif 1.3742 + 1.3743 +typedef enum { 1.3744 + U_CE_MATCH = -1, 1.3745 + U_CE_NO_MATCH = 0, 1.3746 + U_CE_SKIP_TARG, 1.3747 + U_CE_SKIP_PATN 1.3748 +} UCompareCEsResult; 1.3749 +#define U_CE_LEVEL2_BASE 0x00000005 1.3750 +#define U_CE_LEVEL3_BASE 0x00050000 1.3751 + 1.3752 +static UCompareCEsResult compareCE64s(int64_t targCE, int64_t patCE, int16_t compareType) { 1.3753 + if (targCE == patCE) { 1.3754 + return U_CE_MATCH; 1.3755 + } 1.3756 + if (compareType == 0) { 1.3757 + return U_CE_NO_MATCH; 1.3758 + } 1.3759 + 1.3760 + int64_t targCEshifted = targCE >> 32; 1.3761 + int64_t patCEshifted = patCE >> 32; 1.3762 + int64_t mask; 1.3763 + 1.3764 + mask = 0xFFFF0000; 1.3765 + int32_t targLev1 = (int32_t)(targCEshifted & mask); 1.3766 + int32_t patLev1 = (int32_t)(patCEshifted & mask); 1.3767 + if ( targLev1 != patLev1 ) { 1.3768 + if ( targLev1 == 0 ) { 1.3769 + return U_CE_SKIP_TARG; 1.3770 + } 1.3771 + if ( patLev1 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) { 1.3772 + return U_CE_SKIP_PATN; 1.3773 + } 1.3774 + return U_CE_NO_MATCH; 1.3775 + } 1.3776 + 1.3777 + mask = 0x0000FFFF; 1.3778 + int32_t targLev2 = (int32_t)(targCEshifted & mask); 1.3779 + int32_t patLev2 = (int32_t)(patCEshifted & mask); 1.3780 + if ( targLev2 != patLev2 ) { 1.3781 + if ( targLev2 == 0 ) { 1.3782 + return U_CE_SKIP_TARG; 1.3783 + } 1.3784 + if ( patLev2 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) { 1.3785 + return U_CE_SKIP_PATN; 1.3786 + } 1.3787 + return (patLev2 == U_CE_LEVEL2_BASE || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev2 == U_CE_LEVEL2_BASE) )? 1.3788 + U_CE_MATCH: U_CE_NO_MATCH; 1.3789 + } 1.3790 + 1.3791 + mask = 0xFFFF0000; 1.3792 + int32_t targLev3 = (int32_t)(targCE & mask); 1.3793 + int32_t patLev3 = (int32_t)(patCE & mask); 1.3794 + if ( targLev3 != patLev3 ) { 1.3795 + return (patLev3 == U_CE_LEVEL3_BASE || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev3 == U_CE_LEVEL3_BASE) )? 1.3796 + U_CE_MATCH: U_CE_NO_MATCH; 1.3797 + } 1.3798 + 1.3799 + return U_CE_MATCH; 1.3800 +} 1.3801 + 1.3802 +#if BOYER_MOORE 1.3803 +// TODO: #if BOYER_MOORE, need 32-bit version of compareCE64s 1.3804 +#endif 1.3805 + 1.3806 +U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch, 1.3807 + int32_t startIdx, 1.3808 + int32_t *matchStart, 1.3809 + int32_t *matchLimit, 1.3810 + UErrorCode *status) 1.3811 +{ 1.3812 + if (U_FAILURE(*status)) { 1.3813 + return FALSE; 1.3814 + } 1.3815 + 1.3816 + // TODO: reject search patterns beginning with a combining char. 1.3817 + 1.3818 +#ifdef USEARCH_DEBUG 1.3819 + if (getenv("USEARCH_DEBUG") != NULL) { 1.3820 + printf("Pattern CEs\n"); 1.3821 + for (int ii=0; ii<strsrch->pattern.CELength; ii++) { 1.3822 + printf(" %8x", strsrch->pattern.CE[ii]); 1.3823 + } 1.3824 + printf("\n"); 1.3825 + } 1.3826 + 1.3827 +#endif 1.3828 + // Input parameter sanity check. 1.3829 + // TODO: should input indicies clip to the text length 1.3830 + // in the same way that UText does. 1.3831 + if(strsrch->pattern.CELength == 0 || 1.3832 + startIdx < 0 || 1.3833 + startIdx > strsrch->search->textLength || 1.3834 + strsrch->pattern.CE == NULL) { 1.3835 + *status = U_ILLEGAL_ARGUMENT_ERROR; 1.3836 + return FALSE; 1.3837 + } 1.3838 + 1.3839 + if (strsrch->pattern.PCE == NULL) { 1.3840 + initializePatternPCETable(strsrch, status); 1.3841 + } 1.3842 + 1.3843 + ucol_setOffset(strsrch->textIter, startIdx, status); 1.3844 + CEBuffer ceb(strsrch, status); 1.3845 + 1.3846 + 1.3847 + int32_t targetIx = 0; 1.3848 + const CEI *targetCEI = NULL; 1.3849 + int32_t patIx; 1.3850 + UBool found; 1.3851 + 1.3852 + int32_t mStart = -1; 1.3853 + int32_t mLimit = -1; 1.3854 + int32_t minLimit; 1.3855 + int32_t maxLimit; 1.3856 + 1.3857 + 1.3858 + 1.3859 + // Outer loop moves over match starting positions in the 1.3860 + // target CE space. 1.3861 + // Here we see the target as a sequence of collation elements, resulting from the following: 1.3862 + // 1. Target characters were decomposed, and (if appropriate) other compressions and expansions are applied 1.3863 + // (for example, digraphs such as IJ may be broken into two characters). 1.3864 + // 2. An int64_t CE weight is determined for each resulting unit (high 16 bits are primary strength, next 1.3865 + // 16 bits are secondary, next 16 (the high 16 bits of the low 32-bit half) are tertiary. Any of these 1.3866 + // fields that are for strengths below that of the collator are set to 0. If this makes the int64_t 1.3867 + // CE weight 0 (as for a combining diacritic with secondary weight when the collator strentgh is primary), 1.3868 + // then the CE is deleted, so the following code sees only CEs that are relevant. 1.3869 + // For each CE, the lowIndex and highIndex correspond to where this CE begins and ends in the original text. 1.3870 + // If lowIndex==highIndex, either the CE resulted from an expansion/decomposition of one of the original text 1.3871 + // characters, or the CE marks the limit of the target text (in which case the CE weight is UCOL_PROCESSED_NULLORDER). 1.3872 + // 1.3873 + for(targetIx=0; ; targetIx++) 1.3874 + { 1.3875 + found = TRUE; 1.3876 + // Inner loop checks for a match beginning at each 1.3877 + // position from the outer loop. 1.3878 + int32_t targetIxOffset = 0; 1.3879 + int64_t patCE = 0; 1.3880 + // For targetIx > 0, this ceb.get gets a CE that is as far back in the ring buffer 1.3881 + // (compared to the last CE fetched for the previous targetIx value) as we need to go 1.3882 + // for this targetIx value, so if it is non-NULL then other ceb.get calls should be OK. 1.3883 + const CEI *firstCEI = ceb.get(targetIx); 1.3884 + if (firstCEI == NULL) { 1.3885 + *status = U_INTERNAL_PROGRAM_ERROR; 1.3886 + found = FALSE; 1.3887 + break; 1.3888 + } 1.3889 + 1.3890 + for (patIx=0; patIx<strsrch->pattern.PCELength; patIx++) { 1.3891 + patCE = strsrch->pattern.PCE[patIx]; 1.3892 + targetCEI = ceb.get(targetIx+patIx+targetIxOffset); 1.3893 + // Compare CE from target string with CE from the pattern. 1.3894 + // Note that the target CE will be UCOL_PROCESSED_NULLORDER if we reach the end of input, 1.3895 + // which will fail the compare, below. 1.3896 + UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType); 1.3897 + if ( ceMatch == U_CE_NO_MATCH ) { 1.3898 + found = FALSE; 1.3899 + break; 1.3900 + } else if ( ceMatch > U_CE_NO_MATCH ) { 1.3901 + if ( ceMatch == U_CE_SKIP_TARG ) { 1.3902 + // redo with same patCE, next targCE 1.3903 + patIx--; 1.3904 + targetIxOffset++; 1.3905 + } else { // ceMatch == U_CE_SKIP_PATN 1.3906 + // redo with same targCE, next patCE 1.3907 + targetIxOffset--; 1.3908 + } 1.3909 + } 1.3910 + } 1.3911 + targetIxOffset += strsrch->pattern.PCELength; // this is now the offset in target CE space to end of the match so far 1.3912 + 1.3913 + if (!found && ((targetCEI == NULL) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) { 1.3914 + // No match at this targetIx. Try again at the next. 1.3915 + continue; 1.3916 + } 1.3917 + 1.3918 + if (!found) { 1.3919 + // No match at all, we have run off the end of the target text. 1.3920 + break; 1.3921 + } 1.3922 + 1.3923 + 1.3924 + // We have found a match in CE space. 1.3925 + // Now determine the bounds in string index space. 1.3926 + // There still is a chance of match failure if the CE range not correspond to 1.3927 + // an acceptable character range. 1.3928 + // 1.3929 + const CEI *lastCEI = ceb.get(targetIx + targetIxOffset - 1); 1.3930 + 1.3931 + mStart = firstCEI->lowIndex; 1.3932 + minLimit = lastCEI->lowIndex; 1.3933 + 1.3934 + // Look at the CE following the match. If it is UCOL_NULLORDER the match 1.3935 + // extended to the end of input, and the match is good. 1.3936 + 1.3937 + // Look at the high and low indices of the CE following the match. If 1.3938 + // they are the same it means one of two things: 1.3939 + // 1. The match extended to the last CE from the target text, which is OK, or 1.3940 + // 2. The last CE that was part of the match is in an expansion that extends 1.3941 + // to the first CE after the match. In this case, we reject the match. 1.3942 + const CEI *nextCEI = 0; 1.3943 + if (strsrch->search->elementComparisonType == 0) { 1.3944 + nextCEI = ceb.get(targetIx + targetIxOffset); 1.3945 + maxLimit = nextCEI->lowIndex; 1.3946 + if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) { 1.3947 + found = FALSE; 1.3948 + } 1.3949 + } else { 1.3950 + for ( ; ; ++targetIxOffset ) { 1.3951 + nextCEI = ceb.get(targetIx + targetIxOffset); 1.3952 + maxLimit = nextCEI->lowIndex; 1.3953 + // If we are at the end of the target too, match succeeds 1.3954 + if ( nextCEI->ce == UCOL_PROCESSED_NULLORDER ) { 1.3955 + break; 1.3956 + } 1.3957 + // As long as the next CE has primary weight of 0, 1.3958 + // it is part of the last target element matched by the pattern; 1.3959 + // make sure it can be part of a match with the last patCE 1.3960 + if ( (((nextCEI->ce) >> 32) & 0xFFFF0000UL) == 0 ) { 1.3961 + UCompareCEsResult ceMatch = compareCE64s(nextCEI->ce, patCE, strsrch->search->elementComparisonType); 1.3962 + if ( ceMatch == U_CE_NO_MATCH || ceMatch == U_CE_SKIP_PATN ) { 1.3963 + found = FALSE; 1.3964 + break; 1.3965 + } 1.3966 + // If lowIndex == highIndex, this target CE is part of an expansion of the last matched 1.3967 + // target element, but it has non-zero primary weight => match fails 1.3968 + } else if ( nextCEI->lowIndex == nextCEI->highIndex ) { 1.3969 + found = false; 1.3970 + break; 1.3971 + // Else the target CE is not part of an expansion of the last matched element, match succeeds 1.3972 + } else { 1.3973 + break; 1.3974 + } 1.3975 + } 1.3976 + } 1.3977 + 1.3978 + 1.3979 + // Check for the start of the match being within a combining sequence. 1.3980 + // This can happen if the pattern itself begins with a combining char, and 1.3981 + // the match found combining marks in the target text that were attached 1.3982 + // to something else. 1.3983 + // This type of match should be rejected for not completely consuming a 1.3984 + // combining sequence. 1.3985 + if (!isBreakBoundary(strsrch, mStart)) { 1.3986 + found = FALSE; 1.3987 + } 1.3988 + 1.3989 + // Check for the start of the match being within an Collation Element Expansion, 1.3990 + // meaning that the first char of the match is only partially matched. 1.3991 + // With exapnsions, the first CE will report the index of the source 1.3992 + // character, and all subsequent (expansions) CEs will report the source index of the 1.3993 + // _following_ character. 1.3994 + int32_t secondIx = firstCEI->highIndex; 1.3995 + if (mStart == secondIx) { 1.3996 + found = FALSE; 1.3997 + } 1.3998 + 1.3999 + // Advance the match end position to the first acceptable match boundary. 1.4000 + // This advances the index over any combining charcters. 1.4001 + mLimit = maxLimit; 1.4002 + if (minLimit < maxLimit) { 1.4003 + // When the last CE's low index is same with its high index, the CE is likely 1.4004 + // a part of expansion. In this case, the index is located just after the 1.4005 + // character corresponding to the CEs compared above. If the index is right 1.4006 + // at the break boundary, move the position to the next boundary will result 1.4007 + // incorrect match length when there are ignorable characters exist between 1.4008 + // the position and the next character produces CE(s). See ticket#8482. 1.4009 + if (minLimit == lastCEI->highIndex && isBreakBoundary(strsrch, minLimit)) { 1.4010 + mLimit = minLimit; 1.4011 + } else { 1.4012 + int32_t nba = nextBoundaryAfter(strsrch, minLimit); 1.4013 + if (nba >= lastCEI->highIndex) { 1.4014 + mLimit = nba; 1.4015 + } 1.4016 + } 1.4017 + } 1.4018 + 1.4019 + #ifdef USEARCH_DEBUG 1.4020 + if (getenv("USEARCH_DEBUG") != NULL) { 1.4021 + printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit); 1.4022 + } 1.4023 + #endif 1.4024 + 1.4025 + // If advancing to the end of a combining sequence in character indexing space 1.4026 + // advanced us beyond the end of the match in CE space, reject this match. 1.4027 + if (mLimit > maxLimit) { 1.4028 + found = FALSE; 1.4029 + } 1.4030 + 1.4031 + if (!isBreakBoundary(strsrch, mLimit)) { 1.4032 + found = FALSE; 1.4033 + } 1.4034 + 1.4035 + if (! checkIdentical(strsrch, mStart, mLimit)) { 1.4036 + found = FALSE; 1.4037 + } 1.4038 + 1.4039 + if (found) { 1.4040 + break; 1.4041 + } 1.4042 + } 1.4043 + 1.4044 + #ifdef USEARCH_DEBUG 1.4045 + if (getenv("USEARCH_DEBUG") != NULL) { 1.4046 + printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx); 1.4047 + int32_t lastToPrint = ceb.limitIx+2; 1.4048 + for (int ii=ceb.firstIx; ii<lastToPrint; ii++) { 1.4049 + printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex); 1.4050 + } 1.4051 + printf("\n%s\n", found? "match found" : "no match"); 1.4052 + } 1.4053 + #endif 1.4054 + 1.4055 + // All Done. Store back the match bounds to the caller. 1.4056 + // 1.4057 + if (found==FALSE) { 1.4058 + mLimit = -1; 1.4059 + mStart = -1; 1.4060 + } 1.4061 + 1.4062 + if (matchStart != NULL) { 1.4063 + *matchStart= mStart; 1.4064 + } 1.4065 + 1.4066 + if (matchLimit != NULL) { 1.4067 + *matchLimit = mLimit; 1.4068 + } 1.4069 + 1.4070 + return found; 1.4071 +} 1.4072 + 1.4073 +U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch *strsrch, 1.4074 + int32_t startIdx, 1.4075 + int32_t *matchStart, 1.4076 + int32_t *matchLimit, 1.4077 + UErrorCode *status) 1.4078 +{ 1.4079 + if (U_FAILURE(*status)) { 1.4080 + return FALSE; 1.4081 + } 1.4082 + 1.4083 + // TODO: reject search patterns beginning with a combining char. 1.4084 + 1.4085 +#ifdef USEARCH_DEBUG 1.4086 + if (getenv("USEARCH_DEBUG") != NULL) { 1.4087 + printf("Pattern CEs\n"); 1.4088 + for (int ii=0; ii<strsrch->pattern.CELength; ii++) { 1.4089 + printf(" %8x", strsrch->pattern.CE[ii]); 1.4090 + } 1.4091 + printf("\n"); 1.4092 + } 1.4093 + 1.4094 +#endif 1.4095 + // Input parameter sanity check. 1.4096 + // TODO: should input indicies clip to the text length 1.4097 + // in the same way that UText does. 1.4098 + if(strsrch->pattern.CELength == 0 || 1.4099 + startIdx < 0 || 1.4100 + startIdx > strsrch->search->textLength || 1.4101 + strsrch->pattern.CE == NULL) { 1.4102 + *status = U_ILLEGAL_ARGUMENT_ERROR; 1.4103 + return FALSE; 1.4104 + } 1.4105 + 1.4106 + if (strsrch->pattern.PCE == NULL) { 1.4107 + initializePatternPCETable(strsrch, status); 1.4108 + } 1.4109 + 1.4110 + CEBuffer ceb(strsrch, status); 1.4111 + int32_t targetIx = 0; 1.4112 + 1.4113 + /* 1.4114 + * Pre-load the buffer with the CE's for the grapheme 1.4115 + * after our starting position so that we're sure that 1.4116 + * we can look at the CE following the match when we 1.4117 + * check the match boundaries. 1.4118 + * 1.4119 + * This will also pre-fetch the first CE that we'll 1.4120 + * consider for the match. 1.4121 + */ 1.4122 + if (startIdx < strsrch->search->textLength) { 1.4123 + UBreakIterator *bi = strsrch->search->internalBreakIter; 1.4124 + int32_t next = ubrk_following(bi, startIdx); 1.4125 + 1.4126 + ucol_setOffset(strsrch->textIter, next, status); 1.4127 + 1.4128 + for (targetIx = 0; ; targetIx += 1) { 1.4129 + if (ceb.getPrevious(targetIx)->lowIndex < startIdx) { 1.4130 + break; 1.4131 + } 1.4132 + } 1.4133 + } else { 1.4134 + ucol_setOffset(strsrch->textIter, startIdx, status); 1.4135 + } 1.4136 + 1.4137 + 1.4138 + const CEI *targetCEI = NULL; 1.4139 + int32_t patIx; 1.4140 + UBool found; 1.4141 + 1.4142 + int32_t limitIx = targetIx; 1.4143 + int32_t mStart = -1; 1.4144 + int32_t mLimit = -1; 1.4145 + int32_t minLimit; 1.4146 + int32_t maxLimit; 1.4147 + 1.4148 + 1.4149 + 1.4150 + // Outer loop moves over match starting positions in the 1.4151 + // target CE space. 1.4152 + // Here, targetIx values increase toward the beginning of the base text (i.e. we get the text CEs in reverse order). 1.4153 + // But patIx is 0 at the beginning of the pattern and increases toward the end. 1.4154 + // So this loop performs a comparison starting with the end of pattern, and prcessd toward the beginning of the pattern 1.4155 + // and the beginning of the base text. 1.4156 + for(targetIx = limitIx; ; targetIx += 1) 1.4157 + { 1.4158 + found = TRUE; 1.4159 + // For targetIx > limitIx, this ceb.getPrevious gets a CE that is as far back in the ring buffer 1.4160 + // (compared to the last CE fetched for the previous targetIx value) as we need to go 1.4161 + // for this targetIx value, so if it is non-NULL then other ceb.getPrevious calls should be OK. 1.4162 + const CEI *lastCEI = ceb.getPrevious(targetIx); 1.4163 + if (lastCEI == NULL) { 1.4164 + *status = U_INTERNAL_PROGRAM_ERROR; 1.4165 + found = FALSE; 1.4166 + break; 1.4167 + } 1.4168 + // Inner loop checks for a match beginning at each 1.4169 + // position from the outer loop. 1.4170 + int32_t targetIxOffset = 0; 1.4171 + for (patIx = strsrch->pattern.PCELength - 1; patIx >= 0; patIx -= 1) { 1.4172 + int64_t patCE = strsrch->pattern.PCE[patIx]; 1.4173 + 1.4174 + targetCEI = ceb.getPrevious(targetIx + strsrch->pattern.PCELength - 1 - patIx + targetIxOffset); 1.4175 + // Compare CE from target string with CE from the pattern. 1.4176 + // Note that the target CE will be UCOL_NULLORDER if we reach the end of input, 1.4177 + // which will fail the compare, below. 1.4178 + UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType); 1.4179 + if ( ceMatch == U_CE_NO_MATCH ) { 1.4180 + found = FALSE; 1.4181 + break; 1.4182 + } else if ( ceMatch > U_CE_NO_MATCH ) { 1.4183 + if ( ceMatch == U_CE_SKIP_TARG ) { 1.4184 + // redo with same patCE, next targCE 1.4185 + patIx++; 1.4186 + targetIxOffset++; 1.4187 + } else { // ceMatch == U_CE_SKIP_PATN 1.4188 + // redo with same targCE, next patCE 1.4189 + targetIxOffset--; 1.4190 + } 1.4191 + } 1.4192 + } 1.4193 + 1.4194 + if (!found && ((targetCEI == NULL) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) { 1.4195 + // No match at this targetIx. Try again at the next. 1.4196 + continue; 1.4197 + } 1.4198 + 1.4199 + if (!found) { 1.4200 + // No match at all, we have run off the end of the target text. 1.4201 + break; 1.4202 + } 1.4203 + 1.4204 + 1.4205 + // We have found a match in CE space. 1.4206 + // Now determine the bounds in string index space. 1.4207 + // There still is a chance of match failure if the CE range not correspond to 1.4208 + // an acceptable character range. 1.4209 + // 1.4210 + const CEI *firstCEI = ceb.getPrevious(targetIx + strsrch->pattern.PCELength - 1 + targetIxOffset); 1.4211 + mStart = firstCEI->lowIndex; 1.4212 + 1.4213 + // Check for the start of the match being within a combining sequence. 1.4214 + // This can happen if the pattern itself begins with a combining char, and 1.4215 + // the match found combining marks in the target text that were attached 1.4216 + // to something else. 1.4217 + // This type of match should be rejected for not completely consuming a 1.4218 + // combining sequence. 1.4219 + if (!isBreakBoundary(strsrch, mStart)) { 1.4220 + found = FALSE; 1.4221 + } 1.4222 + 1.4223 + // Look at the high index of the first CE in the match. If it's the same as the 1.4224 + // low index, the first CE in the match is in the middle of an expansion. 1.4225 + if (mStart == firstCEI->highIndex) { 1.4226 + found = FALSE; 1.4227 + } 1.4228 + 1.4229 + 1.4230 + minLimit = lastCEI->lowIndex; 1.4231 + 1.4232 + if (targetIx > 0) { 1.4233 + // Look at the CE following the match. If it is UCOL_NULLORDER the match 1.4234 + // extended to the end of input, and the match is good. 1.4235 + 1.4236 + // Look at the high and low indices of the CE following the match. If 1.4237 + // they are the same it means one of two things: 1.4238 + // 1. The match extended to the last CE from the target text, which is OK, or 1.4239 + // 2. The last CE that was part of the match is in an expansion that extends 1.4240 + // to the first CE after the match. In this case, we reject the match. 1.4241 + const CEI *nextCEI = ceb.getPrevious(targetIx - 1); 1.4242 + 1.4243 + if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) { 1.4244 + found = FALSE; 1.4245 + } 1.4246 + 1.4247 + mLimit = maxLimit = nextCEI->lowIndex; 1.4248 + 1.4249 + // Advance the match end position to the first acceptable match boundary. 1.4250 + // This advances the index over any combining charcters. 1.4251 + if (minLimit < maxLimit) { 1.4252 + int32_t nba = nextBoundaryAfter(strsrch, minLimit); 1.4253 + 1.4254 + if (nba >= lastCEI->highIndex) { 1.4255 + mLimit = nba; 1.4256 + } 1.4257 + } 1.4258 + 1.4259 + // If advancing to the end of a combining sequence in character indexing space 1.4260 + // advanced us beyond the end of the match in CE space, reject this match. 1.4261 + if (mLimit > maxLimit) { 1.4262 + found = FALSE; 1.4263 + } 1.4264 + 1.4265 + // Make sure the end of the match is on a break boundary 1.4266 + if (!isBreakBoundary(strsrch, mLimit)) { 1.4267 + found = FALSE; 1.4268 + } 1.4269 + 1.4270 + } else { 1.4271 + // No non-ignorable CEs after this point. 1.4272 + // The maximum position is detected by boundary after 1.4273 + // the last non-ignorable CE. Combining sequence 1.4274 + // across the start index will be truncated. 1.4275 + int32_t nba = nextBoundaryAfter(strsrch, minLimit); 1.4276 + mLimit = maxLimit = (nba > 0) && (startIdx > nba) ? nba : startIdx; 1.4277 + } 1.4278 + 1.4279 + #ifdef USEARCH_DEBUG 1.4280 + if (getenv("USEARCH_DEBUG") != NULL) { 1.4281 + printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit); 1.4282 + } 1.4283 + #endif 1.4284 + 1.4285 + 1.4286 + if (! checkIdentical(strsrch, mStart, mLimit)) { 1.4287 + found = FALSE; 1.4288 + } 1.4289 + 1.4290 + if (found) { 1.4291 + break; 1.4292 + } 1.4293 + } 1.4294 + 1.4295 + #ifdef USEARCH_DEBUG 1.4296 + if (getenv("USEARCH_DEBUG") != NULL) { 1.4297 + printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx); 1.4298 + int32_t lastToPrint = ceb.limitIx+2; 1.4299 + for (int ii=ceb.firstIx; ii<lastToPrint; ii++) { 1.4300 + printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex); 1.4301 + } 1.4302 + printf("\n%s\n", found? "match found" : "no match"); 1.4303 + } 1.4304 + #endif 1.4305 + 1.4306 + // All Done. Store back the match bounds to the caller. 1.4307 + // 1.4308 + if (found==FALSE) { 1.4309 + mLimit = -1; 1.4310 + mStart = -1; 1.4311 + } 1.4312 + 1.4313 + if (matchStart != NULL) { 1.4314 + *matchStart= mStart; 1.4315 + } 1.4316 + 1.4317 + if (matchLimit != NULL) { 1.4318 + *matchLimit = mLimit; 1.4319 + } 1.4320 + 1.4321 + return found; 1.4322 +} 1.4323 + 1.4324 +// internal use methods declared in usrchimp.h ----------------------------- 1.4325 + 1.4326 +UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status) 1.4327 +{ 1.4328 + if (U_FAILURE(*status)) { 1.4329 + setMatchNotFound(strsrch); 1.4330 + return FALSE; 1.4331 + } 1.4332 + 1.4333 +#if BOYER_MOORE 1.4334 + UCollationElements *coleiter = strsrch->textIter; 1.4335 + int32_t textlength = strsrch->search->textLength; 1.4336 + int32_t *patternce = strsrch->pattern.CE; 1.4337 + int32_t patterncelength = strsrch->pattern.CELength; 1.4338 + int32_t textoffset = ucol_getOffset(coleiter); 1.4339 + 1.4340 + // status used in setting coleiter offset, since offset is checked in 1.4341 + // shiftForward before setting the coleiter offset, status never 1.4342 + // a failure 1.4343 + textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER, 1.4344 + patterncelength); 1.4345 + while (textoffset <= textlength) 1.4346 + { 1.4347 + uint32_t patternceindex = patterncelength - 1; 1.4348 + int32_t targetce; 1.4349 + UBool found = FALSE; 1.4350 + int32_t lastce = UCOL_NULLORDER; 1.4351 + 1.4352 + setColEIterOffset(coleiter, textoffset); 1.4353 + 1.4354 + for (;;) { 1.4355 + // finding the last pattern ce match, imagine composite characters 1.4356 + // for example: search for pattern A in text \u00C0 1.4357 + // we'll have to skip \u0300 the grave first before we get to A 1.4358 + targetce = ucol_previous(coleiter, status); 1.4359 + if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) { 1.4360 + found = FALSE; 1.4361 + break; 1.4362 + } 1.4363 + targetce = getCE(strsrch, targetce); 1.4364 + if (targetce == UCOL_IGNORABLE && inNormBuf(coleiter)) { 1.4365 + // this is for the text \u0315\u0300 that requires 1.4366 + // normalization and pattern \u0300, where \u0315 is ignorable 1.4367 + continue; 1.4368 + } 1.4369 + if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) { 1.4370 + lastce = targetce; 1.4371 + } 1.4372 + // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s 1.4373 + if (targetce == patternce[patternceindex]) { 1.4374 + // the first ce can be a contraction 1.4375 + found = TRUE; 1.4376 + break; 1.4377 + } 1.4378 + if (!hasExpansion(coleiter)) { 1.4379 + found = FALSE; 1.4380 + break; 1.4381 + } 1.4382 + } 1.4383 + 1.4384 + //targetce = lastce; 1.4385 + 1.4386 + while (found && patternceindex > 0) { 1.4387 + lastce = targetce; 1.4388 + targetce = ucol_previous(coleiter, status); 1.4389 + if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) { 1.4390 + found = FALSE; 1.4391 + break; 1.4392 + } 1.4393 + targetce = getCE(strsrch, targetce); 1.4394 + if (targetce == UCOL_IGNORABLE) { 1.4395 + continue; 1.4396 + } 1.4397 + 1.4398 + patternceindex --; 1.4399 + // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s 1.4400 + found = found && targetce == patternce[patternceindex]; 1.4401 + } 1.4402 + 1.4403 + targetce = lastce; 1.4404 + 1.4405 + if (!found) { 1.4406 + if (U_FAILURE(*status)) { 1.4407 + break; 1.4408 + } 1.4409 + textoffset = shiftForward(strsrch, textoffset, lastce, 1.4410 + patternceindex); 1.4411 + // status checked at loop. 1.4412 + patternceindex = patterncelength; 1.4413 + continue; 1.4414 + } 1.4415 + 1.4416 + if (checkNextExactMatch(strsrch, &textoffset, status)) { 1.4417 + // status checked in ucol_setOffset 1.4418 + setColEIterOffset(coleiter, strsrch->search->matchedIndex); 1.4419 + return TRUE; 1.4420 + } 1.4421 + } 1.4422 + setMatchNotFound(strsrch); 1.4423 + return FALSE; 1.4424 +#else 1.4425 + int32_t textOffset = ucol_getOffset(strsrch->textIter); 1.4426 + int32_t start = -1; 1.4427 + int32_t end = -1; 1.4428 + 1.4429 + if (usearch_search(strsrch, textOffset, &start, &end, status)) { 1.4430 + strsrch->search->matchedIndex = start; 1.4431 + strsrch->search->matchedLength = end - start; 1.4432 + return TRUE; 1.4433 + } else { 1.4434 + setMatchNotFound(strsrch); 1.4435 + return FALSE; 1.4436 + } 1.4437 +#endif 1.4438 +} 1.4439 + 1.4440 +UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status) 1.4441 +{ 1.4442 + if (U_FAILURE(*status)) { 1.4443 + setMatchNotFound(strsrch); 1.4444 + return FALSE; 1.4445 + } 1.4446 + 1.4447 +#if BOYER_MOORE 1.4448 + UCollationElements *coleiter = strsrch->textIter; 1.4449 + int32_t textlength = strsrch->search->textLength; 1.4450 + int32_t *patternce = strsrch->pattern.CE; 1.4451 + int32_t patterncelength = strsrch->pattern.CELength; 1.4452 + int32_t textoffset = ucol_getOffset(coleiter); 1.4453 + UBool hasPatternAccents = 1.4454 + strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents; 1.4455 + 1.4456 + textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER, 1.4457 + patterncelength); 1.4458 + strsrch->canonicalPrefixAccents[0] = 0; 1.4459 + strsrch->canonicalSuffixAccents[0] = 0; 1.4460 + 1.4461 + while (textoffset <= textlength) 1.4462 + { 1.4463 + int32_t patternceindex = patterncelength - 1; 1.4464 + int32_t targetce; 1.4465 + UBool found = FALSE; 1.4466 + int32_t lastce = UCOL_NULLORDER; 1.4467 + 1.4468 + setColEIterOffset(coleiter, textoffset); 1.4469 + 1.4470 + for (;;) { 1.4471 + // finding the last pattern ce match, imagine composite characters 1.4472 + // for example: search for pattern A in text \u00C0 1.4473 + // we'll have to skip \u0300 the grave first before we get to A 1.4474 + targetce = ucol_previous(coleiter, status); 1.4475 + if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) { 1.4476 + found = FALSE; 1.4477 + break; 1.4478 + } 1.4479 + targetce = getCE(strsrch, targetce); 1.4480 + if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) { 1.4481 + lastce = targetce; 1.4482 + } 1.4483 + // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s 1.4484 + if (targetce == patternce[patternceindex]) { 1.4485 + // the first ce can be a contraction 1.4486 + found = TRUE; 1.4487 + break; 1.4488 + } 1.4489 + if (!hasExpansion(coleiter)) { 1.4490 + found = FALSE; 1.4491 + break; 1.4492 + } 1.4493 + } 1.4494 + 1.4495 + while (found && patternceindex > 0) { 1.4496 + targetce = ucol_previous(coleiter, status); 1.4497 + if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) { 1.4498 + found = FALSE; 1.4499 + break; 1.4500 + } 1.4501 + targetce = getCE(strsrch, targetce); 1.4502 + if (targetce == UCOL_IGNORABLE) { 1.4503 + continue; 1.4504 + } 1.4505 + 1.4506 + patternceindex --; 1.4507 + // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s 1.4508 + found = found && targetce == patternce[patternceindex]; 1.4509 + } 1.4510 + 1.4511 + // initializing the rearranged accent array 1.4512 + if (hasPatternAccents && !found) { 1.4513 + strsrch->canonicalPrefixAccents[0] = 0; 1.4514 + strsrch->canonicalSuffixAccents[0] = 0; 1.4515 + if (U_FAILURE(*status)) { 1.4516 + break; 1.4517 + } 1.4518 + found = doNextCanonicalMatch(strsrch, textoffset, status); 1.4519 + } 1.4520 + 1.4521 + if (!found) { 1.4522 + if (U_FAILURE(*status)) { 1.4523 + break; 1.4524 + } 1.4525 + textoffset = shiftForward(strsrch, textoffset, lastce, 1.4526 + patternceindex); 1.4527 + // status checked at loop 1.4528 + patternceindex = patterncelength; 1.4529 + continue; 1.4530 + } 1.4531 + 1.4532 + if (checkNextCanonicalMatch(strsrch, &textoffset, status)) { 1.4533 + setColEIterOffset(coleiter, strsrch->search->matchedIndex); 1.4534 + return TRUE; 1.4535 + } 1.4536 + } 1.4537 + setMatchNotFound(strsrch); 1.4538 + return FALSE; 1.4539 +#else 1.4540 + int32_t textOffset = ucol_getOffset(strsrch->textIter); 1.4541 + int32_t start = -1; 1.4542 + int32_t end = -1; 1.4543 + 1.4544 + if (usearch_search(strsrch, textOffset, &start, &end, status)) { 1.4545 + strsrch->search->matchedIndex = start; 1.4546 + strsrch->search->matchedLength = end - start; 1.4547 + return TRUE; 1.4548 + } else { 1.4549 + setMatchNotFound(strsrch); 1.4550 + return FALSE; 1.4551 + } 1.4552 +#endif 1.4553 +} 1.4554 + 1.4555 +UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status) 1.4556 +{ 1.4557 + if (U_FAILURE(*status)) { 1.4558 + setMatchNotFound(strsrch); 1.4559 + return FALSE; 1.4560 + } 1.4561 + 1.4562 +#if BOYER_MOORE 1.4563 + UCollationElements *coleiter = strsrch->textIter; 1.4564 + int32_t *patternce = strsrch->pattern.CE; 1.4565 + int32_t patterncelength = strsrch->pattern.CELength; 1.4566 + int32_t textoffset = ucol_getOffset(coleiter); 1.4567 + 1.4568 + // shifting it check for setting offset 1.4569 + // if setOffset is called previously or there was no previous match, we 1.4570 + // leave the offset as it is. 1.4571 + if (strsrch->search->matchedIndex != USEARCH_DONE) { 1.4572 + textoffset = strsrch->search->matchedIndex; 1.4573 + } 1.4574 + 1.4575 + textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER, 1.4576 + patterncelength); 1.4577 + 1.4578 + while (textoffset >= 0) 1.4579 + { 1.4580 + int32_t patternceindex = 1; 1.4581 + int32_t targetce; 1.4582 + UBool found = FALSE; 1.4583 + int32_t firstce = UCOL_NULLORDER; 1.4584 + 1.4585 + // if status is a failure, ucol_setOffset does nothing 1.4586 + setColEIterOffset(coleiter, textoffset); 1.4587 + 1.4588 + for (;;) { 1.4589 + // finding the first pattern ce match, imagine composite 1.4590 + // characters. for example: search for pattern \u0300 in text 1.4591 + // \u00C0, we'll have to skip A first before we get to 1.4592 + // \u0300 the grave accent 1.4593 + targetce = ucol_next(coleiter, status); 1.4594 + if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) { 1.4595 + found = FALSE; 1.4596 + break; 1.4597 + } 1.4598 + targetce = getCE(strsrch, targetce); 1.4599 + if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) { 1.4600 + firstce = targetce; 1.4601 + } 1.4602 + if (targetce == UCOL_IGNORABLE && strsrch->strength != UCOL_PRIMARY) { 1.4603 + continue; 1.4604 + } 1.4605 + // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s 1.4606 + if (targetce == patternce[0]) { 1.4607 + found = TRUE; 1.4608 + break; 1.4609 + } 1.4610 + if (!hasExpansion(coleiter)) { 1.4611 + // checking for accents in composite character 1.4612 + found = FALSE; 1.4613 + break; 1.4614 + } 1.4615 + } 1.4616 + 1.4617 + //targetce = firstce; 1.4618 + 1.4619 + while (found && (patternceindex < patterncelength)) { 1.4620 + firstce = targetce; 1.4621 + targetce = ucol_next(coleiter, status); 1.4622 + if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) { 1.4623 + found = FALSE; 1.4624 + break; 1.4625 + } 1.4626 + targetce = getCE(strsrch, targetce); 1.4627 + if (targetce == UCOL_IGNORABLE) { 1.4628 + continue; 1.4629 + } 1.4630 + 1.4631 + // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s 1.4632 + found = found && targetce == patternce[patternceindex]; 1.4633 + patternceindex ++; 1.4634 + } 1.4635 + 1.4636 + targetce = firstce; 1.4637 + 1.4638 + if (!found) { 1.4639 + if (U_FAILURE(*status)) { 1.4640 + break; 1.4641 + } 1.4642 + 1.4643 + textoffset = reverseShift(strsrch, textoffset, targetce, 1.4644 + patternceindex); 1.4645 + patternceindex = 0; 1.4646 + continue; 1.4647 + } 1.4648 + 1.4649 + if (checkPreviousExactMatch(strsrch, &textoffset, status)) { 1.4650 + setColEIterOffset(coleiter, textoffset); 1.4651 + return TRUE; 1.4652 + } 1.4653 + } 1.4654 + setMatchNotFound(strsrch); 1.4655 + return FALSE; 1.4656 +#else 1.4657 + int32_t textOffset; 1.4658 + 1.4659 + if (strsrch->search->isOverlap) { 1.4660 + if (strsrch->search->matchedIndex != USEARCH_DONE) { 1.4661 + textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - 1; 1.4662 + } else { 1.4663 + // move the start position at the end of possible match 1.4664 + initializePatternPCETable(strsrch, status); 1.4665 + for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.PCELength - 1; nPCEs++) { 1.4666 + int64_t pce = ucol_nextProcessed(strsrch->textIter, NULL, NULL, status); 1.4667 + if (pce == UCOL_PROCESSED_NULLORDER) { 1.4668 + // at the end of the text 1.4669 + break; 1.4670 + } 1.4671 + } 1.4672 + if (U_FAILURE(*status)) { 1.4673 + setMatchNotFound(strsrch); 1.4674 + return FALSE; 1.4675 + } 1.4676 + textOffset = ucol_getOffset(strsrch->textIter); 1.4677 + } 1.4678 + } else { 1.4679 + textOffset = ucol_getOffset(strsrch->textIter); 1.4680 + } 1.4681 + 1.4682 + int32_t start = -1; 1.4683 + int32_t end = -1; 1.4684 + 1.4685 + if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) { 1.4686 + strsrch->search->matchedIndex = start; 1.4687 + strsrch->search->matchedLength = end - start; 1.4688 + return TRUE; 1.4689 + } else { 1.4690 + setMatchNotFound(strsrch); 1.4691 + return FALSE; 1.4692 + } 1.4693 +#endif 1.4694 +} 1.4695 + 1.4696 +UBool usearch_handlePreviousCanonical(UStringSearch *strsrch, 1.4697 + UErrorCode *status) 1.4698 +{ 1.4699 + if (U_FAILURE(*status)) { 1.4700 + setMatchNotFound(strsrch); 1.4701 + return FALSE; 1.4702 + } 1.4703 + 1.4704 +#if BOYER_MOORE 1.4705 + UCollationElements *coleiter = strsrch->textIter; 1.4706 + int32_t *patternce = strsrch->pattern.CE; 1.4707 + int32_t patterncelength = strsrch->pattern.CELength; 1.4708 + int32_t textoffset = ucol_getOffset(coleiter); 1.4709 + UBool hasPatternAccents = 1.4710 + strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents; 1.4711 + 1.4712 + // shifting it check for setting offset 1.4713 + // if setOffset is called previously or there was no previous match, we 1.4714 + // leave the offset as it is. 1.4715 + if (strsrch->search->matchedIndex != USEARCH_DONE) { 1.4716 + textoffset = strsrch->search->matchedIndex; 1.4717 + } 1.4718 + 1.4719 + textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER, 1.4720 + patterncelength); 1.4721 + strsrch->canonicalPrefixAccents[0] = 0; 1.4722 + strsrch->canonicalSuffixAccents[0] = 0; 1.4723 + 1.4724 + while (textoffset >= 0) 1.4725 + { 1.4726 + int32_t patternceindex = 1; 1.4727 + int32_t targetce; 1.4728 + UBool found = FALSE; 1.4729 + int32_t firstce = UCOL_NULLORDER; 1.4730 + 1.4731 + setColEIterOffset(coleiter, textoffset); 1.4732 + for (;;) { 1.4733 + // finding the first pattern ce match, imagine composite 1.4734 + // characters. for example: search for pattern \u0300 in text 1.4735 + // \u00C0, we'll have to skip A first before we get to 1.4736 + // \u0300 the grave accent 1.4737 + targetce = ucol_next(coleiter, status); 1.4738 + if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) { 1.4739 + found = FALSE; 1.4740 + break; 1.4741 + } 1.4742 + targetce = getCE(strsrch, targetce); 1.4743 + if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) { 1.4744 + firstce = targetce; 1.4745 + } 1.4746 + 1.4747 + // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s 1.4748 + if (targetce == patternce[0]) { 1.4749 + // the first ce can be a contraction 1.4750 + found = TRUE; 1.4751 + break; 1.4752 + } 1.4753 + if (!hasExpansion(coleiter)) { 1.4754 + // checking for accents in composite character 1.4755 + found = FALSE; 1.4756 + break; 1.4757 + } 1.4758 + } 1.4759 + 1.4760 + targetce = firstce; 1.4761 + 1.4762 + while (found && patternceindex < patterncelength) { 1.4763 + targetce = ucol_next(coleiter, status); 1.4764 + if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) { 1.4765 + found = FALSE; 1.4766 + break; 1.4767 + } 1.4768 + targetce = getCE(strsrch, targetce); 1.4769 + if (targetce == UCOL_IGNORABLE) { 1.4770 + continue; 1.4771 + } 1.4772 + 1.4773 + // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s 1.4774 + found = found && targetce == patternce[patternceindex]; 1.4775 + patternceindex ++; 1.4776 + } 1.4777 + 1.4778 + // initializing the rearranged accent array 1.4779 + if (hasPatternAccents && !found) { 1.4780 + strsrch->canonicalPrefixAccents[0] = 0; 1.4781 + strsrch->canonicalSuffixAccents[0] = 0; 1.4782 + if (U_FAILURE(*status)) { 1.4783 + break; 1.4784 + } 1.4785 + found = doPreviousCanonicalMatch(strsrch, textoffset, status); 1.4786 + } 1.4787 + 1.4788 + if (!found) { 1.4789 + if (U_FAILURE(*status)) { 1.4790 + break; 1.4791 + } 1.4792 + textoffset = reverseShift(strsrch, textoffset, targetce, 1.4793 + patternceindex); 1.4794 + patternceindex = 0; 1.4795 + continue; 1.4796 + } 1.4797 + 1.4798 + if (checkPreviousCanonicalMatch(strsrch, &textoffset, status)) { 1.4799 + setColEIterOffset(coleiter, textoffset); 1.4800 + return TRUE; 1.4801 + } 1.4802 + } 1.4803 + setMatchNotFound(strsrch); 1.4804 + return FALSE; 1.4805 +#else 1.4806 + int32_t textOffset; 1.4807 + 1.4808 + if (strsrch->search->isOverlap) { 1.4809 + if (strsrch->search->matchedIndex != USEARCH_DONE) { 1.4810 + textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - 1; 1.4811 + } else { 1.4812 + // move the start position at the end of possible match 1.4813 + initializePatternPCETable(strsrch, status); 1.4814 + for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.PCELength - 1; nPCEs++) { 1.4815 + int64_t pce = ucol_nextProcessed(strsrch->textIter, NULL, NULL, status); 1.4816 + if (pce == UCOL_PROCESSED_NULLORDER) { 1.4817 + // at the end of the text 1.4818 + break; 1.4819 + } 1.4820 + } 1.4821 + if (U_FAILURE(*status)) { 1.4822 + setMatchNotFound(strsrch); 1.4823 + return FALSE; 1.4824 + } 1.4825 + textOffset = ucol_getOffset(strsrch->textIter); 1.4826 + } 1.4827 + } else { 1.4828 + textOffset = ucol_getOffset(strsrch->textIter); 1.4829 + } 1.4830 + 1.4831 + int32_t start = -1; 1.4832 + int32_t end = -1; 1.4833 + 1.4834 + if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) { 1.4835 + strsrch->search->matchedIndex = start; 1.4836 + strsrch->search->matchedLength = end - start; 1.4837 + return TRUE; 1.4838 + } else { 1.4839 + setMatchNotFound(strsrch); 1.4840 + return FALSE; 1.4841 + } 1.4842 +#endif 1.4843 +} 1.4844 + 1.4845 +#endif /* #if !UCONFIG_NO_COLLATION */