Wed, 31 Dec 2014 07:22:50 +0100
Correct previous dual key logic pending first delivery installment.
michael@0 | 1 | /* |
michael@0 | 2 | ********************************************************************** |
michael@0 | 3 | * Copyright (C) 2001-2011 IBM and others. All rights reserved. |
michael@0 | 4 | ********************************************************************** |
michael@0 | 5 | * Date Name Description |
michael@0 | 6 | * 07/02/2001 synwee Creation. |
michael@0 | 7 | ********************************************************************** |
michael@0 | 8 | */ |
michael@0 | 9 | |
michael@0 | 10 | #include "unicode/utypes.h" |
michael@0 | 11 | |
michael@0 | 12 | #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION |
michael@0 | 13 | |
michael@0 | 14 | #include "unicode/usearch.h" |
michael@0 | 15 | #include "unicode/ustring.h" |
michael@0 | 16 | #include "unicode/uchar.h" |
michael@0 | 17 | #include "unicode/utf16.h" |
michael@0 | 18 | #include "normalizer2impl.h" |
michael@0 | 19 | #include "ucol_imp.h" |
michael@0 | 20 | #include "usrchimp.h" |
michael@0 | 21 | #include "cmemory.h" |
michael@0 | 22 | #include "ucln_in.h" |
michael@0 | 23 | #include "uassert.h" |
michael@0 | 24 | #include "ustr_imp.h" |
michael@0 | 25 | |
michael@0 | 26 | U_NAMESPACE_USE |
michael@0 | 27 | |
// Boyer-Moore searching is switched off.
// (If it is ever turned on again there are several new TODOs that will need
// to be addressed first.)
#define BOYER_MOORE 0

// Element count of a true array (not a pointer), as an int32_t.
// The complete expansion is parenthesized so that the macro behaves as a
// single primary expression wherever it is used (e.g. under sizeof).
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))

// internal definition ---------------------------------------------------

// Mask selecting the low byte of a 16-bit FCD value.
#define LAST_BYTE_MASK_ 0xFF
// Right shift that moves the high byte of a 16-bit FCD value into the low byte.
#define SECOND_LAST_BYTE_SHIFT_ 8
#define SUPPLEMENTARY_MIN_VALUE_ 0x10000
michael@0 | 39 | |
michael@0 | 40 | static const Normalizer2Impl *g_nfcImpl = NULL; |
michael@0 | 41 | |
michael@0 | 42 | // internal methods ------------------------------------------------- |
michael@0 | 43 | |
michael@0 | 44 | /** |
michael@0 | 45 | * Fast collation element iterator setOffset. |
michael@0 | 46 | * This function does not check for bounds. |
michael@0 | 47 | * @param coleiter collation element iterator |
michael@0 | 48 | * @param offset to set |
michael@0 | 49 | */ |
michael@0 | 50 | static |
michael@0 | 51 | inline void setColEIterOffset(UCollationElements *elems, |
michael@0 | 52 | int32_t offset) |
michael@0 | 53 | { |
michael@0 | 54 | collIterate *ci = &(elems->iteratordata_); |
michael@0 | 55 | ci->pos = ci->string + offset; |
michael@0 | 56 | ci->CEpos = ci->toReturn = ci->extendCEs ? ci->extendCEs : ci->CEs; |
michael@0 | 57 | if (ci->flags & UCOL_ITER_INNORMBUF) { |
michael@0 | 58 | ci->flags = ci->origFlags; |
michael@0 | 59 | } |
michael@0 | 60 | ci->fcdPosition = NULL; |
michael@0 | 61 | |
michael@0 | 62 | ci->offsetReturn = NULL; |
michael@0 | 63 | ci->offsetStore = ci->offsetBuffer; |
michael@0 | 64 | ci->offsetRepeatCount = ci->offsetRepeatValue = 0; |
michael@0 | 65 | } |
michael@0 | 66 | |
michael@0 | 67 | /** |
michael@0 | 68 | * Getting the mask for collation strength |
michael@0 | 69 | * @param strength collation strength |
michael@0 | 70 | * @return collation element mask |
michael@0 | 71 | */ |
michael@0 | 72 | static |
michael@0 | 73 | inline uint32_t getMask(UCollationStrength strength) |
michael@0 | 74 | { |
michael@0 | 75 | switch (strength) |
michael@0 | 76 | { |
michael@0 | 77 | case UCOL_PRIMARY: |
michael@0 | 78 | return UCOL_PRIMARYORDERMASK; |
michael@0 | 79 | case UCOL_SECONDARY: |
michael@0 | 80 | return UCOL_SECONDARYORDERMASK | UCOL_PRIMARYORDERMASK; |
michael@0 | 81 | default: |
michael@0 | 82 | return UCOL_TERTIARYORDERMASK | UCOL_SECONDARYORDERMASK | |
michael@0 | 83 | UCOL_PRIMARYORDERMASK; |
michael@0 | 84 | } |
michael@0 | 85 | } |
michael@0 | 86 | |
michael@0 | 87 | /** |
michael@0 | 88 | * This is to squeeze the 21bit ces into a 256 table |
michael@0 | 89 | * @param ce collation element |
michael@0 | 90 | * @return collapsed version of the collation element |
michael@0 | 91 | */ |
michael@0 | 92 | static |
michael@0 | 93 | inline int hash(uint32_t ce) |
michael@0 | 94 | { |
michael@0 | 95 | // the old value UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_ does not work |
michael@0 | 96 | // well with the new collation where most of the latin 1 characters |
michael@0 | 97 | // are of the value xx000xxx. their hashes will most of the time be 0 |
michael@0 | 98 | // to be discussed on the hash algo. |
michael@0 | 99 | return UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_; |
michael@0 | 100 | } |
michael@0 | 101 | |
michael@0 | 102 | U_CDECL_BEGIN |
michael@0 | 103 | static UBool U_CALLCONV |
michael@0 | 104 | usearch_cleanup(void) { |
michael@0 | 105 | g_nfcImpl = NULL; |
michael@0 | 106 | return TRUE; |
michael@0 | 107 | } |
michael@0 | 108 | U_CDECL_END |
michael@0 | 109 | |
michael@0 | 110 | /** |
michael@0 | 111 | * Initializing the fcd tables. |
michael@0 | 112 | * Internal method, status assumed to be a success. |
michael@0 | 113 | * @param status output error if any, caller to check status before calling |
michael@0 | 114 | * method, status assumed to be success when passed in. |
michael@0 | 115 | */ |
michael@0 | 116 | static |
michael@0 | 117 | inline void initializeFCD(UErrorCode *status) |
michael@0 | 118 | { |
michael@0 | 119 | if (g_nfcImpl == NULL) { |
michael@0 | 120 | g_nfcImpl = Normalizer2Factory::getNFCImpl(*status); |
michael@0 | 121 | ucln_i18n_registerCleanup(UCLN_I18N_USEARCH, usearch_cleanup); |
michael@0 | 122 | } |
michael@0 | 123 | } |
michael@0 | 124 | |
michael@0 | 125 | /** |
michael@0 | 126 | * Gets the fcd value for a character at the argument index. |
michael@0 | 127 | * This method takes into accounts of the supplementary characters. |
michael@0 | 128 | * @param str UTF16 string where character for fcd retrieval resides |
michael@0 | 129 | * @param offset position of the character whose fcd is to be retrieved, to be |
michael@0 | 130 | * overwritten with the next character position, taking |
michael@0 | 131 | * surrogate characters into consideration. |
michael@0 | 132 | * @param strlength length of the argument string |
michael@0 | 133 | * @return fcd value |
michael@0 | 134 | */ |
michael@0 | 135 | static |
michael@0 | 136 | uint16_t getFCD(const UChar *str, int32_t *offset, |
michael@0 | 137 | int32_t strlength) |
michael@0 | 138 | { |
michael@0 | 139 | const UChar *temp = str + *offset; |
michael@0 | 140 | uint16_t result = g_nfcImpl->nextFCD16(temp, str + strlength); |
michael@0 | 141 | *offset = (int32_t)(temp - str); |
michael@0 | 142 | return result; |
michael@0 | 143 | } |
michael@0 | 144 | |
michael@0 | 145 | /** |
michael@0 | 146 | * Getting the modified collation elements taking into account the collation |
michael@0 | 147 | * attributes |
michael@0 | 148 | * @param strsrch string search data |
michael@0 | 149 | * @param sourcece |
michael@0 | 150 | * @return the modified collation element |
michael@0 | 151 | */ |
michael@0 | 152 | static |
michael@0 | 153 | inline int32_t getCE(const UStringSearch *strsrch, uint32_t sourcece) |
michael@0 | 154 | { |
michael@0 | 155 | // note for tertiary we can't use the collator->tertiaryMask, that |
michael@0 | 156 | // is a preprocessed mask that takes into account case options. since |
michael@0 | 157 | // we are only concerned with exact matches, we don't need that. |
michael@0 | 158 | sourcece &= strsrch->ceMask; |
michael@0 | 159 | |
michael@0 | 160 | if (strsrch->toShift) { |
michael@0 | 161 | // alternate handling here, since only the 16 most significant digits |
michael@0 | 162 | // is only used, we can safely do a compare without masking |
michael@0 | 163 | // if the ce is a variable, we mask and get only the primary values |
michael@0 | 164 | // no shifting to quartenary is required since all primary values |
michael@0 | 165 | // less than variabletop will need to be masked off anyway. |
michael@0 | 166 | if (strsrch->variableTop > sourcece) { |
michael@0 | 167 | if (strsrch->strength >= UCOL_QUATERNARY) { |
michael@0 | 168 | sourcece &= UCOL_PRIMARYORDERMASK; |
michael@0 | 169 | } |
michael@0 | 170 | else { |
michael@0 | 171 | sourcece = UCOL_IGNORABLE; |
michael@0 | 172 | } |
michael@0 | 173 | } |
michael@0 | 174 | } else if (strsrch->strength >= UCOL_QUATERNARY && sourcece == UCOL_IGNORABLE) { |
michael@0 | 175 | sourcece = 0xFFFF; |
michael@0 | 176 | } |
michael@0 | 177 | |
michael@0 | 178 | return sourcece; |
michael@0 | 179 | } |
michael@0 | 180 | |
michael@0 | 181 | /** |
michael@0 | 182 | * Allocate a memory and returns NULL if it failed. |
michael@0 | 183 | * Internal method, status assumed to be a success. |
michael@0 | 184 | * @param size to allocate |
michael@0 | 185 | * @param status output error if any, caller to check status before calling |
michael@0 | 186 | * method, status assumed to be success when passed in. |
michael@0 | 187 | * @return newly allocated array, NULL otherwise |
michael@0 | 188 | */ |
michael@0 | 189 | static |
michael@0 | 190 | inline void * allocateMemory(uint32_t size, UErrorCode *status) |
michael@0 | 191 | { |
michael@0 | 192 | uint32_t *result = (uint32_t *)uprv_malloc(size); |
michael@0 | 193 | if (result == NULL) { |
michael@0 | 194 | *status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 195 | } |
michael@0 | 196 | return result; |
michael@0 | 197 | } |
michael@0 | 198 | |
michael@0 | 199 | /** |
michael@0 | 200 | * Adds a uint32_t value to a destination array. |
michael@0 | 201 | * Creates a new array if we run out of space. The caller will have to |
michael@0 | 202 | * manually deallocate the newly allocated array. |
michael@0 | 203 | * Internal method, status assumed to be success, caller has to check status |
michael@0 | 204 | * before calling this method. destination not to be NULL and has at least |
michael@0 | 205 | * size destinationlength. |
michael@0 | 206 | * @param destination target array |
michael@0 | 207 | * @param offset destination offset to add value |
michael@0 | 208 | * @param destinationlength target array size, return value for the new size |
michael@0 | 209 | * @param value to be added |
michael@0 | 210 | * @param increments incremental size expected |
michael@0 | 211 | * @param status output error if any, caller to check status before calling |
michael@0 | 212 | * method, status assumed to be success when passed in. |
michael@0 | 213 | * @return new destination array, destination if there was no new allocation |
michael@0 | 214 | */ |
michael@0 | 215 | static |
michael@0 | 216 | inline int32_t * addTouint32_tArray(int32_t *destination, |
michael@0 | 217 | uint32_t offset, |
michael@0 | 218 | uint32_t *destinationlength, |
michael@0 | 219 | uint32_t value, |
michael@0 | 220 | uint32_t increments, |
michael@0 | 221 | UErrorCode *status) |
michael@0 | 222 | { |
michael@0 | 223 | uint32_t newlength = *destinationlength; |
michael@0 | 224 | if (offset + 1 == newlength) { |
michael@0 | 225 | newlength += increments; |
michael@0 | 226 | int32_t *temp = (int32_t *)allocateMemory( |
michael@0 | 227 | sizeof(int32_t) * newlength, status); |
michael@0 | 228 | if (U_FAILURE(*status)) { |
michael@0 | 229 | return NULL; |
michael@0 | 230 | } |
michael@0 | 231 | uprv_memcpy(temp, destination, sizeof(int32_t) * offset); |
michael@0 | 232 | *destinationlength = newlength; |
michael@0 | 233 | destination = temp; |
michael@0 | 234 | } |
michael@0 | 235 | destination[offset] = value; |
michael@0 | 236 | return destination; |
michael@0 | 237 | } |
michael@0 | 238 | |
michael@0 | 239 | /** |
michael@0 | 240 | * Adds a uint64_t value to a destination array. |
michael@0 | 241 | * Creates a new array if we run out of space. The caller will have to |
michael@0 | 242 | * manually deallocate the newly allocated array. |
michael@0 | 243 | * Internal method, status assumed to be success, caller has to check status |
michael@0 | 244 | * before calling this method. destination not to be NULL and has at least |
michael@0 | 245 | * size destinationlength. |
michael@0 | 246 | * @param destination target array |
michael@0 | 247 | * @param offset destination offset to add value |
michael@0 | 248 | * @param destinationlength target array size, return value for the new size |
michael@0 | 249 | * @param value to be added |
michael@0 | 250 | * @param increments incremental size expected |
michael@0 | 251 | * @param status output error if any, caller to check status before calling |
michael@0 | 252 | * method, status assumed to be success when passed in. |
michael@0 | 253 | * @return new destination array, destination if there was no new allocation |
michael@0 | 254 | */ |
michael@0 | 255 | static |
michael@0 | 256 | inline int64_t * addTouint64_tArray(int64_t *destination, |
michael@0 | 257 | uint32_t offset, |
michael@0 | 258 | uint32_t *destinationlength, |
michael@0 | 259 | uint64_t value, |
michael@0 | 260 | uint32_t increments, |
michael@0 | 261 | UErrorCode *status) |
michael@0 | 262 | { |
michael@0 | 263 | uint32_t newlength = *destinationlength; |
michael@0 | 264 | if (offset + 1 == newlength) { |
michael@0 | 265 | newlength += increments; |
michael@0 | 266 | int64_t *temp = (int64_t *)allocateMemory( |
michael@0 | 267 | sizeof(int64_t) * newlength, status); |
michael@0 | 268 | |
michael@0 | 269 | if (U_FAILURE(*status)) { |
michael@0 | 270 | return NULL; |
michael@0 | 271 | } |
michael@0 | 272 | |
michael@0 | 273 | uprv_memcpy(temp, destination, sizeof(int64_t) * offset); |
michael@0 | 274 | *destinationlength = newlength; |
michael@0 | 275 | destination = temp; |
michael@0 | 276 | } |
michael@0 | 277 | |
michael@0 | 278 | destination[offset] = value; |
michael@0 | 279 | |
michael@0 | 280 | return destination; |
michael@0 | 281 | } |
michael@0 | 282 | |
michael@0 | 283 | /** |
michael@0 | 284 | * Initializing the ce table for a pattern. |
michael@0 | 285 | * Stores non-ignorable collation keys. |
michael@0 | 286 | * Table size will be estimated by the size of the pattern text. Table |
michael@0 | 287 | * expansion will be perform as we go along. Adding 1 to ensure that the table |
michael@0 | 288 | * size definitely increases. |
michael@0 | 289 | * Internal method, status assumed to be a success. |
michael@0 | 290 | * @param strsrch string search data |
michael@0 | 291 | * @param status output error if any, caller to check status before calling |
michael@0 | 292 | * method, status assumed to be success when passed in. |
michael@0 | 293 | * @return total number of expansions |
michael@0 | 294 | */ |
michael@0 | 295 | static |
michael@0 | 296 | inline uint16_t initializePatternCETable(UStringSearch *strsrch, |
michael@0 | 297 | UErrorCode *status) |
michael@0 | 298 | { |
michael@0 | 299 | UPattern *pattern = &(strsrch->pattern); |
michael@0 | 300 | uint32_t cetablesize = INITIAL_ARRAY_SIZE_; |
michael@0 | 301 | int32_t *cetable = pattern->CEBuffer; |
michael@0 | 302 | uint32_t patternlength = pattern->textLength; |
michael@0 | 303 | UCollationElements *coleiter = strsrch->utilIter; |
michael@0 | 304 | |
michael@0 | 305 | if (coleiter == NULL) { |
michael@0 | 306 | coleiter = ucol_openElements(strsrch->collator, pattern->text, |
michael@0 | 307 | patternlength, status); |
michael@0 | 308 | // status will be checked in ucol_next(..) later and if it is an |
michael@0 | 309 | // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be |
michael@0 | 310 | // returned. |
michael@0 | 311 | strsrch->utilIter = coleiter; |
michael@0 | 312 | } |
michael@0 | 313 | else { |
michael@0 | 314 | uprv_init_collIterate(strsrch->collator, pattern->text, |
michael@0 | 315 | pattern->textLength, |
michael@0 | 316 | &coleiter->iteratordata_, |
michael@0 | 317 | status); |
michael@0 | 318 | } |
michael@0 | 319 | if(U_FAILURE(*status)) { |
michael@0 | 320 | return 0; |
michael@0 | 321 | } |
michael@0 | 322 | |
michael@0 | 323 | if (pattern->CE != cetable && pattern->CE) { |
michael@0 | 324 | uprv_free(pattern->CE); |
michael@0 | 325 | } |
michael@0 | 326 | |
michael@0 | 327 | uint16_t offset = 0; |
michael@0 | 328 | uint16_t result = 0; |
michael@0 | 329 | int32_t ce; |
michael@0 | 330 | |
michael@0 | 331 | while ((ce = ucol_next(coleiter, status)) != UCOL_NULLORDER && |
michael@0 | 332 | U_SUCCESS(*status)) { |
michael@0 | 333 | uint32_t newce = getCE(strsrch, ce); |
michael@0 | 334 | if (newce) { |
michael@0 | 335 | int32_t *temp = addTouint32_tArray(cetable, offset, &cetablesize, |
michael@0 | 336 | newce, |
michael@0 | 337 | patternlength - ucol_getOffset(coleiter) + 1, |
michael@0 | 338 | status); |
michael@0 | 339 | if (U_FAILURE(*status)) { |
michael@0 | 340 | return 0; |
michael@0 | 341 | } |
michael@0 | 342 | offset ++; |
michael@0 | 343 | if (cetable != temp && cetable != pattern->CEBuffer) { |
michael@0 | 344 | uprv_free(cetable); |
michael@0 | 345 | } |
michael@0 | 346 | cetable = temp; |
michael@0 | 347 | } |
michael@0 | 348 | result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1); |
michael@0 | 349 | } |
michael@0 | 350 | |
michael@0 | 351 | cetable[offset] = 0; |
michael@0 | 352 | pattern->CE = cetable; |
michael@0 | 353 | pattern->CELength = offset; |
michael@0 | 354 | |
michael@0 | 355 | return result; |
michael@0 | 356 | } |
michael@0 | 357 | |
michael@0 | 358 | /** |
michael@0 | 359 | * Initializing the pce table for a pattern. |
michael@0 | 360 | * Stores non-ignorable collation keys. |
michael@0 | 361 | * Table size will be estimated by the size of the pattern text. Table |
michael@0 | 362 | * expansion will be perform as we go along. Adding 1 to ensure that the table |
michael@0 | 363 | * size definitely increases. |
michael@0 | 364 | * Internal method, status assumed to be a success. |
michael@0 | 365 | * @param strsrch string search data |
michael@0 | 366 | * @param status output error if any, caller to check status before calling |
michael@0 | 367 | * method, status assumed to be success when passed in. |
michael@0 | 368 | * @return total number of expansions |
michael@0 | 369 | */ |
michael@0 | 370 | static |
michael@0 | 371 | inline uint16_t initializePatternPCETable(UStringSearch *strsrch, |
michael@0 | 372 | UErrorCode *status) |
michael@0 | 373 | { |
michael@0 | 374 | UPattern *pattern = &(strsrch->pattern); |
michael@0 | 375 | uint32_t pcetablesize = INITIAL_ARRAY_SIZE_; |
michael@0 | 376 | int64_t *pcetable = pattern->PCEBuffer; |
michael@0 | 377 | uint32_t patternlength = pattern->textLength; |
michael@0 | 378 | UCollationElements *coleiter = strsrch->utilIter; |
michael@0 | 379 | |
michael@0 | 380 | if (coleiter == NULL) { |
michael@0 | 381 | coleiter = ucol_openElements(strsrch->collator, pattern->text, |
michael@0 | 382 | patternlength, status); |
michael@0 | 383 | // status will be checked in ucol_next(..) later and if it is an |
michael@0 | 384 | // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be |
michael@0 | 385 | // returned. |
michael@0 | 386 | strsrch->utilIter = coleiter; |
michael@0 | 387 | } else { |
michael@0 | 388 | uprv_init_collIterate(strsrch->collator, pattern->text, |
michael@0 | 389 | pattern->textLength, |
michael@0 | 390 | &coleiter->iteratordata_, |
michael@0 | 391 | status); |
michael@0 | 392 | } |
michael@0 | 393 | if(U_FAILURE(*status)) { |
michael@0 | 394 | return 0; |
michael@0 | 395 | } |
michael@0 | 396 | |
michael@0 | 397 | if (pattern->PCE != pcetable && pattern->PCE != NULL) { |
michael@0 | 398 | uprv_free(pattern->PCE); |
michael@0 | 399 | } |
michael@0 | 400 | |
michael@0 | 401 | uint16_t offset = 0; |
michael@0 | 402 | uint16_t result = 0; |
michael@0 | 403 | int64_t pce; |
michael@0 | 404 | |
michael@0 | 405 | uprv_init_pce(coleiter); |
michael@0 | 406 | |
michael@0 | 407 | // ** Should processed CEs be signed or unsigned? |
michael@0 | 408 | // ** (the rest of the code in this file seems to play fast-and-loose with |
michael@0 | 409 | // ** whether a CE is signed or unsigned. For example, look at routine above this one.) |
michael@0 | 410 | while ((pce = ucol_nextProcessed(coleiter, NULL, NULL, status)) != UCOL_PROCESSED_NULLORDER && |
michael@0 | 411 | U_SUCCESS(*status)) { |
michael@0 | 412 | int64_t *temp = addTouint64_tArray(pcetable, offset, &pcetablesize, |
michael@0 | 413 | pce, |
michael@0 | 414 | patternlength - ucol_getOffset(coleiter) + 1, |
michael@0 | 415 | status); |
michael@0 | 416 | |
michael@0 | 417 | if (U_FAILURE(*status)) { |
michael@0 | 418 | return 0; |
michael@0 | 419 | } |
michael@0 | 420 | |
michael@0 | 421 | offset += 1; |
michael@0 | 422 | |
michael@0 | 423 | if (pcetable != temp && pcetable != pattern->PCEBuffer) { |
michael@0 | 424 | uprv_free(pcetable); |
michael@0 | 425 | } |
michael@0 | 426 | |
michael@0 | 427 | pcetable = temp; |
michael@0 | 428 | //result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1); |
michael@0 | 429 | } |
michael@0 | 430 | |
michael@0 | 431 | pcetable[offset] = 0; |
michael@0 | 432 | pattern->PCE = pcetable; |
michael@0 | 433 | pattern->PCELength = offset; |
michael@0 | 434 | |
michael@0 | 435 | return result; |
michael@0 | 436 | } |
michael@0 | 437 | |
michael@0 | 438 | /** |
michael@0 | 439 | * Initializes the pattern struct. |
michael@0 | 440 | * Internal method, status assumed to be success. |
michael@0 | 441 | * @param strsrch UStringSearch data storage |
michael@0 | 442 | * @param status output error if any, caller to check status before calling |
michael@0 | 443 | * method, status assumed to be success when passed in. |
michael@0 | 444 | * @return expansionsize the total expansion size of the pattern |
michael@0 | 445 | */ |
michael@0 | 446 | static |
michael@0 | 447 | inline int16_t initializePattern(UStringSearch *strsrch, UErrorCode *status) |
michael@0 | 448 | { |
michael@0 | 449 | UPattern *pattern = &(strsrch->pattern); |
michael@0 | 450 | const UChar *patterntext = pattern->text; |
michael@0 | 451 | int32_t length = pattern->textLength; |
michael@0 | 452 | int32_t index = 0; |
michael@0 | 453 | |
michael@0 | 454 | // Since the strength is primary, accents are ignored in the pattern. |
michael@0 | 455 | if (strsrch->strength == UCOL_PRIMARY) { |
michael@0 | 456 | pattern->hasPrefixAccents = 0; |
michael@0 | 457 | pattern->hasSuffixAccents = 0; |
michael@0 | 458 | } else { |
michael@0 | 459 | pattern->hasPrefixAccents = getFCD(patterntext, &index, length) >> |
michael@0 | 460 | SECOND_LAST_BYTE_SHIFT_; |
michael@0 | 461 | index = length; |
michael@0 | 462 | U16_BACK_1(patterntext, 0, index); |
michael@0 | 463 | pattern->hasSuffixAccents = getFCD(patterntext, &index, length) & |
michael@0 | 464 | LAST_BYTE_MASK_; |
michael@0 | 465 | } |
michael@0 | 466 | |
michael@0 | 467 | // ** HACK ** |
michael@0 | 468 | if (strsrch->pattern.PCE != NULL) { |
michael@0 | 469 | if (strsrch->pattern.PCE != strsrch->pattern.PCEBuffer) { |
michael@0 | 470 | uprv_free(strsrch->pattern.PCE); |
michael@0 | 471 | } |
michael@0 | 472 | |
michael@0 | 473 | strsrch->pattern.PCE = NULL; |
michael@0 | 474 | } |
michael@0 | 475 | |
michael@0 | 476 | // since intializePattern is an internal method status is a success. |
michael@0 | 477 | return initializePatternCETable(strsrch, status); |
michael@0 | 478 | } |
michael@0 | 479 | |
michael@0 | 480 | /** |
michael@0 | 481 | * Initializing shift tables, with the default values. |
michael@0 | 482 | * If a corresponding default value is 0, the shift table is not set. |
michael@0 | 483 | * @param shift table for forwards shift |
michael@0 | 484 | * @param backshift table for backwards shift |
michael@0 | 485 | * @param cetable table containing pattern ce |
michael@0 | 486 | * @param cesize size of the pattern ces |
michael@0 | 487 | * @param expansionsize total size of the expansions |
michael@0 | 488 | * @param defaultforward the default forward value |
michael@0 | 489 | * @param defaultbackward the default backward value |
michael@0 | 490 | */ |
michael@0 | 491 | static |
michael@0 | 492 | inline void setShiftTable(int16_t shift[], int16_t backshift[], |
michael@0 | 493 | int32_t *cetable, int32_t cesize, |
michael@0 | 494 | int16_t expansionsize, |
michael@0 | 495 | int16_t defaultforward, |
michael@0 | 496 | int16_t defaultbackward) |
michael@0 | 497 | { |
michael@0 | 498 | // estimate the value to shift. to do that we estimate the smallest |
michael@0 | 499 | // number of characters to give the relevant ces, ie approximately |
michael@0 | 500 | // the number of ces minus their expansion, since expansions can come |
michael@0 | 501 | // from a character. |
michael@0 | 502 | int32_t count; |
michael@0 | 503 | for (count = 0; count < MAX_TABLE_SIZE_; count ++) { |
michael@0 | 504 | shift[count] = defaultforward; |
michael@0 | 505 | } |
michael@0 | 506 | cesize --; // down to the last index |
michael@0 | 507 | for (count = 0; count < cesize; count ++) { |
michael@0 | 508 | // number of ces from right of array to the count |
michael@0 | 509 | int temp = defaultforward - count - 1; |
michael@0 | 510 | shift[hash(cetable[count])] = temp > 1 ? temp : 1; |
michael@0 | 511 | } |
michael@0 | 512 | shift[hash(cetable[cesize])] = 1; |
michael@0 | 513 | // for ignorables we just shift by one. see test examples. |
michael@0 | 514 | shift[hash(0)] = 1; |
michael@0 | 515 | |
michael@0 | 516 | for (count = 0; count < MAX_TABLE_SIZE_; count ++) { |
michael@0 | 517 | backshift[count] = defaultbackward; |
michael@0 | 518 | } |
michael@0 | 519 | for (count = cesize; count > 0; count --) { |
michael@0 | 520 | // the original value count does not seem to work |
michael@0 | 521 | backshift[hash(cetable[count])] = count > expansionsize ? |
michael@0 | 522 | (int16_t)(count - expansionsize) : 1; |
michael@0 | 523 | } |
michael@0 | 524 | backshift[hash(cetable[0])] = 1; |
michael@0 | 525 | backshift[hash(0)] = 1; |
michael@0 | 526 | } |
michael@0 | 527 | |
michael@0 | 528 | /** |
michael@0 | 529 | * Building of the pattern collation element list and the boyer moore strsrch |
michael@0 | 530 | * table. |
michael@0 | 531 | * The canonical match will only be performed after the default match fails. |
michael@0 | 532 | * For both cases we need to remember the size of the composed and decomposed |
michael@0 | 533 | * versions of the string. Since the Boyer-Moore shift calculations shifts by |
michael@0 | 534 | * a number of characters in the text and tries to match the pattern from that |
michael@0 | 535 | * offset, the shift value can not be too large in case we miss some |
michael@0 | 536 | * characters. To choose a right shift size, we estimate the NFC form of the |
michael@0 | 537 | * and use its size as a shift guide. The NFC form should be the small |
michael@0 | 538 | * possible representation of the pattern. Anyways, we'll err on the smaller |
michael@0 | 539 | * shift size. Hence the calculation for minlength. |
michael@0 | 540 | * Canonical match will be performed slightly differently. We'll split the |
michael@0 | 541 | * pattern into 3 parts, the prefix accents (PA), the middle string bounded by |
michael@0 | 542 | * the first and last base character (MS), the ending accents (EA). Matches |
michael@0 | 543 | * will be done on MS first, and only when we match MS then some processing |
michael@0 | 544 | * will be required for the prefix and end accents in order to determine if |
michael@0 | 545 | * they match PA and EA. Hence the default shift values |
michael@0 | 546 | * for the canonical match will take the size of either end's accent into |
michael@0 | 547 | * consideration. Forwards search will take the end accents into consideration |
michael@0 | 548 | * for the default shift values and the backwards search will take the prefix |
michael@0 | 549 | * accents into consideration. |
michael@0 | 550 | * If pattern has no non-ignorable ce, we return a illegal argument error. |
michael@0 | 551 | * Internal method, status assumed to be success. |
michael@0 | 552 | * @param strsrch UStringSearch data storage |
michael@0 | 553 | * @param status for output errors if it occurs, status is assumed to be a |
michael@0 | 554 | * success when it is passed in. |
michael@0 | 555 | */ |
michael@0 | 556 | static |
michael@0 | 557 | inline void initialize(UStringSearch *strsrch, UErrorCode *status) |
michael@0 | 558 | { |
michael@0 | 559 | int16_t expandlength = initializePattern(strsrch, status); |
michael@0 | 560 | if (U_SUCCESS(*status) && strsrch->pattern.CELength > 0) { |
michael@0 | 561 | UPattern *pattern = &strsrch->pattern; |
michael@0 | 562 | int32_t cesize = pattern->CELength; |
michael@0 | 563 | |
michael@0 | 564 | int16_t minlength = cesize > expandlength |
michael@0 | 565 | ? (int16_t)cesize - expandlength : 1; |
michael@0 | 566 | pattern->defaultShiftSize = minlength; |
michael@0 | 567 | setShiftTable(pattern->shift, pattern->backShift, pattern->CE, |
michael@0 | 568 | cesize, expandlength, minlength, minlength); |
michael@0 | 569 | return; |
michael@0 | 570 | } |
michael@0 | 571 | strsrch->pattern.defaultShiftSize = 0; |
michael@0 | 572 | } |
michael@0 | 573 | |
michael@0 | 574 | #if BOYER_MOORE |
michael@0 | 575 | /** |
michael@0 | 576 | * Check to make sure that the match length is at the end of the character by |
michael@0 | 577 | * using the breakiterator. |
michael@0 | 578 | * @param strsrch string search data |
michael@0 | 579 | * @param start target text start offset |
michael@0 | 580 | * @param end target text end offset |
michael@0 | 581 | */ |
michael@0 | 582 | static |
michael@0 | 583 | void checkBreakBoundary(const UStringSearch *strsrch, int32_t * /*start*/, |
michael@0 | 584 | int32_t *end) |
michael@0 | 585 | { |
michael@0 | 586 | #if !UCONFIG_NO_BREAK_ITERATION |
michael@0 | 587 | UBreakIterator *breakiterator = strsrch->search->internalBreakIter; |
michael@0 | 588 | if (breakiterator) { |
michael@0 | 589 | int32_t matchend = *end; |
michael@0 | 590 | //int32_t matchstart = *start; |
michael@0 | 591 | |
michael@0 | 592 | if (!ubrk_isBoundary(breakiterator, matchend)) { |
michael@0 | 593 | *end = ubrk_following(breakiterator, matchend); |
michael@0 | 594 | } |
michael@0 | 595 | |
michael@0 | 596 | /* Check the start of the matched text to make sure it doesn't have any accents |
michael@0 | 597 | * before it. This code may not be necessary and so it is commented out */ |
michael@0 | 598 | /*if (!ubrk_isBoundary(breakiterator, matchstart) && !ubrk_isBoundary(breakiterator, matchstart-1)) { |
michael@0 | 599 | *start = ubrk_preceding(breakiterator, matchstart); |
michael@0 | 600 | }*/ |
michael@0 | 601 | } |
michael@0 | 602 | #endif |
michael@0 | 603 | } |
michael@0 | 604 | |
michael@0 | 605 | /** |
michael@0 | 606 | * Determines whether the target text bounded by the offsets start and end |
michael@0 | 607 | * sits on boundaries of the breakiterator in UStringSearch and its collation |
michael@0 | 608 | * elements line up with the pattern's. Returns TRUE if no breakiterator is set. |
michael@0 | 609 | * @param strsrch string search data |
michael@0 | 610 | * @param start target text start offset |
michael@0 | 611 | * @param end target text end offset |
michael@0 | 612 | */ |
michael@0 | 613 | static |
michael@0 | 614 | UBool isBreakUnit(const UStringSearch *strsrch, int32_t start, |
michael@0 | 615 | int32_t end) |
michael@0 | 616 | { |
michael@0 | 617 | #if !UCONFIG_NO_BREAK_ITERATION |
michael@0 | 618 | UBreakIterator *breakiterator = strsrch->search->breakIter; |
michael@0 | 619 | //TODO: Add here. |
michael@0 | 620 | if (breakiterator) { |
michael@0 | 621 | int32_t startindex = ubrk_first(breakiterator); |
michael@0 | 622 | int32_t endindex = ubrk_last(breakiterator); |
michael@0 | 623 | |
michael@0 | 624 | // offsets outside [startindex, endindex] are never boundary positions |
michael@0 | 625 | if (start < startindex || start > endindex || |
michael@0 | 626 | end < startindex || end > endindex) { |
michael@0 | 627 | return FALSE; |
michael@0 | 628 | } |
michael@0 | 629 | // otherwise, we can use following() on the position before the |
michael@0 | 630 | // specified one and return true if the position we get back is the |
michael@0 | 631 | // one the user specified |
michael@0 | 632 | UBool result = (start == startindex || |
michael@0 | 633 | ubrk_following(breakiterator, start - 1) == start) && |
michael@0 | 634 | (end == endindex || |
michael@0 | 635 | ubrk_following(breakiterator, end - 1) == end); |
michael@0 | 636 | if (result) { |
michael@0 | 637 | // compare the candidate's ces one by one against the pattern's ces |
michael@0 | 638 | UCollationElements *coleiter = strsrch->utilIter; |
michael@0 | 639 | const UChar *text = strsrch->search->text + |
michael@0 | 640 | start; |
michael@0 | 641 | UErrorCode status = U_ZERO_ERROR; |
michael@0 | 642 | ucol_setText(coleiter, text, end - start, &status); |
michael@0 | 643 | for (int32_t count = 0; count < strsrch->pattern.CELength; |
michael@0 | 644 | count ++) { |
michael@0 | 645 | int32_t ce = getCE(strsrch, ucol_next(coleiter, &status)); |
michael@0 | 646 | if (ce == UCOL_IGNORABLE) { |
michael@0 | 647 | count --; |
michael@0 | 648 | continue; |
michael@0 | 649 | } |
michael@0 | 650 | if (U_FAILURE(status) || ce != strsrch->pattern.CE[count]) { |
michael@0 | 651 | return FALSE; |
michael@0 | 652 | } |
michael@0 | 653 | } |
michael@0 | 654 | int32_t nextce = ucol_next(coleiter, &status); |
michael@0 | 655 | while (ucol_getOffset(coleiter) == (end - start) |
michael@0 | 656 | && getCE(strsrch, nextce) == UCOL_IGNORABLE) { |
michael@0 | 657 | nextce = ucol_next(coleiter, &status); |
michael@0 | 658 | } |
michael@0 | 659 | if (ucol_getOffset(coleiter) == (end - start) |
michael@0 | 660 | && nextce != UCOL_NULLORDER) { |
michael@0 | 661 | // extra non-ignorable collation elements at the end of the match |
michael@0 | 662 | return FALSE; |
michael@0 | 663 | } |
michael@0 | 664 | } |
michael@0 | 665 | return result; |
michael@0 | 666 | } |
michael@0 | 667 | #endif |
michael@0 | 668 | return TRUE; |
michael@0 | 669 | } |
michael@0 | 670 | |
michael@0 | 671 | /** |
michael@0 | 672 | * Gets the next base character offset if the character at the current offset |
michael@0 | 673 | * starts with an accent (its FCD value has bits set above the low byte), or |
michael@0 | 674 | * the current offset itself if that character starts with a base character. |
michael@0 | 675 | * @param text string |
michael@0 | 676 | * @param textoffset current offset |
michael@0 | 677 | * @param textlength length of text string |
michael@0 | 678 | * @return the offset of the next base character, textlength if the rest of |
michael@0 | 679 | * the text holds none, or textoffset if no skipping was needed. |
michael@0 | 680 | */ |
michael@0 | 681 | static |
michael@0 | 682 | inline int32_t getNextBaseOffset(const UChar *text, |
michael@0 | 683 | int32_t textoffset, |
michael@0 | 684 | int32_t textlength) |
michael@0 | 685 | { |
michael@0 | 686 | if (textoffset < textlength) { |
michael@0 | 687 | int32_t temp = textoffset; |
michael@0 | 688 | if (getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) { |
michael@0 | 689 | while (temp < textlength) { |
michael@0 | 690 | int32_t result = temp; |
michael@0 | 691 | if ((getFCD(text, &temp, textlength) >> |
michael@0 | 692 | SECOND_LAST_BYTE_SHIFT_) == 0) { |
michael@0 | 693 | return result; |
michael@0 | 694 | } |
michael@0 | 695 | } |
michael@0 | 696 | return textlength; |
michael@0 | 697 | } |
michael@0 | 698 | } |
michael@0 | 699 | return textoffset; |
michael@0 | 700 | } |
michael@0 | 701 | |
michael@0 | 702 | /** |
michael@0 | 703 | * Gets the next base character offset, but only when the pattern has suffix |
michael@0 | 704 | * accents and the character before textoffset has a non-zero low FCD byte. |
michael@0 | 705 | * @param strsrch string search data |
michael@0 | 706 | * @param textoffset current offset, one offset away from the last character |
michael@0 | 707 | * to search for. |
michael@0 | 708 | * @return start index of the next base character, or textoffset unchanged |
michael@0 | 709 | * when no adjustment applies. |
michael@0 | 710 | */ |
michael@0 | 711 | static |
michael@0 | 712 | inline int32_t getNextUStringSearchBaseOffset(UStringSearch *strsrch, |
michael@0 | 713 | int32_t textoffset) |
michael@0 | 714 | { |
michael@0 | 715 | int32_t textlength = strsrch->search->textLength; |
michael@0 | 716 | if (strsrch->pattern.hasSuffixAccents && |
michael@0 | 717 | textoffset < textlength) { |
michael@0 | 718 | int32_t temp = textoffset; |
michael@0 | 719 | const UChar *text = strsrch->search->text; |
michael@0 | 720 | U16_BACK_1(text, 0, temp); |
michael@0 | 721 | if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) { |
michael@0 | 722 | return getNextBaseOffset(text, textoffset, textlength); |
michael@0 | 723 | } |
michael@0 | 724 | } |
michael@0 | 725 | return textoffset; |
michael@0 | 726 | } |
michael@0 | 727 | |
michael@0 | 728 | /** |
michael@0 | 729 | * Shifts the text offset forward to prepare for a following match attempt, |
michael@0 | 730 | * using the pattern's shift table entry for the failing ce (or the default |
michael@0 | 731 | * shift size when ce is UCOL_NULLORDER), then moves to the next base offset. |
michael@0 | 732 | * Internal method, status assumed to be success. |
michael@0 | 733 | * @param strsrch string search data |
michael@0 | 734 | * @param textoffset start text position to do search |
michael@0 | 735 | * @param ce the text ce which failed the match. |
michael@0 | 736 | * @param patternceindex index of the ce within the pattern ce buffer which |
michael@0 | 737 | * failed the match |
michael@0 | 738 | * @return final offset |
michael@0 | 739 | */ |
michael@0 | 740 | static |
michael@0 | 741 | inline int32_t shiftForward(UStringSearch *strsrch, |
michael@0 | 742 | int32_t textoffset, |
michael@0 | 743 | int32_t ce, |
michael@0 | 744 | int32_t patternceindex) |
michael@0 | 745 | { |
michael@0 | 746 | UPattern *pattern = &(strsrch->pattern); |
michael@0 | 747 | if (ce != UCOL_NULLORDER) { |
michael@0 | 748 | int32_t shift = pattern->shift[hash(ce)]; |
michael@0 | 749 | // this is to adjust for characters in the middle of the |
michael@0 | 750 | // substring for matching that failed. |
michael@0 | 751 | int32_t adjust = pattern->CELength - patternceindex; |
michael@0 | 752 | if (adjust > 1 && shift >= adjust) { |
michael@0 | 753 | shift -= adjust - 1; |
michael@0 | 754 | } |
michael@0 | 755 | textoffset += shift; |
michael@0 | 756 | } |
michael@0 | 757 | else { |
michael@0 | 758 | textoffset += pattern->defaultShiftSize; |
michael@0 | 759 | } |
michael@0 | 760 | |
michael@0 | 761 | textoffset = getNextUStringSearchBaseOffset(strsrch, textoffset); |
michael@0 | 762 | // check for unsafe characters |
michael@0 | 763 | // * if it is the start or middle of a contraction: to be done after |
michael@0 | 764 | // an initial match is found |
michael@0 | 765 | // * thai or lao base consonant character: similar to contraction |
michael@0 | 766 | // * high surrogate character: similar to contraction |
michael@0 | 767 | // * next character is an accent: shift to the next base character |
michael@0 | 768 | return textoffset; |
michael@0 | 769 | } |
michael@0 | 770 | #endif // #if BOYER_MOORE |
michael@0 | 771 | |
michael@0 | 772 | /** |
michael@0 | 773 | * Resets the match result to "not found" and repositions the text iterator. |
michael@0 | 774 | * @param strsrch string search data |
michael@0 | 775 | */ |
michael@0 | 776 | static |
michael@0 | 777 | inline void setMatchNotFound(UStringSearch *strsrch) |
michael@0 | 778 | { |
michael@0 | 779 | // always reset, whatever the error status; iterator goes to textLength when searching forward, else 0 |
michael@0 | 780 | strsrch->search->matchedIndex = USEARCH_DONE; |
michael@0 | 781 | strsrch->search->matchedLength = 0; |
michael@0 | 782 | if (strsrch->search->isForwardSearching) { |
michael@0 | 783 | setColEIterOffset(strsrch->textIter, strsrch->search->textLength); |
michael@0 | 784 | } |
michael@0 | 785 | else { |
michael@0 | 786 | setColEIterOffset(strsrch->textIter, 0); |
michael@0 | 787 | } |
michael@0 | 788 | } |
michael@0 | 789 | |
michael@0 | 790 | #if BOYER_MOORE |
michael@0 | 791 | /** |
michael@0 | 792 | * Gets the offset to the next safe point in text. |
michael@0 | 793 | * i.e. not the middle of a contraction, swappable characters or supplementary |
michael@0 | 794 | * characters. Scans forward while ucol_unsafeCP() reports the code unit unsafe. |
michael@0 | 795 | * @param collator collation data |
michael@0 | 796 | * @param text string to work with |
michael@0 | 797 | * @param textoffset offset in string |
michael@0 | 798 | * @param textlength length of text string |
michael@0 | 799 | * @return offset to the next safe character, or textlength if none is found |
michael@0 | 800 | */ |
michael@0 | 801 | static |
michael@0 | 802 | inline int32_t getNextSafeOffset(const UCollator *collator, |
michael@0 | 803 | const UChar *text, |
michael@0 | 804 | int32_t textoffset, |
michael@0 | 805 | int32_t textlength) |
michael@0 | 806 | { |
michael@0 | 807 | int32_t result = textoffset; // first contraction character |
michael@0 | 808 | while (result != textlength && ucol_unsafeCP(text[result], collator)) { |
michael@0 | 809 | result ++; |
michael@0 | 810 | } |
michael@0 | 811 | return result; |
michael@0 | 812 | } |
michael@0 | 813 | |
michael@0 | 814 | /** |
michael@0 | 815 | * Checks for extra accents in a potential match that starts with a |
michael@0 | 816 | * composite character. |
michael@0 | 817 | * This is really painful... we have to check that composite characters do not |
michael@0 | 818 | * have any extra accents. We have to normalize the potential match and find |
michael@0 | 819 | * the immediate decomposed character before the match. |
michael@0 | 820 | * The first composite character would have been taken care of by the fcd |
michael@0 | 821 | * checks in checkForwardExactMatch. |
michael@0 | 822 | * This is the slow path after the fcd of the first character and |
michael@0 | 823 | * the last character has been checked by checkForwardExactMatch and we |
michael@0 | 824 | * determine that the potential match has extra non-ignorable preceding |
michael@0 | 825 | * ces. |
michael@0 | 826 | * E.g. looking for \u0301 acute in \u01FA A ring above and acute, |
michael@0 | 827 | * checkExtraMatchAccent should fail since there is a middle ring in \u01FA |
michael@0 | 828 | * Note here that accents checking are slow and cautioned in the API docs. |
michael@0 | 829 | * Internal method, status assumed to be a success, caller should check status |
michael@0 | 830 | * before calling this method |
michael@0 | 831 | * @param strsrch string search data |
michael@0 | 832 | * @param start index of the potential unfriendly composite character |
michael@0 | 833 | * @param end index of the potential unfriendly composite character |
michael@0 | 834 | * @param status output error status if any. |
michael@0 | 835 | * @return TRUE if there are non-ignorable accents at the beginning |
michael@0 | 836 | * of the match, FALSE otherwise (also FALSE whenever an error occurs). |
michael@0 | 837 | */ |
michael@0 | 838 | |
michael@0 | 839 | static |
michael@0 | 840 | UBool checkExtraMatchAccents(const UStringSearch *strsrch, int32_t start, |
michael@0 | 841 | int32_t end, |
michael@0 | 842 | UErrorCode *status) |
michael@0 | 843 | { |
michael@0 | 844 | UBool result = FALSE; |
michael@0 | 845 | if (strsrch->pattern.hasPrefixAccents) { |
michael@0 | 846 | int32_t length = end - start; |
michael@0 | 847 | int32_t offset = 0; |
michael@0 | 848 | const UChar *text = strsrch->search->text + start; |
michael@0 | 849 | |
michael@0 | 850 | U16_FWD_1(text, offset, length); |
michael@0 | 851 | // we are only concerned with the first composite character |
michael@0 | 852 | if (unorm_quickCheck(text, offset, UNORM_NFD, status) == UNORM_NO) { |
michael@0 | 853 | int32_t safeoffset = getNextSafeOffset(strsrch->collator, |
michael@0 | 854 | text, 0, length); |
michael@0 | 855 | if (safeoffset != length) { |
michael@0 | 856 | safeoffset ++; |
michael@0 | 857 | } |
michael@0 | 858 | UChar *norm = NULL; |
michael@0 | 859 | UChar buffer[INITIAL_ARRAY_SIZE_]; |
michael@0 | 860 | int32_t size = unorm_normalize(text, safeoffset, UNORM_NFD, 0, |
michael@0 | 861 | buffer, INITIAL_ARRAY_SIZE_, |
michael@0 | 862 | status); |
michael@0 | 863 | if (U_FAILURE(*status)) { |
michael@0 | 864 | return FALSE; |
michael@0 | 865 | } |
michael@0 | 866 | if (size >= INITIAL_ARRAY_SIZE_) { |
michael@0 | 867 | norm = (UChar *)allocateMemory((size + 1) * sizeof(UChar), |
michael@0 | 868 | status); |
michael@0 | 869 | // if allocation failed, norm stays NULL and status is set to |
michael@0 | 870 | // U_MEMORY_ALLOCATION_ERROR, which unorm_normalize checks for |
michael@0 | 871 | // internally; every failure is then caught right below. |
michael@0 | 872 | size = unorm_normalize(text, safeoffset, UNORM_NFD, 0, norm, |
michael@0 | 873 | size, status); |
michael@0 | 874 | if (U_FAILURE(*status)) { |
michael@0 | 875 | if (norm != NULL) { uprv_free(norm); } |
michael@0 | 876 | return FALSE; |
michael@0 | 877 | } |
michael@0 | 878 | } |
michael@0 | 879 | else { |
michael@0 | 880 | norm = buffer; |
michael@0 | 881 | } |
michael@0 | 882 | |
michael@0 | 883 | UCollationElements *coleiter = strsrch->utilIter; |
michael@0 | 884 | ucol_setText(coleiter, norm, size, status); |
michael@0 | 885 | uint32_t firstce = strsrch->pattern.CE[0]; |
michael@0 | 886 | UBool ignorable = TRUE; |
michael@0 | 887 | uint32_t ce = UCOL_IGNORABLE; |
michael@0 | 888 | while (U_SUCCESS(*status) && ce != firstce && ce != (uint32_t)UCOL_NULLORDER) { |
michael@0 | 889 | offset = ucol_getOffset(coleiter); |
michael@0 | 890 | if (ce != firstce && ce != UCOL_IGNORABLE) { |
michael@0 | 891 | ignorable = FALSE; |
michael@0 | 892 | } |
michael@0 | 893 | ce = ucol_next(coleiter, status); |
michael@0 | 894 | } |
michael@0 | 895 | UChar32 codepoint; |
michael@0 | 896 | U16_PREV(norm, 0, offset, codepoint); |
michael@0 | 897 | result = !ignorable && (u_getCombiningClass(codepoint) != 0); |
michael@0 | 898 | |
michael@0 | 899 | if (norm != buffer) { |
michael@0 | 900 | uprv_free(norm); |
michael@0 | 901 | } |
michael@0 | 902 | } |
michael@0 | 903 | } |
michael@0 | 904 | |
michael@0 | 905 | return result; |
michael@0 | 906 | } |
michael@0 | 907 | |
michael@0 | 908 | /** |
michael@0 | 909 | * Used by exact matches, checks if there are accents before the match. |
michael@0 | 910 | * This is really painful... we have to check that composite characters at |
michael@0 | 911 | * the start of the matches do not have any extra accents. |
michael@0 | 912 | * We check the FCD of the character first, if it starts with an accent and |
michael@0 | 913 | * the first pattern ce does not match the first ce of the character, we bail. |
michael@0 | 914 | * Otherwise we try normalizing the first composite |
michael@0 | 915 | * character and find the immediate decomposed character before the match to |
michael@0 | 916 | * see if it is a non-ignorable accent. |
michael@0 | 917 | * Now normalizing the first composite character is enough because we ensure |
michael@0 | 918 | * that when the match is passed in here with extra beginning ces, the |
michael@0 | 919 | * first or last ce that match has to occur within the first character. |
michael@0 | 920 | * E.g. looking for \u0301 acute in \u01FA A ring above and acute, |
michael@0 | 921 | * checkExtraMatchAccent should fail since there is a middle ring in \u01FA |
michael@0 | 922 | * Note here that accents checking are slow and cautioned in the API docs. |
michael@0 | 923 | * @param strsrch string search data |
michael@0 | 924 | * @param start offset |
michael@0 | 925 | * @param end offset |
michael@0 | 926 | * @return TRUE if there are accents before the match or an iterator error |
michael@0 | 927 | * occurs, FALSE otherwise (always FALSE without pattern prefix accents) |
michael@0 | 928 | */ |
michael@0 | 929 | static |
michael@0 | 930 | UBool hasAccentsBeforeMatch(const UStringSearch *strsrch, int32_t start, |
michael@0 | 931 | int32_t end) |
michael@0 | 932 | { |
michael@0 | 933 | if (strsrch->pattern.hasPrefixAccents) { |
michael@0 | 934 | UCollationElements *coleiter = strsrch->textIter; |
michael@0 | 935 | UErrorCode status = U_ZERO_ERROR; |
michael@0 | 936 | // stays TRUE while every ce seen before the first pattern ce is ignorable |
michael@0 | 937 | UBool ignorable = TRUE; |
michael@0 | 938 | int32_t firstce = strsrch->pattern.CE[0]; |
michael@0 | 939 | |
michael@0 | 940 | setColEIterOffset(coleiter, start); |
michael@0 | 941 | int32_t ce = getCE(strsrch, ucol_next(coleiter, &status)); |
michael@0 | 942 | if (U_FAILURE(status)) { |
michael@0 | 943 | return TRUE; |
michael@0 | 944 | } |
michael@0 | 945 | while (ce != firstce) { |
michael@0 | 946 | if (ce != UCOL_IGNORABLE) { |
michael@0 | 947 | ignorable = FALSE; |
michael@0 | 948 | } |
michael@0 | 949 | ce = getCE(strsrch, ucol_next(coleiter, &status)); |
michael@0 | 950 | if (U_FAILURE(status) || ce == UCOL_NULLORDER) { |
michael@0 | 951 | return TRUE; |
michael@0 | 952 | } |
michael@0 | 953 | } |
michael@0 | 954 | if (!ignorable && inNormBuf(coleiter)) { |
michael@0 | 955 | // within normalization buffer, discontiguous handled here |
michael@0 | 956 | return TRUE; |
michael@0 | 957 | } |
michael@0 | 958 | |
michael@0 | 959 | // within text |
michael@0 | 960 | int32_t temp = start; |
michael@0 | 961 | // original code |
michael@0 | 962 | // accent = (getFCD(strsrch->search->text, &temp, |
michael@0 | 963 | // strsrch->search->textLength) |
michael@0 | 964 | // >> SECOND_LAST_BYTE_SHIFT_); |
michael@0 | 965 | // however this code does not work well with VC7 .net in release mode. |
michael@0 | 966 | // maybe the inlines for getFCD combined with shifting has bugs in |
michael@0 | 967 | // VC7. anyway, comparing against 0xFF below is the workaround. |
michael@0 | 968 | UBool accent = getFCD(strsrch->search->text, &temp, |
michael@0 | 969 | strsrch->search->textLength) > 0xFF; |
michael@0 | 970 | if (!accent) { |
michael@0 | 971 | return checkExtraMatchAccents(strsrch, start, end, &status); |
michael@0 | 972 | } |
michael@0 | 973 | if (!ignorable) { |
michael@0 | 974 | return TRUE; |
michael@0 | 975 | } |
michael@0 | 976 | if (start > 0) { |
michael@0 | 977 | temp = start; |
michael@0 | 978 | U16_BACK_1(strsrch->search->text, 0, temp); |
michael@0 | 979 | if (getFCD(strsrch->search->text, &temp, |
michael@0 | 980 | strsrch->search->textLength) & LAST_BYTE_MASK_) { |
michael@0 | 981 | setColEIterOffset(coleiter, start); |
michael@0 | 982 | ce = ucol_previous(coleiter, &status); |
michael@0 | 983 | if (U_FAILURE(status) || |
michael@0 | 984 | (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE)) { |
michael@0 | 985 | return TRUE; |
michael@0 | 986 | } |
michael@0 | 987 | } |
michael@0 | 988 | } |
michael@0 | 989 | } |
michael@0 | 990 | |
michael@0 | 991 | return FALSE; |
michael@0 | 992 | } |
michael@0 | 993 | |
michael@0 | 994 | /** |
michael@0 | 995 | * Used by exact matches, checks if there are accents after the match. |
michael@0 | 996 | * Note this is the initial boundary check. If the potential match |
michael@0 | 997 | * starts or ends with composite characters, the accents in those |
michael@0 | 998 | * characters will be determined later. |
michael@0 | 999 | * Not doing backwards iteration here, since discontiguous contractions for |
michael@0 | 1000 | * the backwards collation element iterator use up too many characters. |
michael@0 | 1001 | * E.g. looking for \u030A ring in \u01FA A ring above and acute, |
michael@0 | 1002 | * should fail since there is an acute at the end of \u01FA |
michael@0 | 1003 | * Note here that accents checking are slow and cautioned in the API docs. |
michael@0 | 1004 | * @param strsrch string search data |
michael@0 | 1005 | * @param start offset of match |
michael@0 | 1006 | * @param end end offset of the match |
michael@0 | 1007 | * @return TRUE if there are accents after the match or an iterator error |
michael@0 | 1008 | * occurs, FALSE otherwise (always FALSE without pattern suffix accents) |
michael@0 | 1009 | */ |
michael@0 | 1010 | static |
michael@0 | 1011 | UBool hasAccentsAfterMatch(const UStringSearch *strsrch, int32_t start, |
michael@0 | 1012 | int32_t end) |
michael@0 | 1013 | { |
michael@0 | 1014 | if (strsrch->pattern.hasSuffixAccents) { |
michael@0 | 1015 | const UChar *text = strsrch->search->text; |
michael@0 | 1016 | int32_t temp = end; |
michael@0 | 1017 | int32_t textlength = strsrch->search->textLength; |
michael@0 | 1018 | U16_BACK_1(text, 0, temp); |
michael@0 | 1019 | if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) { |
michael@0 | 1020 | int32_t firstce = strsrch->pattern.CE[0]; |
michael@0 | 1021 | UCollationElements *coleiter = strsrch->textIter; |
michael@0 | 1022 | UErrorCode status = U_ZERO_ERROR; |
michael@0 | 1023 | int32_t ce; |
michael@0 | 1024 | setColEIterOffset(coleiter, start); |
michael@0 | 1025 | while ((ce = getCE(strsrch, ucol_next(coleiter, &status))) != firstce) { |
michael@0 | 1026 | if (U_FAILURE(status) || ce == UCOL_NULLORDER) { |
michael@0 | 1027 | return TRUE; |
michael@0 | 1028 | } |
michael@0 | 1029 | } |
michael@0 | 1030 | int32_t count = 1; |
michael@0 | 1031 | while (count < strsrch->pattern.CELength) { |
michael@0 | 1032 | if (getCE(strsrch, ucol_next(coleiter, &status)) |
michael@0 | 1033 | == UCOL_IGNORABLE) { |
michael@0 | 1034 | // Thai can give an ignorable here; do not count it. |
michael@0 | 1035 | count --; |
michael@0 | 1036 | } |
michael@0 | 1037 | if (U_FAILURE(status)) { |
michael@0 | 1038 | return TRUE; |
michael@0 | 1039 | } |
michael@0 | 1040 | count ++; |
michael@0 | 1041 | } |
michael@0 | 1042 | |
michael@0 | 1043 | ce = ucol_next(coleiter, &status); |
michael@0 | 1044 | if (U_FAILURE(status)) { |
michael@0 | 1045 | return TRUE; |
michael@0 | 1046 | } |
michael@0 | 1047 | if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) { |
michael@0 | 1048 | ce = getCE(strsrch, ce); |
michael@0 | 1049 | } |
michael@0 | 1050 | if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) { |
michael@0 | 1051 | if (ucol_getOffset(coleiter) <= end) { |
michael@0 | 1052 | return TRUE; |
michael@0 | 1053 | } |
michael@0 | 1054 | if (getFCD(text, &end, textlength) >> SECOND_LAST_BYTE_SHIFT_) { |
michael@0 | 1055 | return TRUE; |
michael@0 | 1056 | } |
michael@0 | 1057 | } |
michael@0 | 1058 | } |
michael@0 | 1059 | } |
michael@0 | 1060 | return FALSE; |
michael@0 | 1061 | } |
michael@0 | 1062 | #endif // #if BOYER_MOORE |
michael@0 | 1063 | |
michael@0 | 1064 | /** |
michael@0 | 1065 | * Checks if the offset runs out of the text string |
michael@0 | 1066 | * @param textlength length of the text string |
michael@0 | 1067 | * @param offset offset to test; an offset equal to textlength is in bounds |
michael@0 | 1068 | * @return TRUE if offset < 0 or offset > textlength, FALSE otherwise |
michael@0 | 1069 | */ |
michael@0 | 1070 | static |
michael@0 | 1071 | inline UBool isOutOfBounds(int32_t textlength, int32_t offset) |
michael@0 | 1072 | { |
michael@0 | 1073 | return offset < 0 || offset > textlength; |
michael@0 | 1074 | } |
michael@0 | 1075 | |
michael@0 | 1076 | /** |
michael@0 | 1077 | * Checks for identical match; only applies at UCOL_IDENTICAL strength |
michael@0 | 1078 | * @param strsrch string search data |
michael@0 | 1079 | * @param start offset of possible match |
michael@0 | 1080 | * @param end offset of possible match |
michael@0 | 1081 | * @return TRUE if strength is not identical or the NFD forms are equal |
michael@0 | 1082 | */ |
michael@0 | 1083 | static |
michael@0 | 1084 | inline UBool checkIdentical(const UStringSearch *strsrch, int32_t start, |
michael@0 | 1085 | int32_t end) |
michael@0 | 1086 | { |
michael@0 | 1087 | if (strsrch->strength != UCOL_IDENTICAL) { |
michael@0 | 1088 | return TRUE; |
michael@0 | 1089 | } |
michael@0 | 1090 | |
michael@0 | 1091 | // Note: We could use Normalizer::compare() or similar, but for short strings |
michael@0 | 1092 | // which may not be in FCD it might be faster to just NFD them. |
michael@0 | 1093 | UErrorCode status = U_ZERO_ERROR; |
michael@0 | 1094 | UnicodeString t2, p2; |
michael@0 | 1095 | strsrch->nfd->normalize( |
michael@0 | 1096 | UnicodeString(FALSE, strsrch->search->text + start, end - start), t2, status); |
michael@0 | 1097 | strsrch->nfd->normalize( |
michael@0 | 1098 | UnicodeString(FALSE, strsrch->pattern.text, strsrch->pattern.textLength), p2, status); |
michael@0 | 1099 | // return FALSE if NFD failed, otherwise compare the two normalized strings |
michael@0 | 1100 | return U_SUCCESS(status) && t2 == p2; |
michael@0 | 1101 | } |
michael@0 | 1102 | |
michael@0 | 1103 | #if BOYER_MOORE |
michael@0 | 1104 | /** |
michael@0 | 1105 | * Checks to see if the match repeats (or, without overlap, intersects) the last match |
michael@0 | 1106 | * @param strsrch string search data |
michael@0 | 1107 | * @param start new match start index |
michael@0 | 1108 | * @param end new match end index |
michael@0 | 1109 | * @return TRUE if the match is repeated, FALSE otherwise or if there is no last match |
michael@0 | 1110 | */ |
michael@0 | 1111 | static |
michael@0 | 1112 | inline UBool checkRepeatedMatch(UStringSearch *strsrch, |
michael@0 | 1113 | int32_t start, |
michael@0 | 1114 | int32_t end) |
michael@0 | 1115 | { |
michael@0 | 1116 | int32_t lastmatchindex = strsrch->search->matchedIndex; |
michael@0 | 1117 | UBool result; |
michael@0 | 1118 | if (lastmatchindex == USEARCH_DONE) { |
michael@0 | 1119 | return FALSE; |
michael@0 | 1120 | } |
michael@0 | 1121 | if (strsrch->search->isForwardSearching) { |
michael@0 | 1122 | result = start <= lastmatchindex; |
michael@0 | 1123 | } |
michael@0 | 1124 | else { |
michael@0 | 1125 | result = start >= lastmatchindex; |
michael@0 | 1126 | } |
michael@0 | 1127 | if (!result && !strsrch->search->isOverlap) { |
michael@0 | 1128 | if (strsrch->search->isForwardSearching) { |
michael@0 | 1129 | result = start < lastmatchindex + strsrch->search->matchedLength; |
michael@0 | 1130 | } |
michael@0 | 1131 | else { |
michael@0 | 1132 | result = end > lastmatchindex; |
michael@0 | 1133 | } |
michael@0 | 1134 | } |
michael@0 | 1135 | return result; |
michael@0 | 1136 | } |
michael@0 | 1137 | |
michael@0 | 1138 | /** |
michael@0 | 1139 | * Gets the collation element iterator's current offset. |
michael@0 | 1140 | * @param coleiter collation element iterator |
michael@0 | 1141 | * @param forwards flag TRUE if we are moving in the forwards direction |
michael@0 | 1142 | * @return current offset |
michael@0 | 1143 | */ |
michael@0 | 1144 | static |
michael@0 | 1145 | inline int32_t getColElemIterOffset(const UCollationElements *coleiter, |
michael@0 | 1146 | UBool forwards) |
michael@0 | 1147 | { |
michael@0 | 1148 | int32_t result = ucol_getOffset(coleiter); |
michael@0 | 1149 | // backwards-iterator adjustment; the leading FALSE keeps it disabled |
michael@0 | 1150 | if (FALSE && !forwards && inNormBuf(coleiter) && !isFCDPointerNull(coleiter)) { |
michael@0 | 1151 | result ++; |
michael@0 | 1152 | } |
michael@0 | 1153 | return result; |
michael@0 | 1154 | } |
michael@0 | 1155 | |
michael@0 | 1156 | /** |
michael@0 | 1157 | * Checks match for contraction. |
michael@0 | 1158 | * If the match ends with a partial contraction we fail. |
michael@0 | 1159 | * If the match starts too far off (because of backwards iteration) we try to |
michael@0 | 1160 | * chip off the extra characters depending on whether a breakiterator has |
michael@0 | 1161 | * been used. |
michael@0 | 1162 | * Internal method, error assumed to be success, caller has to check status |
michael@0 | 1163 | * before calling this method. |
michael@0 | 1164 | * @param strsrch string search data |
michael@0 | 1165 | * @param start offset of potential match, to be modified if necessary |
michael@0 | 1166 | * @param end offset of potential match, to be modified if necessary |
michael@0 | 1167 | * @param status output error status if any |
michael@0 | 1168 | * @return TRUE if match passes the contraction test, FALSE otherwise |
michael@0 | 1169 | */ |
michael@0 | 1170 | |
michael@0 | 1171 | static |
michael@0 | 1172 | UBool checkNextExactContractionMatch(UStringSearch *strsrch, |
michael@0 | 1173 | int32_t *start, |
michael@0 | 1174 | int32_t *end, UErrorCode *status) |
michael@0 | 1175 | { |
michael@0 | 1176 | UCollationElements *coleiter = strsrch->textIter; |
michael@0 | 1177 | int32_t textlength = strsrch->search->textLength; |
michael@0 | 1178 | int32_t temp = *start; |
michael@0 | 1179 | const UCollator *collator = strsrch->collator; |
michael@0 | 1180 | const UChar *text = strsrch->search->text; |
michael@0 | 1181 | // This part checks if either end of the match contains a potential |
michael@0 | 1182 | // contraction. If so we'll have to iterate through them |
michael@0 | 1183 | // The start contraction needs to be checked since ucol_previous dumps |
michael@0 | 1184 | // all characters till the first safe character into the buffer. |
michael@0 | 1185 | // *start + 1 is used to test for the unsafe characters instead of *start |
michael@0 | 1186 | // because ucol_prev takes all unsafe characters till the first safe |
michael@0 | 1187 | // character ie *start. so by testing *start + 1, we can estimate if |
michael@0 | 1188 | // excess prefix characters have been included in the potential search |
michael@0 | 1189 | // results. |
michael@0 | 1190 | if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) || |
michael@0 | 1191 | (*start + 1 < textlength |
michael@0 | 1192 | && ucol_unsafeCP(text[*start + 1], collator))) { |
michael@0 | 1193 | int32_t expansion = getExpansionPrefix(coleiter); |
michael@0 | 1194 | UBool expandflag = expansion > 0; |
michael@0 | 1195 | setColEIterOffset(coleiter, *start); |
michael@0 | 1196 | while (expansion > 0) { |
michael@0 | 1197 | // getting rid of the redundant ce, caused by setOffset. |
michael@0 | 1198 | // since backward contraction/expansion may have extra ces if we |
michael@0 | 1199 | // are in the normalization buffer, hasAccentsBeforeMatch would |
michael@0 | 1200 | // have taken care of it. |
michael@0 | 1201 | // E.g. the character \u01FA will have an expansion of 3, but if |
michael@0 | 1202 | // we are only looking for acute and ring \u030A and \u0301, we'll |
michael@0 | 1203 | // have to skip the first ce in the expansion buffer. |
michael@0 | 1204 | ucol_next(coleiter, status); |
michael@0 | 1205 | if (U_FAILURE(*status)) { |
michael@0 | 1206 | return FALSE; |
michael@0 | 1207 | } |
michael@0 | 1208 | if (ucol_getOffset(coleiter) != temp) { |
michael@0 | 1209 | *start = temp; |
michael@0 | 1210 | temp = ucol_getOffset(coleiter); |
michael@0 | 1211 | } |
michael@0 | 1212 | expansion --; |
michael@0 | 1213 | } |
michael@0 | 1214 | |
michael@0 | 1215 | int32_t *patternce = strsrch->pattern.CE; |
michael@0 | 1216 | int32_t patterncelength = strsrch->pattern.CELength; |
michael@0 | 1217 | int32_t count = 0; |
michael@0 | 1218 | while (count < patterncelength) { |
michael@0 | 1219 | int32_t ce = getCE(strsrch, ucol_next(coleiter, status)); |
michael@0 | 1220 | if (ce == UCOL_IGNORABLE) { |
michael@0 | 1221 | continue; |
michael@0 | 1222 | } |
michael@0 | 1223 | if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) { |
michael@0 | 1224 | *start = temp; |
michael@0 | 1225 | temp = ucol_getOffset(coleiter); |
michael@0 | 1226 | } |
michael@0 | 1227 | if (U_FAILURE(*status) || ce != patternce[count]) { |
michael@0 | 1228 | (*end) ++; |
michael@0 | 1229 | *end = getNextUStringSearchBaseOffset(strsrch, *end); |
michael@0 | 1230 | return FALSE; |
michael@0 | 1231 | } |
michael@0 | 1232 | count ++; |
michael@0 | 1233 | } |
michael@0 | 1234 | } |
michael@0 | 1235 | return TRUE; |
michael@0 | 1236 | } |
michael@0 | 1237 | |
michael@0 | 1238 | /** |
michael@0 | 1239 | * Checks and sets the match information if found. |
michael@0 | 1240 | * Checks |
michael@0 | 1241 | * <ul> |
michael@0 | 1242 | * <li> the potential match does not repeat the previous match |
michael@0 | 1243 | * <li> boundaries are correct |
michael@0 | 1244 | * <li> exact matches have no extra accents |
michael@0 | 1245 | * <li> identical matches |
michael@0 | 1246 | * <li> potential match does not end in the middle of a contraction |
michael@0 | 1247 | * </ul> |
michael@0 | 1248 | * Otherwise the offset will be shifted to the next character. |
michael@0 | 1249 | * Internal method, status assumed to be success, caller has to check status |
michael@0 | 1250 | * before calling this method. |
michael@0 | 1251 | * @param strsrch string search data |
michael@0 | 1252 | * @param textoffset offset in the collation element text. the returned value |
michael@0 | 1253 | * will be the truncated end offset of the match or the new start |
michael@0 | 1254 | * search offset. |
michael@0 | 1255 | * @param status output error status if any |
michael@0 | 1256 | * @return TRUE if the match is valid, FALSE otherwise |
michael@0 | 1257 | */ |
michael@0 | 1258 | static |
michael@0 | 1259 | inline UBool checkNextExactMatch(UStringSearch *strsrch, |
michael@0 | 1260 | int32_t *textoffset, UErrorCode *status) |
michael@0 | 1261 | { |
michael@0 | 1262 | UCollationElements *coleiter = strsrch->textIter; |
michael@0 | 1263 | int32_t start = getColElemIterOffset(coleiter, FALSE); |
michael@0 | 1264 | |
michael@0 | 1265 | if (!checkNextExactContractionMatch(strsrch, &start, textoffset, status)) { |
michael@0 | 1266 | return FALSE; |
michael@0 | 1267 | } |
michael@0 | 1268 | |
michael@0 | 1269 | // the ces match; reject the candidate if any remaining check fails |
michael@0 | 1270 | if (!isBreakUnit(strsrch, start, *textoffset) || |
michael@0 | 1271 | checkRepeatedMatch(strsrch, start, *textoffset) || |
michael@0 | 1272 | hasAccentsBeforeMatch(strsrch, start, *textoffset) || |
michael@0 | 1273 | !checkIdentical(strsrch, start, *textoffset) || |
michael@0 | 1274 | hasAccentsAfterMatch(strsrch, start, *textoffset)) { |
michael@0 | 1275 | |
michael@0 | 1276 | (*textoffset) ++; |
michael@0 | 1277 | *textoffset = getNextUStringSearchBaseOffset(strsrch, *textoffset); |
michael@0 | 1278 | return FALSE; |
michael@0 | 1279 | } |
michael@0 | 1280 | |
michael@0 | 1281 | // Breakiterator boundary check for primary strength search without a user breakiterator. |
michael@0 | 1282 | if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) { |
michael@0 | 1283 | checkBreakBoundary(strsrch, &start, textoffset); |
michael@0 | 1284 | } |
michael@0 | 1285 | |
michael@0 | 1286 | // totally match, we will get rid of the ending ignorables. |
michael@0 | 1287 | strsrch->search->matchedIndex = start; |
michael@0 | 1288 | strsrch->search->matchedLength = *textoffset - start; |
michael@0 | 1289 | return TRUE; |
michael@0 | 1290 | } |
michael@0 | 1291 | |
michael@0 | 1292 | /** |
michael@0 | 1293 | * Gets the previous base character offset, or the current offset if the |
michael@0 | 1294 | * current character is a base character; steps back one code point at a time |
michael@0 | 1295 | * @param text string |
michael@0 | 1296 | * @param textoffset one offset after the current character; returned as is if <= 0 |
michael@0 | 1297 | * @return the offset of the next character after the base character or the first |
michael@0 | 1298 | * composed character with accents |
michael@0 | 1299 | */ |
michael@0 | 1300 | static |
michael@0 | 1301 | inline int32_t getPreviousBaseOffset(const UChar *text, |
michael@0 | 1302 | int32_t textoffset) |
michael@0 | 1303 | { |
michael@0 | 1304 | if (textoffset > 0) { |
michael@0 | 1305 | for (;;) { |
michael@0 | 1306 | int32_t result = textoffset; |
michael@0 | 1307 | U16_BACK_1(text, 0, textoffset); |
michael@0 | 1308 | int32_t temp = textoffset; |
michael@0 | 1309 | uint16_t fcd = getFCD(text, &temp, result); |
michael@0 | 1310 | if ((fcd >> SECOND_LAST_BYTE_SHIFT_) == 0) { |
michael@0 | 1311 | if (fcd & LAST_BYTE_MASK_) { |
michael@0 | 1312 | return textoffset; |
michael@0 | 1313 | } |
michael@0 | 1314 | return result; |
michael@0 | 1315 | } |
michael@0 | 1316 | if (textoffset == 0) { |
michael@0 | 1317 | return 0; |
michael@0 | 1318 | } |
michael@0 | 1319 | } |
michael@0 | 1320 | } |
michael@0 | 1321 | return textoffset; |
michael@0 | 1322 | } |
michael@0 | 1323 | |
michael@0 | 1324 | /** |
michael@0 | 1325 | * Records the index of every code point in the argument accent array whose |
michael@0 | 1326 | * combining class differs from the preceding one (class 0 before the first). |
michael@0 | 1327 | * @param accents array of accents in nfd terminated by a 0. |
michael@0 | 1328 | * @param accentsindex output: recorded indexes, followed by the array length |
michael@0 | 1329 | * @return number of recorded indexes, not counting the trailing length entry |
michael@0 | 1329 | */ |
michael@0 | 1330 | static |
michael@0 | 1331 | inline int getUnblockedAccentIndex(UChar *accents, int32_t *accentsindex) |
michael@0 | 1332 | { |
michael@0 | 1333 | int32_t index = 0; |
michael@0 | 1334 | int32_t length = u_strlen(accents); |
michael@0 | 1335 | UChar32 codepoint = 0; |
michael@0 | 1336 | int cclass = 0; |
michael@0 | 1337 | int result = 0; |
michael@0 | 1338 | int32_t temp; |
michael@0 | 1339 | while (index < length) { |
michael@0 | 1340 | temp = index; |
michael@0 | 1341 | U16_NEXT(accents, index, length, codepoint); |
michael@0 | 1342 | if (u_getCombiningClass(codepoint) != cclass) { |
michael@0 | 1343 | cclass = u_getCombiningClass(codepoint); |
michael@0 | 1344 | accentsindex[result] = temp; |
michael@0 | 1345 | result ++; |
michael@0 | 1346 | } |
michael@0 | 1347 | } |
michael@0 | 1348 | accentsindex[result] = length; |
michael@0 | 1349 | return result; |
michael@0 | 1350 | } |
michael@0 | 1351 | |
michael@0 | 1352 | /** |
michael@0 | 1353 | * Concatenates 3 UChar arrays into a destination array. |
michael@0 | 1354 | * Allocates a new array if destination is too small; the caller will have to |
michael@0 | 1355 | * deallocate that array. No terminating zero is written to the result. |
michael@0 | 1356 | * Internal method, status assumed to be success, caller has to check status |
michael@0 | 1357 | * before calling this method. destination not to be NULL and has at least |
michael@0 | 1358 | * size destinationlength. |
michael@0 | 1359 | * @param destination target array |
michael@0 | 1360 | * @param destinationlength target array size, returning the appended length |
michael@0 | 1361 | * @param source1 null-terminated first array, may be NULL |
michael@0 | 1362 | * @param source2 second array |
michael@0 | 1363 | * @param source2length length of second array |
michael@0 | 1364 | * @param source3 null-terminated third array, may be NULL |
michael@0 | 1365 | * @param status error status if any |
michael@0 | 1366 | * @return new destination array, destination if there was no new allocation |
michael@0 | 1367 | */ |
michael@0 | 1368 | static |
michael@0 | 1369 | inline UChar * addToUCharArray( UChar *destination, |
michael@0 | 1370 | int32_t *destinationlength, |
michael@0 | 1371 | const UChar *source1, |
michael@0 | 1372 | const UChar *source2, |
michael@0 | 1373 | int32_t source2length, |
michael@0 | 1374 | const UChar *source3, |
michael@0 | 1375 | UErrorCode *status) |
michael@0 | 1376 | { |
michael@0 | 1377 | int32_t source1length = source1 ? u_strlen(source1) : 0; |
michael@0 | 1378 | int32_t source3length = source3 ? u_strlen(source3) : 0; |
michael@0 | 1379 | if (*destinationlength < source1length + source2length + source3length + |
michael@0 | 1380 | 1) |
michael@0 | 1381 | { |
michael@0 | 1382 | destination = (UChar *)allocateMemory( |
michael@0 | 1383 | (source1length + source2length + source3length + 1) * sizeof(UChar), |
michael@0 | 1384 | status); |
michael@0 | 1385 | // if error allocating memory, status will be |
michael@0 | 1386 | // U_MEMORY_ALLOCATION_ERROR; NULL is returned and the length set to 0 |
michael@0 | 1387 | if (U_FAILURE(*status)) { |
michael@0 | 1388 | *destinationlength = 0; |
michael@0 | 1389 | return NULL; |
michael@0 | 1390 | } |
michael@0 | 1391 | } |
michael@0 | 1392 | if (source1length != 0) { |
michael@0 | 1393 | uprv_memcpy(destination, source1, sizeof(UChar) * source1length); |
michael@0 | 1394 | } |
michael@0 | 1395 | if (source2length != 0) { |
michael@0 | 1396 | uprv_memcpy(destination + source1length, source2, |
michael@0 | 1397 | sizeof(UChar) * source2length); |
michael@0 | 1398 | } |
michael@0 | 1399 | if (source3length != 0) { |
michael@0 | 1400 | uprv_memcpy(destination + source1length + source2length, source3, |
michael@0 | 1401 | sizeof(UChar) * source3length); |
michael@0 | 1402 | } |
michael@0 | 1403 | *destinationlength = source1length + source2length + source3length; |
michael@0 | 1404 | return destination; |
michael@0 | 1405 | } |
michael@0 | 1406 | |
michael@0 | 1407 | /** |
michael@0 | 1408 | * Runs through a collation element iterator to see if its contents match the |
michael@0 | 1409 | * pattern in string search data; ignorable elements are skipped |
michael@0 | 1410 | * @param strsrch string search data |
michael@0 | 1411 | * @param coleiter collation element iterator |
michael@0 | 1412 | * @return TRUE if a match is found, FALSE otherwise |
michael@0 | 1413 | */ |
michael@0 | 1414 | static |
michael@0 | 1415 | inline UBool checkCollationMatch(const UStringSearch *strsrch, |
michael@0 | 1416 | UCollationElements *coleiter) |
michael@0 | 1417 | { |
michael@0 | 1418 | int patternceindex = strsrch->pattern.CELength; |
michael@0 | 1419 | int32_t *patternce = strsrch->pattern.CE; |
michael@0 | 1420 | UErrorCode status = U_ZERO_ERROR; |
michael@0 | 1421 | while (patternceindex > 0) { |
michael@0 | 1422 | int32_t ce = getCE(strsrch, ucol_next(coleiter, &status)); |
michael@0 | 1423 | if (ce == UCOL_IGNORABLE) { |
michael@0 | 1424 | continue; |
michael@0 | 1425 | } |
michael@0 | 1426 | if (U_FAILURE(status) || ce != *patternce) { |
michael@0 | 1427 | return FALSE; |
michael@0 | 1428 | } |
michael@0 | 1429 | patternce ++; |
michael@0 | 1430 | patternceindex --; |
michael@0 | 1431 | } |
michael@0 | 1432 | return TRUE; |
michael@0 | 1433 | } |
michael@0 | 1434 | |
michael@0 | 1435 | /** |
michael@0 | 1436 | * Rearranges the front accents to try matching. |
michael@0 | 1437 | * Prefix accents in the text will be grouped according to their combining |
michael@0 | 1438 | * class and the groups will be mixed and matched to try find the perfect |
michael@0 | 1439 | * match with the pattern. |
michael@0 | 1440 | * So for instance looking for "\u0301" in "\u030A\u0301\u0325" |
michael@0 | 1441 | * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings |
michael@0 | 1442 | * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325", |
michael@0 | 1443 | * "\u0301\u0325". |
michael@0 | 1444 | * step 2: check if any of the generated substrings matches the pattern. |
michael@0 | 1445 | * Internal method, status is assumed to be success, caller has to check status |
michael@0 | 1446 | * before calling this method. |
michael@0 | 1447 | * @param strsrch string search match |
michael@0 | 1448 | * @param start first offset of the accents to start searching |
michael@0 | 1449 | * @param end start of the last accent set |
michael@0 | 1450 | * @param status output error status if any |
michael@0 | 1451 | * @return USEARCH_DONE if a match is not found, otherwise return the starting |
michael@0 | 1452 | * offset of the match. Note this start includes all preceding accents. |
michael@0 | 1453 | */ |
michael@0 | 1454 | static |
michael@0 | 1455 | int32_t doNextCanonicalPrefixMatch(UStringSearch *strsrch, |
michael@0 | 1456 | int32_t start, |
michael@0 | 1457 | int32_t end, |
michael@0 | 1458 | UErrorCode *status) |
michael@0 | 1459 | { |
michael@0 | 1460 | const UChar *text = strsrch->search->text; |
michael@0 | 1461 | int32_t textlength = strsrch->search->textLength; |
michael@0 | 1462 | int32_t tempstart = start; |
michael@0 | 1463 | |
michael@0 | 1464 | if ((getFCD(text, &tempstart, textlength) & LAST_BYTE_MASK_) == 0) { |
michael@0 | 1465 | // die... failed at a base character |
michael@0 | 1466 | return USEARCH_DONE; |
michael@0 | 1467 | } |
michael@0 | 1468 | |
michael@0 | 1469 | int32_t offset = getNextBaseOffset(text, tempstart, textlength); |
michael@0 | 1470 | start = getPreviousBaseOffset(text, tempstart); |
michael@0 | 1471 | |
michael@0 | 1472 | UChar accents[INITIAL_ARRAY_SIZE_]; |
michael@0 | 1473 | // normalizing the offensive string |
michael@0 | 1474 | unorm_normalize(text + start, offset - start, UNORM_NFD, 0, accents, |
michael@0 | 1475 | INITIAL_ARRAY_SIZE_, status); |
michael@0 | 1476 | if (U_FAILURE(*status)) { |
michael@0 | 1477 | return USEARCH_DONE; |
michael@0 | 1478 | } |
michael@0 | 1479 | |
michael@0 | 1480 | int32_t accentsindex[INITIAL_ARRAY_SIZE_]; |
michael@0 | 1481 | int32_t accentsize = getUnblockedAccentIndex(accents, |
michael@0 | 1482 | accentsindex); |
michael@0 | 1483 | int32_t count = (2 << (accentsize - 1)) - 1; |
michael@0 | 1484 | UChar buffer[INITIAL_ARRAY_SIZE_]; |
michael@0 | 1485 | UCollationElements *coleiter = strsrch->utilIter; |
michael@0 | 1486 | while (U_SUCCESS(*status) && count > 0) { |
michael@0 | 1487 | UChar *rearrange = strsrch->canonicalPrefixAccents; |
michael@0 | 1488 | // copy the base characters |
michael@0 | 1489 | for (int k = 0; k < accentsindex[0]; k ++) { |
michael@0 | 1490 | *rearrange ++ = accents[k]; |
michael@0 | 1491 | } |
michael@0 | 1492 | // forming all possible canonical rearrangement by dropping |
michael@0 | 1493 | // sets of accents |
michael@0 | 1494 | for (int i = 0; i <= accentsize - 1; i ++) { |
michael@0 | 1495 | int32_t mask = 1 << (accentsize - i - 1); |
michael@0 | 1496 | if (count & mask) { |
michael@0 | 1497 | for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) { |
michael@0 | 1498 | *rearrange ++ = accents[j]; |
michael@0 | 1499 | } |
michael@0 | 1500 | } |
michael@0 | 1501 | } |
michael@0 | 1502 | *rearrange = 0; |
michael@0 | 1503 | int32_t matchsize = INITIAL_ARRAY_SIZE_; |
michael@0 | 1504 | UChar *match = addToUCharArray(buffer, &matchsize, |
michael@0 | 1505 | strsrch->canonicalPrefixAccents, |
michael@0 | 1506 | strsrch->search->text + offset, |
michael@0 | 1507 | end - offset, |
michael@0 | 1508 | strsrch->canonicalSuffixAccents, |
michael@0 | 1509 | status); |
michael@0 | 1510 | |
michael@0 | 1511 | // if status is a failure, ucol_setText does nothing. |
michael@0 | 1512 | // run the collator iterator through this match |
michael@0 | 1513 | ucol_setText(coleiter, match, matchsize, status); |
michael@0 | 1514 | if (U_SUCCESS(*status)) { |
michael@0 | 1515 | if (checkCollationMatch(strsrch, coleiter)) { |
michael@0 | 1516 | if (match != buffer) { |
michael@0 | 1517 | uprv_free(match); |
michael@0 | 1518 | } |
michael@0 | 1519 | return start; |
michael@0 | 1520 | } |
michael@0 | 1521 | } |
michael@0 | 1522 | count --; |
michael@0 | 1523 | } |
michael@0 | 1524 | return USEARCH_DONE; |
michael@0 | 1525 | } |
michael@0 | 1526 | |
michael@0 | 1527 | /** |
michael@0 | 1528 | * Gets the offset to the safe point in text before textoffset. |
michael@0 | 1529 | * ie. not the middle of a contraction, swappable characters or supplementary |
michael@0 | 1530 | * characters. |
michael@0 | 1531 | * @param collator collation data |
michael@0 | 1532 | * @param text string to work with |
michael@0 | 1533 | * @param textoffset offset in string |
michael@0 | 1534 | * @return offset to the previous safe character, 0 if the start of text is |
michael@0 | 1535 | * reached |
michael@0 | 1536 | */ |
michael@0 | 1537 | static |
michael@0 | 1538 | inline uint32_t getPreviousSafeOffset(const UCollator *collator, |
michael@0 | 1539 | const UChar *text, |
michael@0 | 1540 | int32_t textoffset) |
michael@0 | 1541 | { |
michael@0 | 1542 | int32_t result = textoffset; // first contraction character |
michael@0 | 1543 | while (result != 0 && ucol_unsafeCP(text[result - 1], collator)) { |
michael@0 | 1544 | result --; |
michael@0 | 1545 | } |
michael@0 | 1546 | if (result != 0) { |
michael@0 | 1547 | // the first contraction character is considered unsafe here |
michael@0 | 1548 | result --; |
michael@0 | 1549 | } |
michael@0 | 1550 | return result; |
michael@0 | 1551 | } |
michael@0 | 1552 | |
michael@0 | 1553 | /** |
michael@0 | 1554 | * Cleaning up after we passed the safe zone: frees safetext unless it is |
michael@0 | 1555 | * safebuffer or the canonicalSuffixAccents array of strsrch |
michael@0 | 1556 | * @param strsrch string search data |
michael@0 | 1557 | * @param safetext safe text array |
michael@0 | 1558 | * @param safebuffer safe text buffer |
michael@0 | 1559 | */ |
michael@0 | 1560 | static |
michael@0 | 1561 | inline void cleanUpSafeText(const UStringSearch *strsrch, UChar *safetext, |
michael@0 | 1562 | UChar *safebuffer) |
michael@0 | 1563 | { |
michael@0 | 1564 | if (safetext != safebuffer && safetext != strsrch->canonicalSuffixAccents) |
michael@0 | 1565 | { |
michael@0 | 1566 | uprv_free(safetext); |
michael@0 | 1567 | } |
michael@0 | 1568 | } |
michael@0 | 1569 | |
michael@0 | 1570 | /** |
michael@0 | 1571 | * Takes the rearranged end accents and tries matching. If match failed at |
michael@0 | 1572 | * a separate preceding set of accents (separated from the rearranged one by |
michael@0 | 1573 | * at least a base character) then we rearrange the preceding accents and |
michael@0 | 1574 | * try matching again. |
michael@0 | 1575 | * We allow skipping of the ends of the accent set if the ces do not match. |
michael@0 | 1576 | * However if the failure is found before the accent set, it fails. |
michael@0 | 1577 | * Internal method, status assumed to be success, caller has to check status |
michael@0 | 1578 | * before calling this method. |
michael@0 | 1579 | * @param strsrch string search data |
michael@0 | 1580 | * @param textoffset of the start of the rearranged accent |
michael@0 | 1581 | * @param status output error status if any |
michael@0 | 1582 | * @return USEARCH_DONE if a match is not found, otherwise return the starting |
michael@0 | 1583 | * offset of the match. Note this start includes all preceding accents. |
michael@0 | 1584 | */ |
michael@0 | 1585 | static |
michael@0 | 1586 | int32_t doNextCanonicalSuffixMatch(UStringSearch *strsrch, |
michael@0 | 1587 | int32_t textoffset, |
michael@0 | 1588 | UErrorCode *status) |
michael@0 | 1589 | { |
michael@0 | 1590 | const UChar *text = strsrch->search->text; |
michael@0 | 1591 | const UCollator *collator = strsrch->collator; |
michael@0 | 1592 | int32_t safelength = 0; |
michael@0 | 1593 | UChar *safetext; |
michael@0 | 1594 | int32_t safetextlength; |
michael@0 | 1595 | UChar safebuffer[INITIAL_ARRAY_SIZE_]; |
michael@0 | 1596 | UCollationElements *coleiter = strsrch->utilIter; |
michael@0 | 1597 | int32_t safeoffset = textoffset; |
michael@0 | 1598 | |
michael@0 | 1599 | if (textoffset != 0 && ucol_unsafeCP(strsrch->canonicalSuffixAccents[0], |
michael@0 | 1600 | collator)) { |
michael@0 | 1601 | safeoffset = getPreviousSafeOffset(collator, text, textoffset); |
michael@0 | 1602 | safelength = textoffset - safeoffset; |
michael@0 | 1603 | safetextlength = INITIAL_ARRAY_SIZE_; |
michael@0 | 1604 | safetext = addToUCharArray(safebuffer, &safetextlength, NULL, |
michael@0 | 1605 | text + safeoffset, safelength, |
michael@0 | 1606 | strsrch->canonicalSuffixAccents, |
michael@0 | 1607 | status); |
michael@0 | 1608 | } |
michael@0 | 1609 | else { |
michael@0 | 1610 | safetextlength = u_strlen(strsrch->canonicalSuffixAccents); |
michael@0 | 1611 | safetext = strsrch->canonicalSuffixAccents; |
michael@0 | 1612 | } |
michael@0 | 1613 | |
michael@0 | 1614 | // if status is a failure, ucol_setText does nothing |
michael@0 | 1615 | ucol_setText(coleiter, safetext, safetextlength, status); |
michael@0 | 1616 | // status checked in loop below |
michael@0 | 1617 | |
michael@0 | 1618 | int32_t *ce = strsrch->pattern.CE; |
michael@0 | 1619 | int32_t celength = strsrch->pattern.CELength; |
michael@0 | 1620 | int ceindex = celength - 1; |
michael@0 | 1621 | UBool isSafe = TRUE; // indication flag for position in safe zone |
michael@0 | 1622 | |
michael@0 | 1623 | while (ceindex >= 0) { |
michael@0 | 1624 | int32_t textce = ucol_previous(coleiter, status); |
michael@0 | 1625 | if (U_FAILURE(*status)) { |
michael@0 | 1626 | if (isSafe) { |
michael@0 | 1627 | cleanUpSafeText(strsrch, safetext, safebuffer); |
michael@0 | 1628 | } |
michael@0 | 1629 | return USEARCH_DONE; |
michael@0 | 1630 | } |
michael@0 | 1631 | if (textce == UCOL_NULLORDER) { |
michael@0 | 1632 | // check if we have passed the safe buffer |
michael@0 | 1633 | if (coleiter == strsrch->textIter) { |
michael@0 | 1634 | cleanUpSafeText(strsrch, safetext, safebuffer); |
michael@0 | 1635 | return USEARCH_DONE; |
michael@0 | 1636 | } |
michael@0 | 1637 | cleanUpSafeText(strsrch, safetext, safebuffer); |
michael@0 | 1638 | safetext = safebuffer; |
michael@0 | 1639 | coleiter = strsrch->textIter; |
michael@0 | 1640 | setColEIterOffset(coleiter, safeoffset); |
michael@0 | 1641 | // status checked at the start of the loop |
michael@0 | 1642 | isSafe = FALSE; |
michael@0 | 1643 | continue; |
michael@0 | 1644 | } |
michael@0 | 1645 | textce = getCE(strsrch, textce); |
michael@0 | 1646 | if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) { |
michael@0 | 1647 | // do the beginning stuff |
michael@0 | 1648 | int32_t failedoffset = getColElemIterOffset(coleiter, FALSE); |
michael@0 | 1649 | if (isSafe && failedoffset >= safelength) { |
michael@0 | 1650 | // alas... no hope. failed at rearranged accent set |
michael@0 | 1651 | cleanUpSafeText(strsrch, safetext, safebuffer); |
michael@0 | 1652 | return USEARCH_DONE; |
michael@0 | 1653 | } |
michael@0 | 1654 | else { |
michael@0 | 1655 | if (isSafe) { |
michael@0 | 1656 | failedoffset += safeoffset; |
michael@0 | 1657 | cleanUpSafeText(strsrch, safetext, safebuffer); |
michael@0 | 1658 | } |
michael@0 | 1659 | |
michael@0 | 1660 | // try rearranging the front accents |
michael@0 | 1661 | int32_t result = doNextCanonicalPrefixMatch(strsrch, |
michael@0 | 1662 | failedoffset, textoffset, status); |
michael@0 | 1663 | if (result != USEARCH_DONE) { |
michael@0 | 1664 | // if status is a failure, ucol_setOffset does nothing |
michael@0 | 1665 | setColEIterOffset(strsrch->textIter, result); |
michael@0 | 1666 | } |
michael@0 | 1667 | if (U_FAILURE(*status)) { |
michael@0 | 1668 | return USEARCH_DONE; |
michael@0 | 1669 | } |
michael@0 | 1670 | return result; |
michael@0 | 1671 | } |
michael@0 | 1672 | } |
michael@0 | 1673 | if (textce == ce[ceindex]) { |
michael@0 | 1674 | ceindex --; |
michael@0 | 1675 | } |
michael@0 | 1676 | } |
michael@0 | 1677 | // set offset here |
michael@0 | 1678 | if (isSafe) { |
michael@0 | 1679 | int32_t result = getColElemIterOffset(coleiter, FALSE); |
michael@0 | 1680 | // sets the text iterator here with the correct expansion and offset |
michael@0 | 1681 | int32_t leftoverces = getExpansionPrefix(coleiter); |
michael@0 | 1682 | cleanUpSafeText(strsrch, safetext, safebuffer); |
michael@0 | 1683 | if (result >= safelength) { |
michael@0 | 1684 | result = textoffset; |
michael@0 | 1685 | } |
michael@0 | 1686 | else { |
michael@0 | 1687 | result += safeoffset; |
michael@0 | 1688 | } |
michael@0 | 1689 | setColEIterOffset(strsrch->textIter, result); |
michael@0 | 1690 | strsrch->textIter->iteratordata_.toReturn = |
michael@0 | 1691 | setExpansionPrefix(strsrch->textIter, leftoverces); |
michael@0 | 1692 | return result; |
michael@0 | 1693 | } |
michael@0 | 1694 | |
michael@0 | 1695 | return ucol_getOffset(coleiter); |
michael@0 | 1696 | } |
michael@0 | 1697 | |
michael@0 | 1698 | /** |
michael@0 | 1699 | * Trying out the substring and sees if it can be a canonical match. |
michael@0 | 1700 | * This will try normalizing the end accents and arranging them into canonical |
michael@0 | 1701 | * equivalents and check their corresponding ces with the pattern ce. |
michael@0 | 1702 | * Suffix accents in the text will be grouped according to their combining |
michael@0 | 1703 | * class and the groups will be mixed and matched to try to find the perfect |
michael@0 | 1704 | * match with the pattern. |
michael@0 | 1705 | * So for instance looking for "\u0301" in "\u030A\u0301\u0325" |
michael@0 | 1706 | * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings |
michael@0 | 1707 | * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325", |
michael@0 | 1708 | * "\u0301\u0325". |
michael@0 | 1709 | * step 2: check if any of the generated substrings matches the pattern. |
michael@0 | 1710 | * Internal method, status assumed to be success, caller has to check status |
michael@0 | 1711 | * before calling this method. |
michael@0 | 1712 | * @param strsrch string search data |
michael@0 | 1713 | * @param textoffset end offset in the collation element text that ends with |
michael@0 | 1714 | * the accents to be rearranged |
michael@0 | 1715 | * @param status error status if any |
michael@0 | 1716 | * @return TRUE if the match is valid, FALSE otherwise |
michael@0 | 1717 | */ |
michael@0 | 1718 | static |
michael@0 | 1719 | UBool doNextCanonicalMatch(UStringSearch *strsrch, |
michael@0 | 1720 | int32_t textoffset, |
michael@0 | 1721 | UErrorCode *status) |
michael@0 | 1722 | { |
michael@0 | 1723 | const UChar *text = strsrch->search->text; |
michael@0 | 1724 | int32_t temp = textoffset; |
michael@0 | 1725 | U16_BACK_1(text, 0, temp); |
michael@0 | 1726 | if ((getFCD(text, &temp, textoffset) & LAST_BYTE_MASK_) == 0) { |
michael@0 | 1727 | UCollationElements *coleiter = strsrch->textIter; |
michael@0 | 1728 | int32_t offset = getColElemIterOffset(coleiter, FALSE); |
michael@0 | 1729 | if (strsrch->pattern.hasPrefixAccents) { |
michael@0 | 1730 | offset = doNextCanonicalPrefixMatch(strsrch, offset, textoffset, |
michael@0 | 1731 | status); |
michael@0 | 1732 | if (U_SUCCESS(*status) && offset != USEARCH_DONE) { |
michael@0 | 1733 | setColEIterOffset(coleiter, offset); |
michael@0 | 1734 | return TRUE; |
michael@0 | 1735 | } |
michael@0 | 1736 | } |
michael@0 | 1737 | return FALSE; |
michael@0 | 1738 | } |
michael@0 | 1739 | |
michael@0 | 1740 | if (!strsrch->pattern.hasSuffixAccents) { |
michael@0 | 1741 | return FALSE; |
michael@0 | 1742 | } |
michael@0 | 1743 | |
michael@0 | 1744 | UChar accents[INITIAL_ARRAY_SIZE_]; |
michael@0 | 1745 | // offset to the last base character in substring to search |
michael@0 | 1746 | int32_t baseoffset = getPreviousBaseOffset(text, textoffset); |
michael@0 | 1747 | // normalizing the offensive string; accents must not be read on failure |
michael@0 | 1748 | unorm_normalize(text + baseoffset, textoffset - baseoffset, UNORM_NFD, |
michael@0 | 1749 | 0, accents, INITIAL_ARRAY_SIZE_, status); |
michael@0 | 1750 | if (U_FAILURE(*status)) { return FALSE; } |
michael@0 | 1751 | |
michael@0 | 1752 | int32_t accentsindex[INITIAL_ARRAY_SIZE_]; |
michael@0 | 1753 | int32_t size = getUnblockedAccentIndex(accents, accentsindex); |
michael@0 | 1754 | |
michael@0 | 1755 | // 2 power n - 1 subsets; NOTE(review): size == 0 would shift by -1 - confirm |
michael@0 | 1756 | int32_t count = (2 << (size - 1)) - 1; |
michael@0 | 1757 | while (U_SUCCESS(*status) && count > 0) { |
michael@0 | 1758 | UChar *rearrange = strsrch->canonicalSuffixAccents; |
michael@0 | 1759 | // copy the base characters |
michael@0 | 1760 | for (int k = 0; k < accentsindex[0]; k ++) { |
michael@0 | 1761 | *rearrange ++ = accents[k]; |
michael@0 | 1762 | } |
michael@0 | 1763 | // forming all possible canonical rearrangement by dropping |
michael@0 | 1764 | // sets of accents; each bit of count selects one accent set |
michael@0 | 1765 | for (int i = 0; i <= size - 1; i ++) { |
michael@0 | 1766 | int32_t mask = 1 << (size - i - 1); |
michael@0 | 1767 | if (count & mask) { |
michael@0 | 1768 | for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) { |
michael@0 | 1769 | *rearrange ++ = accents[j]; |
michael@0 | 1770 | } |
michael@0 | 1771 | } |
michael@0 | 1772 | } |
michael@0 | 1773 | *rearrange = 0; |
michael@0 | 1774 | int32_t offset = doNextCanonicalSuffixMatch(strsrch, baseoffset, |
michael@0 | 1775 | status); |
michael@0 | 1776 | if (offset != USEARCH_DONE) { |
michael@0 | 1777 | return TRUE; // match found |
michael@0 | 1778 | } |
michael@0 | 1779 | count --; |
michael@0 | 1780 | } |
michael@0 | 1781 | return FALSE; |
michael@0 | 1782 | } |
michael@0 | 1783 | |
michael@0 | 1784 | /** |
michael@0 | 1785 | * Gets the previous base character offset, but only when the pattern has |
michael@0 | 1786 | * prefix accents and the upper byte of the FCD value at textoffset is set |
michael@0 | 1787 | * @param strsrch string search data |
michael@0 | 1788 | * @param textoffset current offset, current character |
michael@0 | 1789 | * @return the result of getPreviousBaseOffset in that case, otherwise |
michael@0 | 1790 | * textoffset unchanged |
michael@0 | 1791 | */ |
michael@0 | 1792 | static |
michael@0 | 1793 | inline int32_t getPreviousUStringSearchBaseOffset(UStringSearch *strsrch, |
michael@0 | 1794 | int32_t textoffset) |
michael@0 | 1795 | { |
michael@0 | 1796 | if (strsrch->pattern.hasPrefixAccents && textoffset > 0) { |
michael@0 | 1797 | const UChar *text = strsrch->search->text; |
michael@0 | 1798 | int32_t offset = textoffset; |
michael@0 | 1799 | if (getFCD(text, &offset, strsrch->search->textLength) >> |
michael@0 | 1800 | SECOND_LAST_BYTE_SHIFT_) { |
michael@0 | 1801 | return getPreviousBaseOffset(text, textoffset); |
michael@0 | 1802 | } |
michael@0 | 1803 | } |
michael@0 | 1804 | return textoffset; |
michael@0 | 1805 | } |
michael@0 | 1806 | |
michael@0 | 1807 | /** |
michael@0 | 1808 | * Checks match for contraction. |
michael@0 | 1809 | * If the match ends with a partial contraction we fail. |
michael@0 | 1810 | * If the match starts too far off (because of backwards iteration) we try to |
michael@0 | 1811 | * chip off the extra characters |
michael@0 | 1812 | * Internal method, status assumed to be success, caller has to check status |
michael@0 | 1813 | * before calling this method. |
michael@0 | 1814 | * @param strsrch string search data |
michael@0 | 1815 | * @param start offset of potential match, to be modified if necessary |
michael@0 | 1816 | * @param end offset of potential match, moved forward when the ces mismatch |
michael@0 | 1817 | * @param status output error status if any |
michael@0 | 1818 | * @return TRUE if match passes the contraction test, FALSE otherwise |
michael@0 | 1819 | */ |
michael@0 | 1820 | static |
michael@0 | 1821 | UBool checkNextCanonicalContractionMatch(UStringSearch *strsrch, |
michael@0 | 1822 | int32_t *start, |
michael@0 | 1823 | int32_t *end, |
michael@0 | 1824 | UErrorCode *status) |
michael@0 | 1825 | { |
michael@0 | 1826 | UCollationElements *coleiter = strsrch->textIter; |
michael@0 | 1827 | int32_t textlength = strsrch->search->textLength; |
michael@0 | 1828 | int32_t temp = *start; |
michael@0 | 1829 | const UCollator *collator = strsrch->collator; |
michael@0 | 1830 | const UChar *text = strsrch->search->text; |
michael@0 | 1831 | // This part checks if either end of the match contains a potential |
michael@0 | 1832 | // contraction. If so we'll have to iterate through them |
michael@0 | 1833 | if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) || |
michael@0 | 1834 | (*start + 1 < textlength |
michael@0 | 1835 | && ucol_unsafeCP(text[*start + 1], collator))) { |
michael@0 | 1836 | int32_t expansion = getExpansionPrefix(coleiter); |
michael@0 | 1837 | UBool expandflag = expansion > 0; |
michael@0 | 1838 | setColEIterOffset(coleiter, *start); |
michael@0 | 1839 | while (expansion > 0) { |
michael@0 | 1840 | // getting rid of the redundant ce, caused by setOffset. |
michael@0 | 1841 | // since backward contraction/expansion may have extra ces if we |
michael@0 | 1842 | // are in the normalization buffer, hasAccentsBeforeMatch would |
michael@0 | 1843 | // have taken care of it. |
michael@0 | 1844 | // E.g. the character \u01FA will have an expansion of 3, but if |
michael@0 | 1845 | // we are only looking for acute and ring \u030A and \u0301, we'll |
michael@0 | 1846 | // have to skip the first ce in the expansion buffer. |
michael@0 | 1847 | ucol_next(coleiter, status); |
michael@0 | 1848 | if (U_FAILURE(*status)) { |
michael@0 | 1849 | return FALSE; |
michael@0 | 1850 | } |
michael@0 | 1851 | if (ucol_getOffset(coleiter) != temp) { |
michael@0 | 1852 | *start = temp; |
michael@0 | 1853 | temp = ucol_getOffset(coleiter); |
michael@0 | 1854 | } |
michael@0 | 1855 | expansion --; |
michael@0 | 1856 | } |
michael@0 | 1857 | |
michael@0 | 1858 | int32_t *patternce = strsrch->pattern.CE; |
michael@0 | 1859 | int32_t patterncelength = strsrch->pattern.CELength; |
michael@0 | 1860 | int32_t count = 0; |
michael@0 | 1861 | int32_t textlength = strsrch->search->textLength; |
michael@0 | 1862 | while (count < patterncelength) { |
michael@0 | 1863 | int32_t ce = getCE(strsrch, ucol_next(coleiter, status)); |
michael@0 | 1864 | // status checked below, note that if status is a failure |
michael@0 | 1865 | // ucol_next returns UCOL_NULLORDER |
michael@0 | 1866 | if (ce == UCOL_IGNORABLE) { |
michael@0 | 1867 | continue; |
michael@0 | 1868 | } |
michael@0 | 1869 | if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) { |
michael@0 | 1870 | *start = temp; |
michael@0 | 1871 | temp = ucol_getOffset(coleiter); |
michael@0 | 1872 | } |
michael@0 | 1873 | |
michael@0 | 1874 | if (count == 0 && ce != patternce[0]) { |
michael@0 | 1875 | // accents may have extra starting ces, this occurs when a pure accent |
michael@0 | 1876 | // pattern is matched without rearrangement, e.g. \u0300 in \u0325\u0300 |
michael@0 | 1877 | // NOTE(review): start itself goes to getFCD, may advance *start - confirm |
michael@0 | 1878 | int32_t expected = patternce[0]; |
michael@0 | 1879 | if (getFCD(text, start, textlength) & LAST_BYTE_MASK_) { |
michael@0 | 1880 | ce = getCE(strsrch, ucol_next(coleiter, status)); |
michael@0 | 1881 | while (U_SUCCESS(*status) && ce != expected && |
michael@0 | 1882 | ce != UCOL_NULLORDER && |
michael@0 | 1883 | ucol_getOffset(coleiter) <= *end) { |
michael@0 | 1884 | ce = getCE(strsrch, ucol_next(coleiter, status)); |
michael@0 | 1885 | } |
michael@0 | 1886 | } |
michael@0 | 1887 | } |
michael@0 | 1888 | if (U_FAILURE(*status) || ce != patternce[count]) { |
michael@0 | 1889 | (*end) ++; |
michael@0 | 1890 | *end = getNextUStringSearchBaseOffset(strsrch, *end); |
michael@0 | 1891 | return FALSE; |
michael@0 | 1892 | } |
michael@0 | 1893 | count ++; |
michael@0 | 1894 | } |
michael@0 | 1895 | } |
michael@0 | 1896 | return TRUE; |
michael@0 | 1897 | } |
michael@0 | 1898 | |
michael@0 | 1899 | /** |
michael@0 | 1900 | * Checks and sets the match information if found. |
michael@0 | 1901 | * Checks |
michael@0 | 1902 | * <ul> |
michael@0 | 1903 | * <li> the potential match does not repeat the previous match |
michael@0 | 1904 | * <li> boundaries are correct |
michael@0 | 1905 | * <li> potential match does not end in the middle of a contraction |
michael@0 | 1906 | * <li> identical matches |
michael@0 | 1907 | * <\ul> |
michael@0 | 1908 | * Otherwise the offset will be shifted to the next character. |
michael@0 | 1909 | * Internal method, status assumed to be success, caller has to check the |
michael@0 | 1910 | * status before calling this method. |
michael@0 | 1911 | * @param strsrch string search data |
michael@0 | 1912 | * @param textoffset offset in the collation element text. the returned value |
michael@0 | 1913 | * will be the truncated end offset of the match or the new start |
michael@0 | 1914 | * search offset. |
michael@0 | 1915 | * @param status output error status if any |
michael@0 | 1916 | * @return TRUE if the match is valid, FALSE otherwise |
michael@0 | 1917 | */ |
michael@0 | 1918 | static |
michael@0 | 1919 | inline UBool checkNextCanonicalMatch(UStringSearch *strsrch, |
michael@0 | 1920 | int32_t *textoffset, |
michael@0 | 1921 | UErrorCode *status) |
michael@0 | 1922 | { |
michael@0 | 1923 | // to ensure that the start and ends are not composite characters |
michael@0 | 1924 | UCollationElements *coleiter = strsrch->textIter; |
michael@0 | 1925 | // if we have a canonical accent match |
michael@0 | 1926 | if ((strsrch->pattern.hasSuffixAccents && |
michael@0 | 1927 | strsrch->canonicalSuffixAccents[0]) || |
michael@0 | 1928 | (strsrch->pattern.hasPrefixAccents && |
michael@0 | 1929 | strsrch->canonicalPrefixAccents[0])) { |
michael@0 | 1930 | strsrch->search->matchedIndex = getPreviousUStringSearchBaseOffset( |
michael@0 | 1931 | strsrch, |
michael@0 | 1932 | ucol_getOffset(coleiter)); |
michael@0 | 1933 | strsrch->search->matchedLength = *textoffset - |
michael@0 | 1934 | strsrch->search->matchedIndex; |
michael@0 | 1935 | return TRUE; |
michael@0 | 1936 | } |
michael@0 | 1937 | |
michael@0 | 1938 | int32_t start = getColElemIterOffset(coleiter, FALSE); |
michael@0 | 1939 | if (!checkNextCanonicalContractionMatch(strsrch, &start, textoffset, |
michael@0 | 1940 | status) || U_FAILURE(*status)) { |
michael@0 | 1941 | return FALSE; |
michael@0 | 1942 | } |
michael@0 | 1943 | |
michael@0 | 1944 | start = getPreviousUStringSearchBaseOffset(strsrch, start); |
michael@0 | 1945 | // this totally matches, however we need to check if it is repeating |
michael@0 | 1946 | if (checkRepeatedMatch(strsrch, start, *textoffset) || |
michael@0 | 1947 | !isBreakUnit(strsrch, start, *textoffset) || |
michael@0 | 1948 | !checkIdentical(strsrch, start, *textoffset)) { |
michael@0 | 1949 | (*textoffset) ++; |
michael@0 | 1950 | *textoffset = getNextBaseOffset(strsrch->search->text, *textoffset, |
michael@0 | 1951 | strsrch->search->textLength); |
michael@0 | 1952 | return FALSE; |
michael@0 | 1953 | } |
michael@0 | 1954 | |
michael@0 | 1955 | strsrch->search->matchedIndex = start; |
michael@0 | 1956 | strsrch->search->matchedLength = *textoffset - start; |
michael@0 | 1957 | return TRUE; |
michael@0 | 1958 | } |
michael@0 | 1959 | |
michael@0 | 1960 | /** |
michael@0 | 1961 | * Shifting the collation element iterator position forward to prepare for |
michael@0 | 1962 | * a preceding match. If the first character is a unsafe character, we'll only |
michael@0 | 1963 | * shift by 1 to capture contractions, normalization etc. |
michael@0 | 1964 | * Internal method, status assumed to be success, caller has to check status |
michael@0 | 1965 | * before calling this method. |
michael@0 | 1966 | * @param text strsrch string search data |
michael@0 | 1967 | * @param textoffset start text position to do search |
michael@0 | 1968 | * @param ce the text ce which failed the match. |
michael@0 | 1969 | * @param patternceindex index of the ce within the pattern ce buffer which |
michael@0 | 1970 | * failed the match |
michael@0 | 1971 | * @return final offset |
michael@0 | 1972 | */ |
michael@0 | 1973 | static |
michael@0 | 1974 | inline int32_t reverseShift(UStringSearch *strsrch, |
michael@0 | 1975 | int32_t textoffset, |
michael@0 | 1976 | int32_t ce, |
michael@0 | 1977 | int32_t patternceindex) |
michael@0 | 1978 | { |
michael@0 | 1979 | if (strsrch->search->isOverlap) { |
michael@0 | 1980 | if (textoffset != strsrch->search->textLength) { |
michael@0 | 1981 | textoffset --; |
michael@0 | 1982 | } |
michael@0 | 1983 | else { |
michael@0 | 1984 | textoffset -= strsrch->pattern.defaultShiftSize; |
michael@0 | 1985 | } |
michael@0 | 1986 | } |
michael@0 | 1987 | else { |
michael@0 | 1988 | if (ce != UCOL_NULLORDER) { |
michael@0 | 1989 | int32_t shift = strsrch->pattern.backShift[hash(ce)]; |
michael@0 | 1990 | |
michael@0 | 1991 | // this is to adjust for characters in the middle of the substring |
michael@0 | 1992 | // for matching that failed. |
michael@0 | 1993 | int32_t adjust = patternceindex; |
michael@0 | 1994 | if (adjust > 1 && shift > adjust) { |
michael@0 | 1995 | shift -= adjust - 1; |
michael@0 | 1996 | } |
michael@0 | 1997 | textoffset -= shift; |
michael@0 | 1998 | } |
michael@0 | 1999 | else { |
michael@0 | 2000 | textoffset -= strsrch->pattern.defaultShiftSize; |
michael@0 | 2001 | } |
michael@0 | 2002 | } |
michael@0 | 2003 | textoffset = getPreviousUStringSearchBaseOffset(strsrch, textoffset); |
michael@0 | 2004 | return textoffset; |
michael@0 | 2005 | } |
michael@0 | 2006 | |
michael@0 | 2007 | /** |
michael@0 | 2008 | * Checks match for contraction. |
michael@0 | 2009 | * If the match starts with a partial contraction we fail. |
michael@0 | 2010 | * Internal method, status assumed to be success, caller has to check status |
michael@0 | 2011 | * before calling this method. |
michael@0 | 2012 | * @param strsrch string search data |
michael@0 | 2013 | * @param start offset of potential match, to be modified if necessary |
michael@0 | 2014 | * @param end offset of potential match, to be modified if necessary |
michael@0 | 2015 | * @param status output error status if any |
michael@0 | 2016 | * @return TRUE if match passes the contraction test, FALSE otherwise |
michael@0 | 2017 | */ |
michael@0 | 2018 | static |
michael@0 | 2019 | UBool checkPreviousExactContractionMatch(UStringSearch *strsrch, |
michael@0 | 2020 | int32_t *start, |
michael@0 | 2021 | int32_t *end, UErrorCode *status) |
michael@0 | 2022 | { |
michael@0 | 2023 | UCollationElements *coleiter = strsrch->textIter; |
michael@0 | 2024 | int32_t textlength = strsrch->search->textLength; |
michael@0 | 2025 | int32_t temp = *end; |
michael@0 | 2026 | const UCollator *collator = strsrch->collator; |
michael@0 | 2027 | const UChar *text = strsrch->search->text; |
michael@0 | 2028 | // This part checks if either if the start of the match contains potential |
michael@0 | 2029 | // contraction. If so we'll have to iterate through them |
michael@0 | 2030 | // Since we used ucol_next while previously looking for the potential |
michael@0 | 2031 | // match, this guarantees that our end will not be a partial contraction, |
michael@0 | 2032 | // or a partial supplementary character. |
michael@0 | 2033 | if (*start < textlength && ucol_unsafeCP(text[*start], collator)) { |
michael@0 | 2034 | int32_t expansion = getExpansionSuffix(coleiter); |
michael@0 | 2035 | UBool expandflag = expansion > 0; |
michael@0 | 2036 | setColEIterOffset(coleiter, *end); |
michael@0 | 2037 | while (U_SUCCESS(*status) && expansion > 0) { |
michael@0 | 2038 | // getting rid of the redundant ce |
michael@0 | 2039 | // since forward contraction/expansion may have extra ces |
michael@0 | 2040 | // if we are in the normalization buffer, hasAccentsBeforeMatch |
michael@0 | 2041 | // would have taken care of it. |
michael@0 | 2042 | // E.g. the character \u01FA will have an expansion of 3, but if |
michael@0 | 2043 | // we are only looking for A ring A\u030A, we'll have to skip the |
michael@0 | 2044 | // last ce in the expansion buffer |
michael@0 | 2045 | ucol_previous(coleiter, status); |
michael@0 | 2046 | if (U_FAILURE(*status)) { |
michael@0 | 2047 | return FALSE; |
michael@0 | 2048 | } |
michael@0 | 2049 | if (ucol_getOffset(coleiter) != temp) { |
michael@0 | 2050 | *end = temp; |
michael@0 | 2051 | temp = ucol_getOffset(coleiter); |
michael@0 | 2052 | } |
michael@0 | 2053 | expansion --; |
michael@0 | 2054 | } |
michael@0 | 2055 | |
michael@0 | 2056 | int32_t *patternce = strsrch->pattern.CE; |
michael@0 | 2057 | int32_t patterncelength = strsrch->pattern.CELength; |
michael@0 | 2058 | int32_t count = patterncelength; |
michael@0 | 2059 | while (count > 0) { |
michael@0 | 2060 | int32_t ce = getCE(strsrch, ucol_previous(coleiter, status)); |
michael@0 | 2061 | // status checked below, note that if status is a failure |
michael@0 | 2062 | // ucol_previous returns UCOL_NULLORDER |
michael@0 | 2063 | if (ce == UCOL_IGNORABLE) { |
michael@0 | 2064 | continue; |
michael@0 | 2065 | } |
michael@0 | 2066 | if (expandflag && count == 0 && |
michael@0 | 2067 | getColElemIterOffset(coleiter, FALSE) != temp) { |
michael@0 | 2068 | *end = temp; |
michael@0 | 2069 | temp = ucol_getOffset(coleiter); |
michael@0 | 2070 | } |
michael@0 | 2071 | if (U_FAILURE(*status) || ce != patternce[count - 1]) { |
michael@0 | 2072 | (*start) --; |
michael@0 | 2073 | *start = getPreviousBaseOffset(text, *start); |
michael@0 | 2074 | return FALSE; |
michael@0 | 2075 | } |
michael@0 | 2076 | count --; |
michael@0 | 2077 | } |
michael@0 | 2078 | } |
michael@0 | 2079 | return TRUE; |
michael@0 | 2080 | } |
michael@0 | 2081 | |
michael@0 | 2082 | /** |
michael@0 | 2083 | * Checks and sets the match information if found. |
michael@0 | 2084 | * Checks |
michael@0 | 2085 | * <ul> |
michael@0 | 2086 | * <li> the current match does not repeat the last match |
michael@0 | 2087 | * <li> boundaries are correct |
michael@0 | 2088 | * <li> exact matches has no extra accents |
michael@0 | 2089 | * <li> identical matches |
michael@0 | 2090 | * <\ul> |
michael@0 | 2091 | * Otherwise the offset will be shifted to the preceding character. |
michael@0 | 2092 | * Internal method, status assumed to be success, caller has to check status |
michael@0 | 2093 | * before calling this method. |
michael@0 | 2094 | * @param strsrch string search data |
michael@0 | 2095 | * @param collator |
michael@0 | 2096 | * @param coleiter collation element iterator |
michael@0 | 2097 | * @param text string |
michael@0 | 2098 | * @param textoffset offset in the collation element text. the returned value |
michael@0 | 2099 | * will be the truncated start offset of the match or the new start |
michael@0 | 2100 | * search offset. |
michael@0 | 2101 | * @param status output error status if any |
michael@0 | 2102 | * @return TRUE if the match is valid, FALSE otherwise |
michael@0 | 2103 | */ |
michael@0 | 2104 | static |
michael@0 | 2105 | inline UBool checkPreviousExactMatch(UStringSearch *strsrch, |
michael@0 | 2106 | int32_t *textoffset, |
michael@0 | 2107 | UErrorCode *status) |
michael@0 | 2108 | { |
michael@0 | 2109 | // to ensure that the start and ends are not composite characters |
michael@0 | 2110 | int32_t end = ucol_getOffset(strsrch->textIter); |
michael@0 | 2111 | if (!checkPreviousExactContractionMatch(strsrch, textoffset, &end, status) |
michael@0 | 2112 | || U_FAILURE(*status)) { |
michael@0 | 2113 | return FALSE; |
michael@0 | 2114 | } |
michael@0 | 2115 | |
michael@0 | 2116 | // this totally matches, however we need to check if it is repeating |
michael@0 | 2117 | // the old match |
michael@0 | 2118 | if (checkRepeatedMatch(strsrch, *textoffset, end) || |
michael@0 | 2119 | !isBreakUnit(strsrch, *textoffset, end) || |
michael@0 | 2120 | hasAccentsBeforeMatch(strsrch, *textoffset, end) || |
michael@0 | 2121 | !checkIdentical(strsrch, *textoffset, end) || |
michael@0 | 2122 | hasAccentsAfterMatch(strsrch, *textoffset, end)) { |
michael@0 | 2123 | (*textoffset) --; |
michael@0 | 2124 | *textoffset = getPreviousBaseOffset(strsrch->search->text, |
michael@0 | 2125 | *textoffset); |
michael@0 | 2126 | return FALSE; |
michael@0 | 2127 | } |
michael@0 | 2128 | |
michael@0 | 2129 | //Add breakiterator boundary check for primary strength search. |
michael@0 | 2130 | if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) { |
michael@0 | 2131 | checkBreakBoundary(strsrch, textoffset, &end); |
michael@0 | 2132 | } |
michael@0 | 2133 | |
michael@0 | 2134 | strsrch->search->matchedIndex = *textoffset; |
michael@0 | 2135 | strsrch->search->matchedLength = end - *textoffset; |
michael@0 | 2136 | return TRUE; |
michael@0 | 2137 | } |
michael@0 | 2138 | |
michael@0 | 2139 | /** |
michael@0 | 2140 | * Rearranges the end accents to try matching. |
michael@0 | 2141 | * Suffix accents in the text will be grouped according to their combining |
michael@0 | 2142 | * class and the groups will be mixed and matched to try find the perfect |
michael@0 | 2143 | * match with the pattern. |
michael@0 | 2144 | * So for instance looking for "\u0301" in "\u030A\u0301\u0325" |
michael@0 | 2145 | * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings |
michael@0 | 2146 | * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325", |
michael@0 | 2147 | * "\u0301\u0325". |
michael@0 | 2148 | * step 2: check if any of the generated substrings matches the pattern. |
michael@0 | 2149 | * Internal method, status assumed to be success, user has to check status |
michael@0 | 2150 | * before calling this method. |
michael@0 | 2151 | * @param strsrch string search match |
michael@0 | 2152 | * @param start offset of the first base character |
michael@0 | 2153 | * @param end start of the last accent set |
michael@0 | 2154 | * @param status only error status if any |
michael@0 | 2155 | * @return USEARCH_DONE if a match is not found, otherwise return the ending |
michael@0 | 2156 | * offset of the match. Note this start includes all following accents. |
michael@0 | 2157 | */ |
michael@0 | 2158 | static |
michael@0 | 2159 | int32_t doPreviousCanonicalSuffixMatch(UStringSearch *strsrch, |
michael@0 | 2160 | int32_t start, |
michael@0 | 2161 | int32_t end, |
michael@0 | 2162 | UErrorCode *status) |
michael@0 | 2163 | { |
michael@0 | 2164 | const UChar *text = strsrch->search->text; |
michael@0 | 2165 | int32_t tempend = end; |
michael@0 | 2166 | |
michael@0 | 2167 | U16_BACK_1(text, 0, tempend); |
michael@0 | 2168 | if (!(getFCD(text, &tempend, strsrch->search->textLength) & |
michael@0 | 2169 | LAST_BYTE_MASK_)) { |
michael@0 | 2170 | // die... failed at a base character |
michael@0 | 2171 | return USEARCH_DONE; |
michael@0 | 2172 | } |
michael@0 | 2173 | end = getNextBaseOffset(text, end, strsrch->search->textLength); |
michael@0 | 2174 | |
michael@0 | 2175 | if (U_SUCCESS(*status)) { |
michael@0 | 2176 | UChar accents[INITIAL_ARRAY_SIZE_]; |
michael@0 | 2177 | int32_t offset = getPreviousBaseOffset(text, end); |
michael@0 | 2178 | // normalizing the offensive string |
michael@0 | 2179 | unorm_normalize(text + offset, end - offset, UNORM_NFD, 0, accents, |
michael@0 | 2180 | INITIAL_ARRAY_SIZE_, status); |
michael@0 | 2181 | |
michael@0 | 2182 | int32_t accentsindex[INITIAL_ARRAY_SIZE_]; |
michael@0 | 2183 | int32_t accentsize = getUnblockedAccentIndex(accents, |
michael@0 | 2184 | accentsindex); |
michael@0 | 2185 | int32_t count = (2 << (accentsize - 1)) - 1; |
michael@0 | 2186 | UChar buffer[INITIAL_ARRAY_SIZE_]; |
michael@0 | 2187 | UCollationElements *coleiter = strsrch->utilIter; |
michael@0 | 2188 | while (U_SUCCESS(*status) && count > 0) { |
michael@0 | 2189 | UChar *rearrange = strsrch->canonicalSuffixAccents; |
michael@0 | 2190 | // copy the base characters |
michael@0 | 2191 | for (int k = 0; k < accentsindex[0]; k ++) { |
michael@0 | 2192 | *rearrange ++ = accents[k]; |
michael@0 | 2193 | } |
michael@0 | 2194 | // forming all possible canonical rearrangement by dropping |
michael@0 | 2195 | // sets of accents |
michael@0 | 2196 | for (int i = 0; i <= accentsize - 1; i ++) { |
michael@0 | 2197 | int32_t mask = 1 << (accentsize - i - 1); |
michael@0 | 2198 | if (count & mask) { |
michael@0 | 2199 | for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) { |
michael@0 | 2200 | *rearrange ++ = accents[j]; |
michael@0 | 2201 | } |
michael@0 | 2202 | } |
michael@0 | 2203 | } |
michael@0 | 2204 | *rearrange = 0; |
michael@0 | 2205 | int32_t matchsize = INITIAL_ARRAY_SIZE_; |
michael@0 | 2206 | UChar *match = addToUCharArray(buffer, &matchsize, |
michael@0 | 2207 | strsrch->canonicalPrefixAccents, |
michael@0 | 2208 | strsrch->search->text + start, |
michael@0 | 2209 | offset - start, |
michael@0 | 2210 | strsrch->canonicalSuffixAccents, |
michael@0 | 2211 | status); |
michael@0 | 2212 | |
michael@0 | 2213 | // run the collator iterator through this match |
michael@0 | 2214 | // if status is a failure ucol_setText does nothing |
michael@0 | 2215 | ucol_setText(coleiter, match, matchsize, status); |
michael@0 | 2216 | if (U_SUCCESS(*status)) { |
michael@0 | 2217 | if (checkCollationMatch(strsrch, coleiter)) { |
michael@0 | 2218 | if (match != buffer) { |
michael@0 | 2219 | uprv_free(match); |
michael@0 | 2220 | } |
michael@0 | 2221 | return end; |
michael@0 | 2222 | } |
michael@0 | 2223 | } |
michael@0 | 2224 | count --; |
michael@0 | 2225 | } |
michael@0 | 2226 | } |
michael@0 | 2227 | return USEARCH_DONE; |
michael@0 | 2228 | } |
michael@0 | 2229 | |
michael@0 | 2230 | /** |
michael@0 | 2231 | * Take the rearranged start accents and tries matching. If match failed at |
michael@0 | 2232 | * a seperate following set of accents (seperated from the rearranged on by |
michael@0 | 2233 | * at least a base character) then we rearrange the preceding accents and |
michael@0 | 2234 | * tries matching again. |
michael@0 | 2235 | * We allow skipping of the ends of the accent set if the ces do not match. |
michael@0 | 2236 | * However if the failure is found before the accent set, it fails. |
michael@0 | 2237 | * Internal method, status assumed to be success, caller has to check status |
michael@0 | 2238 | * before calling this method. |
michael@0 | 2239 | * @param strsrch string search data |
michael@0 | 2240 | * @param textoffset of the ends of the rearranged accent |
michael@0 | 2241 | * @param status output error status if any |
michael@0 | 2242 | * @return USEARCH_DONE if a match is not found, otherwise return the ending |
michael@0 | 2243 | * offset of the match. Note this start includes all following accents. |
michael@0 | 2244 | */ |
michael@0 | 2245 | static |
michael@0 | 2246 | int32_t doPreviousCanonicalPrefixMatch(UStringSearch *strsrch, |
michael@0 | 2247 | int32_t textoffset, |
michael@0 | 2248 | UErrorCode *status) |
michael@0 | 2249 | { |
michael@0 | 2250 | const UChar *text = strsrch->search->text; |
michael@0 | 2251 | const UCollator *collator = strsrch->collator; |
michael@0 | 2252 | int32_t safelength = 0; |
michael@0 | 2253 | UChar *safetext; |
michael@0 | 2254 | int32_t safetextlength; |
michael@0 | 2255 | UChar safebuffer[INITIAL_ARRAY_SIZE_]; |
michael@0 | 2256 | int32_t safeoffset = textoffset; |
michael@0 | 2257 | |
michael@0 | 2258 | if (textoffset && |
michael@0 | 2259 | ucol_unsafeCP(strsrch->canonicalPrefixAccents[ |
michael@0 | 2260 | u_strlen(strsrch->canonicalPrefixAccents) - 1 |
michael@0 | 2261 | ], collator)) { |
michael@0 | 2262 | safeoffset = getNextSafeOffset(collator, text, textoffset, |
michael@0 | 2263 | strsrch->search->textLength); |
michael@0 | 2264 | safelength = safeoffset - textoffset; |
michael@0 | 2265 | safetextlength = INITIAL_ARRAY_SIZE_; |
michael@0 | 2266 | safetext = addToUCharArray(safebuffer, &safetextlength, |
michael@0 | 2267 | strsrch->canonicalPrefixAccents, |
michael@0 | 2268 | text + textoffset, safelength, |
michael@0 | 2269 | NULL, status); |
michael@0 | 2270 | } |
michael@0 | 2271 | else { |
michael@0 | 2272 | safetextlength = u_strlen(strsrch->canonicalPrefixAccents); |
michael@0 | 2273 | safetext = strsrch->canonicalPrefixAccents; |
michael@0 | 2274 | } |
michael@0 | 2275 | |
michael@0 | 2276 | UCollationElements *coleiter = strsrch->utilIter; |
michael@0 | 2277 | // if status is a failure, ucol_setText does nothing |
michael@0 | 2278 | ucol_setText(coleiter, safetext, safetextlength, status); |
michael@0 | 2279 | // status checked in loop below |
michael@0 | 2280 | |
michael@0 | 2281 | int32_t *ce = strsrch->pattern.CE; |
michael@0 | 2282 | int32_t celength = strsrch->pattern.CELength; |
michael@0 | 2283 | int ceindex = 0; |
michael@0 | 2284 | UBool isSafe = TRUE; // safe zone indication flag for position |
michael@0 | 2285 | int32_t prefixlength = u_strlen(strsrch->canonicalPrefixAccents); |
michael@0 | 2286 | |
michael@0 | 2287 | while (ceindex < celength) { |
michael@0 | 2288 | int32_t textce = ucol_next(coleiter, status); |
michael@0 | 2289 | if (U_FAILURE(*status)) { |
michael@0 | 2290 | if (isSafe) { |
michael@0 | 2291 | cleanUpSafeText(strsrch, safetext, safebuffer); |
michael@0 | 2292 | } |
michael@0 | 2293 | return USEARCH_DONE; |
michael@0 | 2294 | } |
michael@0 | 2295 | if (textce == UCOL_NULLORDER) { |
michael@0 | 2296 | // check if we have passed the safe buffer |
michael@0 | 2297 | if (coleiter == strsrch->textIter) { |
michael@0 | 2298 | cleanUpSafeText(strsrch, safetext, safebuffer); |
michael@0 | 2299 | return USEARCH_DONE; |
michael@0 | 2300 | } |
michael@0 | 2301 | cleanUpSafeText(strsrch, safetext, safebuffer); |
michael@0 | 2302 | safetext = safebuffer; |
michael@0 | 2303 | coleiter = strsrch->textIter; |
michael@0 | 2304 | setColEIterOffset(coleiter, safeoffset); |
michael@0 | 2305 | // status checked at the start of the loop |
michael@0 | 2306 | isSafe = FALSE; |
michael@0 | 2307 | continue; |
michael@0 | 2308 | } |
michael@0 | 2309 | textce = getCE(strsrch, textce); |
michael@0 | 2310 | if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) { |
michael@0 | 2311 | // do the beginning stuff |
michael@0 | 2312 | int32_t failedoffset = ucol_getOffset(coleiter); |
michael@0 | 2313 | if (isSafe && failedoffset <= prefixlength) { |
michael@0 | 2314 | // alas... no hope. failed at rearranged accent set |
michael@0 | 2315 | cleanUpSafeText(strsrch, safetext, safebuffer); |
michael@0 | 2316 | return USEARCH_DONE; |
michael@0 | 2317 | } |
michael@0 | 2318 | else { |
michael@0 | 2319 | if (isSafe) { |
michael@0 | 2320 | failedoffset = safeoffset - failedoffset; |
michael@0 | 2321 | cleanUpSafeText(strsrch, safetext, safebuffer); |
michael@0 | 2322 | } |
michael@0 | 2323 | |
michael@0 | 2324 | // try rearranging the end accents |
michael@0 | 2325 | int32_t result = doPreviousCanonicalSuffixMatch(strsrch, |
michael@0 | 2326 | textoffset, failedoffset, status); |
michael@0 | 2327 | if (result != USEARCH_DONE) { |
michael@0 | 2328 | // if status is a failure, ucol_setOffset does nothing |
michael@0 | 2329 | setColEIterOffset(strsrch->textIter, result); |
michael@0 | 2330 | } |
michael@0 | 2331 | if (U_FAILURE(*status)) { |
michael@0 | 2332 | return USEARCH_DONE; |
michael@0 | 2333 | } |
michael@0 | 2334 | return result; |
michael@0 | 2335 | } |
michael@0 | 2336 | } |
michael@0 | 2337 | if (textce == ce[ceindex]) { |
michael@0 | 2338 | ceindex ++; |
michael@0 | 2339 | } |
michael@0 | 2340 | } |
michael@0 | 2341 | // set offset here |
michael@0 | 2342 | if (isSafe) { |
michael@0 | 2343 | int32_t result = ucol_getOffset(coleiter); |
michael@0 | 2344 | // sets the text iterator here with the correct expansion and offset |
michael@0 | 2345 | int32_t leftoverces = getExpansionSuffix(coleiter); |
michael@0 | 2346 | cleanUpSafeText(strsrch, safetext, safebuffer); |
michael@0 | 2347 | if (result <= prefixlength) { |
michael@0 | 2348 | result = textoffset; |
michael@0 | 2349 | } |
michael@0 | 2350 | else { |
michael@0 | 2351 | result = textoffset + (safeoffset - result); |
michael@0 | 2352 | } |
michael@0 | 2353 | setColEIterOffset(strsrch->textIter, result); |
michael@0 | 2354 | setExpansionSuffix(strsrch->textIter, leftoverces); |
michael@0 | 2355 | return result; |
michael@0 | 2356 | } |
michael@0 | 2357 | |
michael@0 | 2358 | return ucol_getOffset(coleiter); |
michael@0 | 2359 | } |
michael@0 | 2360 | |
michael@0 | 2361 | /** |
michael@0 | 2362 | * Trying out the substring and sees if it can be a canonical match. |
michael@0 | 2363 | * This will try normalizing the starting accents and arranging them into |
michael@0 | 2364 | * canonical equivalents and check their corresponding ces with the pattern ce. |
michael@0 | 2365 | * Prefix accents in the text will be grouped according to their combining |
michael@0 | 2366 | * class and the groups will be mixed and matched to try find the perfect |
michael@0 | 2367 | * match with the pattern. |
michael@0 | 2368 | * So for instance looking for "\u0301" in "\u030A\u0301\u0325" |
michael@0 | 2369 | * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings |
michael@0 | 2370 | * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325", |
michael@0 | 2371 | * "\u0301\u0325". |
michael@0 | 2372 | * step 2: check if any of the generated substrings matches the pattern. |
michael@0 | 2373 | * Internal method, status assumed to be success, caller has to check status |
michael@0 | 2374 | * before calling this method. |
michael@0 | 2375 | * @param strsrch string search data |
michael@0 | 2376 | * @param textoffset start offset in the collation element text that starts |
michael@0 | 2377 | * with the accents to be rearranged |
michael@0 | 2378 | * @param status output error status if any |
michael@0 | 2379 | * @return TRUE if the match is valid, FALSE otherwise |
michael@0 | 2380 | */ |
michael@0 | 2381 | static |
michael@0 | 2382 | UBool doPreviousCanonicalMatch(UStringSearch *strsrch, |
michael@0 | 2383 | int32_t textoffset, |
michael@0 | 2384 | UErrorCode *status) |
michael@0 | 2385 | { |
michael@0 | 2386 | const UChar *text = strsrch->search->text; |
michael@0 | 2387 | int32_t temp = textoffset; |
michael@0 | 2388 | int32_t textlength = strsrch->search->textLength; |
michael@0 | 2389 | if ((getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) == 0) { |
michael@0 | 2390 | UCollationElements *coleiter = strsrch->textIter; |
michael@0 | 2391 | int32_t offset = ucol_getOffset(coleiter); |
michael@0 | 2392 | if (strsrch->pattern.hasSuffixAccents) { |
michael@0 | 2393 | offset = doPreviousCanonicalSuffixMatch(strsrch, textoffset, |
michael@0 | 2394 | offset, status); |
michael@0 | 2395 | if (U_SUCCESS(*status) && offset != USEARCH_DONE) { |
michael@0 | 2396 | setColEIterOffset(coleiter, offset); |
michael@0 | 2397 | return TRUE; |
michael@0 | 2398 | } |
michael@0 | 2399 | } |
michael@0 | 2400 | return FALSE; |
michael@0 | 2401 | } |
michael@0 | 2402 | |
michael@0 | 2403 | if (!strsrch->pattern.hasPrefixAccents) { |
michael@0 | 2404 | return FALSE; |
michael@0 | 2405 | } |
michael@0 | 2406 | |
michael@0 | 2407 | UChar accents[INITIAL_ARRAY_SIZE_]; |
michael@0 | 2408 | // offset to the last base character in substring to search |
michael@0 | 2409 | int32_t baseoffset = getNextBaseOffset(text, textoffset, textlength); |
michael@0 | 2410 | // normalizing the offensive string |
michael@0 | 2411 | unorm_normalize(text + textoffset, baseoffset - textoffset, UNORM_NFD, |
michael@0 | 2412 | 0, accents, INITIAL_ARRAY_SIZE_, status); |
michael@0 | 2413 | // status checked in loop |
michael@0 | 2414 | |
michael@0 | 2415 | int32_t accentsindex[INITIAL_ARRAY_SIZE_]; |
michael@0 | 2416 | int32_t size = getUnblockedAccentIndex(accents, accentsindex); |
michael@0 | 2417 | |
michael@0 | 2418 | // 2 power n - 1 plus the full set of accents |
michael@0 | 2419 | int32_t count = (2 << (size - 1)) - 1; |
michael@0 | 2420 | while (U_SUCCESS(*status) && count > 0) { |
michael@0 | 2421 | UChar *rearrange = strsrch->canonicalPrefixAccents; |
michael@0 | 2422 | // copy the base characters |
michael@0 | 2423 | for (int k = 0; k < accentsindex[0]; k ++) { |
michael@0 | 2424 | *rearrange ++ = accents[k]; |
michael@0 | 2425 | } |
michael@0 | 2426 | // forming all possible canonical rearrangement by dropping |
michael@0 | 2427 | // sets of accents |
michael@0 | 2428 | for (int i = 0; i <= size - 1; i ++) { |
michael@0 | 2429 | int32_t mask = 1 << (size - i - 1); |
michael@0 | 2430 | if (count & mask) { |
michael@0 | 2431 | for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) { |
michael@0 | 2432 | *rearrange ++ = accents[j]; |
michael@0 | 2433 | } |
michael@0 | 2434 | } |
michael@0 | 2435 | } |
michael@0 | 2436 | *rearrange = 0; |
michael@0 | 2437 | int32_t offset = doPreviousCanonicalPrefixMatch(strsrch, |
michael@0 | 2438 | baseoffset, status); |
michael@0 | 2439 | if (offset != USEARCH_DONE) { |
michael@0 | 2440 | return TRUE; // match found |
michael@0 | 2441 | } |
michael@0 | 2442 | count --; |
michael@0 | 2443 | } |
michael@0 | 2444 | return FALSE; |
michael@0 | 2445 | } |
michael@0 | 2446 | |
michael@0 | 2447 | /** |
michael@0 | 2448 | * Checks a potential backwards canonical match for contractions. |
michael@0 | 2449 | * If the match starts with a partial contraction we fail. |
michael@0 | 2450 | * Internal method, status assumed to be success, caller has to check status |
michael@0 | 2451 | * before calling this method. |
michael@0 | 2452 | * @param strsrch string search data |
michael@0 | 2453 | * @param start start offset of the potential match; stepped back via getPreviousBaseOffset when a CE comparison fails |
michael@0 | 2454 | * @param end end offset of the potential match; may be modified while redundant expansion CEs are skipped |
michael@0 | 2455 | * @param status only error status if any |
michael@0 | 2456 | * @return TRUE if match passes the contraction test, FALSE otherwise |
michael@0 | 2457 | */ |
michael@0 | 2458 | static |
michael@0 | 2459 | UBool checkPreviousCanonicalContractionMatch(UStringSearch *strsrch, |
michael@0 | 2460 | int32_t *start, |
michael@0 | 2461 | int32_t *end, UErrorCode *status) |
michael@0 | 2462 | { |
michael@0 | 2463 | UCollationElements *coleiter = strsrch->textIter; |
michael@0 | 2464 | int32_t textlength = strsrch->search->textLength; |
michael@0 | 2465 | int32_t temp = *end; |
michael@0 | 2466 | const UCollator *collator = strsrch->collator; |
michael@0 | 2467 | const UChar *text = strsrch->search->text; |
michael@0 | 2468 | // This part checks whether the start of the match contains a potential |
michael@0 | 2469 | // contraction. If so we'll have to iterate through its collation elements. |
michael@0 | 2470 | // Since we used ucol_next while previously looking for the potential |
michael@0 | 2471 | // match, this guarantees that our end will not be a partial contraction, |
michael@0 | 2472 | // or a partial supplementary character. |
michael@0 | 2473 | if (*start < textlength && ucol_unsafeCP(text[*start], collator)) { |
michael@0 | 2474 | int32_t expansion = getExpansionSuffix(coleiter); |
michael@0 | 2475 | UBool expandflag = expansion > 0; |
michael@0 | 2476 | setColEIterOffset(coleiter, *end); |
michael@0 | 2477 | while (expansion > 0) { |
michael@0 | 2478 | // getting rid of the redundant ce |
michael@0 | 2479 | // since forward contraction/expansion may have extra ces |
michael@0 | 2480 | // if we are in the normalization buffer, hasAccentsBeforeMatch |
michael@0 | 2481 | // would have taken care of it. |
michael@0 | 2482 | // E.g. the character \u01FA will have an expansion of 3, but if |
michael@0 | 2483 | // we are only looking for A ring A\u030A, we'll have to skip the |
michael@0 | 2484 | // last ce in the expansion buffer |
michael@0 | 2485 | ucol_previous(coleiter, status); |
michael@0 | 2486 | if (U_FAILURE(*status)) { |
michael@0 | 2487 | return FALSE; |
michael@0 | 2488 | } |
michael@0 | 2489 | if (ucol_getOffset(coleiter) != temp) { |
michael@0 | 2490 | *end = temp; |
michael@0 | 2491 | temp = ucol_getOffset(coleiter); |
michael@0 | 2492 | } |
michael@0 | 2493 | expansion --; |
michael@0 | 2494 | } |
michael@0 | 2495 | |
michael@0 | 2496 | int32_t *patternce = strsrch->pattern.CE; |
michael@0 | 2497 | int32_t patterncelength = strsrch->pattern.CELength; |
michael@0 | 2498 | int32_t count = patterncelength; |
michael@0 | 2499 | while (count > 0) { |
michael@0 | 2500 | int32_t ce = getCE(strsrch, ucol_previous(coleiter, status)); |
michael@0 | 2501 | // status checked below, note that if status is a failure |
michael@0 | 2502 | // ucol_previous returns UCOL_NULLORDER |
michael@0 | 2503 | if (ce == UCOL_IGNORABLE) { |
michael@0 | 2504 | continue; |
michael@0 | 2505 | } |
michael@0 | 2506 | if (expandflag && count == 0 && /* NOTE(review): count > 0 inside this loop, so this test looks unreachable */ |
michael@0 | 2507 | getColElemIterOffset(coleiter, FALSE) != temp) { |
michael@0 | 2508 | *end = temp; |
michael@0 | 2509 | temp = ucol_getOffset(coleiter); |
michael@0 | 2510 | } |
michael@0 | 2511 | if (count == patterncelength && |
michael@0 | 2512 | ce != patternce[patterncelength - 1]) { |
michael@0 | 2513 | // accents may have extra starting ces, this occurs when a |
michael@0 | 2514 | // pure accent pattern is matched without rearrangement |
michael@0 | 2515 | int32_t expected = patternce[patterncelength - 1]; |
michael@0 | 2516 | U16_BACK_1(text, 0, *end); |
michael@0 | 2517 | if (getFCD(text, end, textlength) & LAST_BYTE_MASK_) { |
michael@0 | 2518 | ce = getCE(strsrch, ucol_previous(coleiter, status)); |
michael@0 | 2519 | while (U_SUCCESS(*status) && ce != expected && |
michael@0 | 2520 | ce != UCOL_NULLORDER && |
michael@0 | 2521 | ucol_getOffset(coleiter) <= *start) { |
michael@0 | 2522 | ce = getCE(strsrch, ucol_previous(coleiter, status)); |
michael@0 | 2523 | } |
michael@0 | 2524 | } |
michael@0 | 2525 | } |
michael@0 | 2526 | if (U_FAILURE(*status) || ce != patternce[count - 1]) { |
michael@0 | 2527 | (*start) --; /* error or CE mismatch: move start back before rejecting the match */ |
michael@0 | 2528 | *start = getPreviousBaseOffset(text, *start); |
michael@0 | 2529 | return FALSE; |
michael@0 | 2530 | } |
michael@0 | 2531 | count --; |
michael@0 | 2532 | } |
michael@0 | 2533 | } |
michael@0 | 2534 | return TRUE; |
michael@0 | 2535 | } |
michael@0 | 2536 | |
michael@0 | 2537 | /** |
michael@0 | 2538 | * Validates a potential backwards canonical match and records it if valid. |
michael@0 | 2539 | * If the pattern has suffix or prefix accents and the corresponding canonical |
michael@0 | 2540 | * accent buffer starts with a non-zero unit, the match is recorded directly. |
michael@0 | 2541 | * Otherwise the match is recorded only if |
michael@0 | 2542 | * <ul> |
michael@0 | 2543 | * <li> it passes the contraction check |
michael@0 | 2544 | * <li> it does not repeat the previous match |
michael@0 | 2545 | * <li> it is a break unit |
michael@0 | 2546 | * <li> it passes the identical check |
michael@0 | 2547 | * </ul> |
michael@0 | 2548 | * Internal method, status assumed to be success, caller has to check status |
michael@0 | 2549 | * before calling this method. |
michael@0 | 2550 | * @param strsrch string search data |
michael@0 | 2551 | * @param textoffset start offset of the potential match; when one of the last |
michael@0 | 2552 | * three checks fails it is moved back via getPreviousBaseOffset. |
michael@0 | 2553 | * @param status only error status if any |
michael@0 | 2554 | * @return TRUE if the match was recorded, FALSE otherwise |
michael@0 | 2555 | */ |
michael@0 | 2556 | static |
michael@0 | 2557 | inline UBool checkPreviousCanonicalMatch(UStringSearch *strsrch, |
michael@0 | 2558 |                                          int32_t *textoffset, |
michael@0 | 2559 |                                          UErrorCode *status) |
michael@0 | 2560 | { |
michael@0 | 2561 |     UCollationElements *textiter = strsrch->textIter; |
michael@0 | 2562 |     USearch *search = strsrch->search; |
michael@0 | 2563 |     // if we have a canonical accent match, record it without further checks |
michael@0 | 2564 |     const UBool suffixAccents = strsrch->pattern.hasSuffixAccents && |
michael@0 | 2565 |                                 strsrch->canonicalSuffixAccents[0]; |
michael@0 | 2566 |     const UBool prefixAccents = !suffixAccents && |
michael@0 | 2567 |                                 strsrch->pattern.hasPrefixAccents && |
michael@0 | 2568 |                                 strsrch->canonicalPrefixAccents[0]; |
michael@0 | 2569 |     if (suffixAccents || prefixAccents) { |
michael@0 | 2570 |         const int32_t iterOffset = getColElemIterOffset(textiter, FALSE); |
michael@0 | 2571 |         search->matchedIndex = *textoffset; |
michael@0 | 2572 |         search->matchedLength = |
michael@0 | 2573 |             getNextUStringSearchBaseOffset(strsrch, iterOffset) - *textoffset; |
michael@0 | 2574 |         return TRUE; |
michael@0 | 2575 |     } |
michael@0 | 2576 | |
michael@0 | 2577 |     int32_t matchEnd = ucol_getOffset(textiter); |
michael@0 | 2578 |     const UBool contractionOk = checkPreviousCanonicalContractionMatch(strsrch, |
michael@0 | 2579 |                                     textoffset, &matchEnd, status); |
michael@0 | 2580 |     if (!contractionOk || U_FAILURE(*status)) { |
michael@0 | 2581 |         return FALSE; |
michael@0 | 2582 |     } |
michael@0 | 2583 | |
michael@0 | 2584 |     // the contraction check passed; now reject repeated, non-break-unit and non-identical matches |
michael@0 | 2585 |     matchEnd = getNextUStringSearchBaseOffset(strsrch, matchEnd); |
michael@0 | 2586 |     if (checkRepeatedMatch(strsrch, *textoffset, matchEnd) || |
michael@0 | 2587 |         !isBreakUnit(strsrch, *textoffset, matchEnd) || |
michael@0 | 2588 |         !checkIdentical(strsrch, *textoffset, matchEnd)) { |
michael@0 | 2589 |         (*textoffset) --; |
michael@0 | 2590 |         *textoffset = getPreviousBaseOffset(search->text, *textoffset); |
michael@0 | 2591 |         return FALSE; |
michael@0 | 2592 |     } |
michael@0 | 2593 | |
michael@0 | 2594 |     search->matchedIndex = *textoffset; |
michael@0 | 2595 |     search->matchedLength = matchEnd - *textoffset; |
michael@0 | 2596 |     return TRUE; |
michael@0 | 2597 | } |
michael@0 | 2598 | #endif // #if BOYER_MOORE |
michael@0 | 2599 | |
michael@0 | 2600 | // constructors and destructor ------------------------------------------- |
michael@0 | 2601 | |
michael@0 | 2602 | // Opens a string search object that uses a collator opened for `locale`. |
michael@0 | 2603 | // On success the collator is marked as owned by the result (ownCollator), on |
michael@0 | 2604 | // failure it is closed again here. A NULL locale is an illegal argument. |
michael@0 | 2605 | U_CAPI UStringSearch * U_EXPORT2 usearch_open(const UChar *pattern, |
michael@0 | 2606 |                                               int32_t patternlength, |
michael@0 | 2607 |                                               const UChar *text, |
michael@0 | 2608 |                                               int32_t textlength, |
michael@0 | 2609 |                                               const char *locale, |
michael@0 | 2610 |                                               UBreakIterator *breakiter, |
michael@0 | 2611 |                                               UErrorCode *status) |
michael@0 | 2612 | { |
michael@0 | 2613 |     if (U_FAILURE(*status)) { |
michael@0 | 2614 |         return NULL; |
michael@0 | 2615 |     } |
michael@0 | 2616 | #if UCONFIG_NO_BREAK_ITERATION |
michael@0 | 2617 |     // a caller-supplied break iterator cannot be honoured in this configuration |
michael@0 | 2618 |     if (breakiter != NULL) { |
michael@0 | 2619 |         *status = U_UNSUPPORTED_ERROR; |
michael@0 | 2620 |         return NULL; |
michael@0 | 2621 |     } |
michael@0 | 2622 | #endif |
michael@0 | 2623 |     if (locale == NULL) { |
michael@0 | 2624 |         *status = U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 2625 |         return NULL; |
michael@0 | 2626 |     } |
michael@0 | 2627 | |
michael@0 | 2628 |     // ucol_open internally checks for status; |
michael@0 | 2629 |     // pattern, text checks are done in usearch_openFromCollator |
michael@0 | 2630 |     UCollator *ownedCollator = ucol_open(locale, status); |
michael@0 | 2631 |     UStringSearch *strsrch = usearch_openFromCollator(pattern, patternlength, |
michael@0 | 2632 |                                  text, textlength, ownedCollator, breakiter, status); |
michael@0 | 2633 |     if (strsrch == NULL || U_FAILURE(*status)) { |
michael@0 | 2634 |         // no search object took over the collator, so release it here |
michael@0 | 2635 |         if (ownedCollator != NULL) { ucol_close(ownedCollator); } |
michael@0 | 2636 |         return NULL; |
michael@0 | 2637 |     } |
michael@0 | 2638 |     strsrch->ownCollator = TRUE; |
michael@0 | 2639 |     return strsrch; |
michael@0 | 2640 | } |
michael@0 | 2641 | |
michael@0 | 2642 | // Opens a string search object for `pattern` over `text` with a collator that |
michael@0 | 2643 | // stays owned by the caller (ownCollator is FALSE). Returns NULL and sets |
michael@0 | 2644 | // *status on invalid arguments, numeric collation, or any failure on the way. |
michael@0 | 2645 | U_CAPI UStringSearch * U_EXPORT2 usearch_openFromCollator( |
michael@0 | 2646 |                                   const UChar          *pattern, |
michael@0 | 2647 |                                         int32_t         patternlength, |
michael@0 | 2648 |                                   const UChar          *text, |
michael@0 | 2649 |                                         int32_t         textlength, |
michael@0 | 2650 |                                   const UCollator      *collator, |
michael@0 | 2651 |                                         UBreakIterator *breakiter, |
michael@0 | 2652 |                                         UErrorCode     *status) |
michael@0 | 2653 | { |
michael@0 | 2654 |     if (U_FAILURE(*status)) { |
michael@0 | 2655 |         return NULL; |
michael@0 | 2656 |     } |
michael@0 | 2657 | #if UCONFIG_NO_BREAK_ITERATION |
michael@0 | 2658 |     if (breakiter != NULL) { |
michael@0 | 2659 |         *status = U_UNSUPPORTED_ERROR; |
michael@0 | 2660 |         return NULL; |
michael@0 | 2661 |     } |
michael@0 | 2662 | #endif |
michael@0 | 2663 |     if (pattern == NULL || text == NULL || collator == NULL) { |
michael@0 | 2664 |         *status = U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 2665 |         return NULL; |
michael@0 | 2666 |     } |
michael@0 | 2667 | |
michael@0 | 2668 |     // string search does not really work when numeric collation is turned on |
michael@0 | 2669 |     if (ucol_getAttribute(collator, UCOL_NUMERIC_COLLATION, status) == UCOL_ON) { |
michael@0 | 2670 |         *status = U_UNSUPPORTED_ERROR; |
michael@0 | 2671 |         return NULL; |
michael@0 | 2672 |     } |
michael@0 | 2673 |     if (U_FAILURE(*status)) { |
michael@0 | 2674 |         return NULL; |
michael@0 | 2675 |     } |
michael@0 | 2676 | |
michael@0 | 2677 |     initializeFCD(status); |
michael@0 | 2678 |     if (U_FAILURE(*status)) { |
michael@0 | 2679 |         return NULL; |
michael@0 | 2680 |     } |
michael@0 | 2681 | |
michael@0 | 2682 |     // a length of -1 is resolved with u_strlen; empty strings are rejected |
michael@0 | 2683 |     if (textlength == -1) { |
michael@0 | 2684 |         textlength = u_strlen(text); |
michael@0 | 2685 |     } |
michael@0 | 2686 |     if (patternlength == -1) { |
michael@0 | 2687 |         patternlength = u_strlen(pattern); |
michael@0 | 2688 |     } |
michael@0 | 2689 |     if (textlength <= 0 || patternlength <= 0) { |
michael@0 | 2690 |         *status = U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 2691 |         return NULL; |
michael@0 | 2692 |     } |
michael@0 | 2693 | |
michael@0 | 2694 |     UStringSearch *strsrch = (UStringSearch *)uprv_malloc(sizeof(UStringSearch)); |
michael@0 | 2695 |     if (strsrch == NULL) { |
michael@0 | 2696 |         *status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 2697 |         return NULL; |
michael@0 | 2698 |     } |
michael@0 | 2699 | |
michael@0 | 2700 |     strsrch->collator = collator; |
michael@0 | 2701 |     strsrch->strength = ucol_getStrength(collator); |
michael@0 | 2702 |     strsrch->ceMask = getMask(strsrch->strength); |
michael@0 | 2703 |     strsrch->toShift = (ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, |
michael@0 | 2704 |                                           status) == UCOL_SHIFTED); |
michael@0 | 2705 |     strsrch->variableTop = ucol_getVariableTop(collator, status); |
michael@0 | 2706 |     strsrch->nfd = Normalizer2Factory::getNFDInstance(*status); |
michael@0 | 2707 |     if (U_FAILURE(*status)) { |
michael@0 | 2708 |         uprv_free(strsrch); |
michael@0 | 2709 |         return NULL; |
michael@0 | 2710 |     } |
michael@0 | 2711 | |
michael@0 | 2712 |     USearch *search = (USearch *)uprv_malloc(sizeof(USearch)); |
michael@0 | 2713 |     strsrch->search = search; |
michael@0 | 2714 |     if (search == NULL) { |
michael@0 | 2715 |         *status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 2716 |         uprv_free(strsrch); |
michael@0 | 2717 |         return NULL; |
michael@0 | 2718 |     } |
michael@0 | 2719 | |
michael@0 | 2720 |     search->text = text; |
michael@0 | 2721 |     search->textLength = textlength; |
michael@0 | 2722 |     strsrch->pattern.text = pattern; |
michael@0 | 2723 |     strsrch->pattern.textLength = patternlength; |
michael@0 | 2724 |     // NULL tables keep usearch_close from freeing garbage on the error paths below |
michael@0 | 2725 |     strsrch->pattern.CE = NULL; |
michael@0 | 2726 |     strsrch->pattern.PCE = NULL; |
michael@0 | 2727 | |
michael@0 | 2728 |     search->breakIter = breakiter; |
michael@0 | 2729 | #if !UCONFIG_NO_BREAK_ITERATION |
michael@0 | 2730 |     search->internalBreakIter = ubrk_open(UBRK_CHARACTER, |
michael@0 | 2731 |         ucol_getLocaleByType(strsrch->collator, ULOC_VALID_LOCALE, status), |
michael@0 | 2732 |         text, textlength, status); |
michael@0 | 2733 |     if (breakiter != NULL) { |
michael@0 | 2734 |         ubrk_setText(breakiter, text, textlength, status); |
michael@0 | 2735 |     } |
michael@0 | 2736 | #endif |
michael@0 | 2737 | |
michael@0 | 2738 |     strsrch->ownCollator = FALSE; |
michael@0 | 2739 |     search->matchedLength = 0; |
michael@0 | 2740 |     search->matchedIndex = USEARCH_DONE; |
michael@0 | 2741 |     strsrch->utilIter = NULL; |
michael@0 | 2742 |     strsrch->textIter = ucol_openElements(collator, text, textlength, status); |
michael@0 | 2743 |     if (U_FAILURE(*status)) { |
michael@0 | 2744 |         usearch_close(strsrch); |
michael@0 | 2745 |         return NULL; |
michael@0 | 2746 |     } |
michael@0 | 2747 | |
michael@0 | 2748 |     search->isOverlap = FALSE; |
michael@0 | 2749 |     search->isCanonicalMatch = FALSE; |
michael@0 | 2750 |     search->elementComparisonType = 0; |
michael@0 | 2751 |     search->isForwardSearching = TRUE; |
michael@0 | 2752 |     search->reset = TRUE; |
michael@0 | 2753 | |
michael@0 | 2754 |     initialize(strsrch, status); |
michael@0 | 2755 |     if (U_FAILURE(*status)) { |
michael@0 | 2756 |         usearch_close(strsrch); |
michael@0 | 2757 |         return NULL; |
michael@0 | 2758 |     } |
michael@0 | 2759 |     return strsrch; |
michael@0 | 2760 | } |
michael@0 | 2761 | |
michael@0 | 2762 | // Releases a search object: its pattern CE tables, both collation element |
michael@0 | 2763 | // iterators, the collator if it is owned, and the USearch record. NULL is ignored. |
michael@0 | 2764 | U_CAPI void U_EXPORT2 usearch_close(UStringSearch *strsrch) |
michael@0 | 2765 | { |
michael@0 | 2766 |     if (strsrch == NULL) { |
michael@0 | 2767 |         return; |
michael@0 | 2768 |     } |
michael@0 | 2769 |     // CE/PCE are freed only when set and not pointing at CEBuffer/PCEBuffer |
michael@0 | 2770 |     if (strsrch->pattern.CE != NULL && |
michael@0 | 2771 |         strsrch->pattern.CE != strsrch->pattern.CEBuffer) { |
michael@0 | 2772 |         uprv_free(strsrch->pattern.CE); |
michael@0 | 2773 |     } |
michael@0 | 2774 |     if (strsrch->pattern.PCE != NULL && |
michael@0 | 2775 |         strsrch->pattern.PCE != strsrch->pattern.PCEBuffer) { |
michael@0 | 2776 |         uprv_free(strsrch->pattern.PCE); |
michael@0 | 2777 |     } |
michael@0 | 2778 | |
michael@0 | 2779 |     ucol_closeElements(strsrch->textIter); |
michael@0 | 2780 |     ucol_closeElements(strsrch->utilIter); |
michael@0 | 2781 |     if (strsrch->ownCollator && strsrch->collator) { |
michael@0 | 2782 |         ucol_close((UCollator *)strsrch->collator); |
michael@0 | 2783 |     } |
michael@0 | 2784 | #if !UCONFIG_NO_BREAK_ITERATION |
michael@0 | 2785 |     if (strsrch->search->internalBreakIter) { |
michael@0 | 2786 |         ubrk_close(strsrch->search->internalBreakIter); |
michael@0 | 2787 |     } |
michael@0 | 2788 | #endif |
michael@0 | 2789 |     uprv_free(strsrch->search); |
michael@0 | 2790 |     uprv_free(strsrch); |
michael@0 | 2791 | } |
michael@0 | 2792 | |
michael@0 | 2793 | // set and get methods -------------------------------------------------- |
michael@0 | 2794 | |
michael@0 | 2795 | // Repositions the text iterator and clears the current match (also when position is out of bounds). |
michael@0 | 2796 | U_CAPI void U_EXPORT2 usearch_setOffset(UStringSearch *strsrch, |
michael@0 | 2797 |                                         int32_t position, UErrorCode *status) |
michael@0 | 2798 | { |
michael@0 | 2799 |     if (U_FAILURE(*status) || strsrch == NULL) { |
michael@0 | 2800 |         return; |
michael@0 | 2801 |     } |
michael@0 | 2802 |     if (!isOutOfBounds(strsrch->search->textLength, position)) { |
michael@0 | 2803 |         setColEIterOffset(strsrch->textIter, position); |
michael@0 | 2804 |     } else { |
michael@0 | 2805 |         *status = U_INDEX_OUTOFBOUNDS_ERROR; |
michael@0 | 2806 |     } |
michael@0 | 2807 |     strsrch->search->matchedIndex = USEARCH_DONE; |
michael@0 | 2808 |     strsrch->search->matchedLength = 0; |
michael@0 | 2809 |     strsrch->search->reset = FALSE; |
michael@0 | 2810 | } |
michael@0 | 2811 | |
michael@0 | 2812 | // Returns the current offset of the text iterator. USEARCH_DONE is returned |
michael@0 | 2813 | // for a NULL search object or when that offset is out of the text's bounds. |
michael@0 | 2814 | U_CAPI int32_t U_EXPORT2 usearch_getOffset(const UStringSearch *strsrch) |
michael@0 | 2815 | { |
michael@0 | 2816 |     if (strsrch == NULL) { |
michael@0 | 2817 |         return USEARCH_DONE; |
michael@0 | 2818 |     } |
michael@0 | 2819 |     const int32_t offset = ucol_getOffset(strsrch->textIter); |
michael@0 | 2820 |     const UBool outside = isOutOfBounds(strsrch->search->textLength, offset); |
michael@0 | 2821 |     return outside ? USEARCH_DONE : offset; |
michael@0 | 2822 | } |
michael@0 | 2823 | |
michael@0 | 2824 | // Sets a search attribute. A failure already stored in *status is left alone, and |
michael@0 | 2825 | // an invalid value or attribute is rejected with U_ILLEGAL_ARGUMENT_ERROR. |
michael@0 | 2826 | U_CAPI void U_EXPORT2 usearch_setAttribute(UStringSearch *strsrch, USearchAttribute attribute, |
michael@0 | 2827 |                                            USearchAttributeValue value, UErrorCode *status) |
michael@0 | 2828 | { |
michael@0 | 2829 |     if (U_FAILURE(*status)) { |
michael@0 | 2830 |         return; |
michael@0 | 2831 |     } |
michael@0 | 2832 |     if (value == USEARCH_ATTRIBUTE_VALUE_COUNT) { |
michael@0 | 2833 |         *status = U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 2834 |         return; |
michael@0 | 2835 |     } |
michael@0 | 2836 |     if (strsrch == NULL) { |
michael@0 | 2837 |         return; |
michael@0 | 2838 |     } |
michael@0 | 2839 |     switch (attribute) { |
michael@0 | 2840 |     case USEARCH_OVERLAP : |
michael@0 | 2841 |         strsrch->search->isOverlap = (value == USEARCH_ON ? TRUE : FALSE); |
michael@0 | 2842 |         break; |
michael@0 | 2843 |     case USEARCH_CANONICAL_MATCH : |
michael@0 | 2844 |         strsrch->search->isCanonicalMatch = (value == USEARCH_ON ? TRUE : FALSE); |
michael@0 | 2845 |         break; |
michael@0 | 2846 |     case USEARCH_ELEMENT_COMPARISON : |
michael@0 | 2847 |         strsrch->search->elementComparisonType = (int16_t)((value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || |
michael@0 | 2848 |             value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) ? value : 0); // any other value is stored as 0 |
michael@0 | 2849 |         break; |
michael@0 | 2850 |     case USEARCH_ATTRIBUTE_COUNT : |
michael@0 | 2851 |     default: |
michael@0 | 2852 |         *status = U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 2853 |     } |
michael@0 | 2854 | } |
michael@0 | 2855 | |
michael@0 | 2856 | // Returns the current value of a search attribute. USEARCH_DEFAULT is returned |
michael@0 | 2857 | // for a NULL search object and for attributes that are not handled below. |
michael@0 | 2858 | U_CAPI USearchAttributeValue U_EXPORT2 usearch_getAttribute( |
michael@0 | 2859 |                                                 const UStringSearch *strsrch, |
michael@0 | 2860 |                                                 USearchAttribute attribute) |
michael@0 | 2861 | { |
michael@0 | 2862 |     if (strsrch == NULL) { |
michael@0 | 2863 |         return USEARCH_DEFAULT; |
michael@0 | 2864 |     } |
michael@0 | 2865 |     const USearch *search = strsrch->search; |
michael@0 | 2866 |     switch (attribute) { |
michael@0 | 2867 |     case USEARCH_OVERLAP : |
michael@0 | 2868 |         return (search->isOverlap == TRUE) ? USEARCH_ON : USEARCH_OFF; |
michael@0 | 2869 |     case USEARCH_CANONICAL_MATCH : |
michael@0 | 2870 |         return (search->isCanonicalMatch == TRUE) ? USEARCH_ON : USEARCH_OFF; |
michael@0 | 2871 |     case USEARCH_ELEMENT_COMPARISON : |
michael@0 | 2872 |         // only the two wildcard modes are reported as stored |
michael@0 | 2873 |         if (search->elementComparisonType == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || |
michael@0 | 2874 |             search->elementComparisonType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) { |
michael@0 | 2875 |             return (USearchAttributeValue)search->elementComparisonType; |
michael@0 | 2876 |         } |
michael@0 | 2877 |         return USEARCH_STANDARD_ELEMENT_COMPARISON; |
michael@0 | 2878 |     case USEARCH_ATTRIBUTE_COUNT : |
michael@0 | 2879 |         break; |
michael@0 | 2880 |     } |
michael@0 | 2881 |     return USEARCH_DEFAULT; |
michael@0 | 2882 | } |
michael@0 | 2883 | |
michael@0 | 2884 | // Returns the stored matchedIndex of the search object. |
michael@0 | 2885 | // The setters in this file reset it to USEARCH_DONE. |
michael@0 | 2886 | // USEARCH_DONE is also returned for a NULL search object. |
michael@0 | 2887 | U_CAPI int32_t U_EXPORT2 usearch_getMatchedStart( |
michael@0 | 2888 |                                                const UStringSearch *strsrch) |
michael@0 | 2889 | { |
michael@0 | 2890 |     return (strsrch != NULL) ? strsrch->search->matchedIndex : USEARCH_DONE; |
michael@0 | 2891 | } |
michael@0 | 2892 | |
michael@0 | 2893 | |
michael@0 | 2894 | // Copies the current match into `result`, truncated to resultCapacity, and |
michael@0 | 2895 | // returns what u_terminateUChars reports for the full match length. |
michael@0 | 2896 | // USEARCH_DONE is returned when there is no match or an argument is invalid. |
michael@0 | 2897 | U_CAPI int32_t U_EXPORT2 usearch_getMatchedText(const UStringSearch *strsrch, |
michael@0 | 2898 |                                                       UChar         *result, |
michael@0 | 2899 |                                                       int32_t        resultCapacity, |
michael@0 | 2900 |                                                       UErrorCode    *status) |
michael@0 | 2901 | { |
michael@0 | 2902 |     if (U_FAILURE(*status)) { |
michael@0 | 2903 |         return USEARCH_DONE; |
michael@0 | 2904 |     } |
michael@0 | 2905 |     const UBool badBuffer = resultCapacity < 0 || |
michael@0 | 2906 |                             (result == NULL && resultCapacity > 0); |
michael@0 | 2907 |     if (strsrch == NULL || badBuffer) { |
michael@0 | 2908 |         *status = U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 2909 |         return USEARCH_DONE; |
michael@0 | 2910 |     } |
michael@0 | 2911 |     const USearch *search = strsrch->search; |
michael@0 | 2912 |     if (search->matchedIndex == USEARCH_DONE) { |
michael@0 | 2913 |         u_terminateUChars(result, resultCapacity, 0, status); |
michael@0 | 2914 |         return USEARCH_DONE; |
michael@0 | 2915 |     } |
michael@0 | 2916 | |
michael@0 | 2917 |     // never copy more than the caller's buffer can hold |
michael@0 | 2918 |     const int32_t matchLength = search->matchedLength; |
michael@0 | 2919 |     const int32_t toCopy = (resultCapacity < matchLength) ? resultCapacity : matchLength; |
michael@0 | 2920 |     if (toCopy > 0) { |
michael@0 | 2921 |         uprv_memcpy(result, search->text + search->matchedIndex, toCopy * sizeof(UChar)); |
michael@0 | 2922 |     } |
michael@0 | 2923 |     return u_terminateUChars(result, resultCapacity, matchLength, status); |
michael@0 | 2924 | } |
michael@0 | 2925 | |
michael@0 | 2926 | // Returns the stored matchedLength of the search object. |
michael@0 | 2927 | // The setters in this file reset it to 0. |
michael@0 | 2928 | // USEARCH_DONE is returned for a NULL search object. |
michael@0 | 2929 | U_CAPI int32_t U_EXPORT2 usearch_getMatchedLength( |
michael@0 | 2930 |                                                const UStringSearch *strsrch) |
michael@0 | 2931 | { |
michael@0 | 2932 |     return (strsrch == NULL) ? USEARCH_DONE : strsrch->search->matchedLength; |
michael@0 | 2933 | } |
michael@0 | 2934 | |
michael@0 | 2935 | #if !UCONFIG_NO_BREAK_ITERATION |
michael@0 | 2936 | |
michael@0 | 2937 | // Stores `breakiter` (which may be NULL) and, if it is set, points it at the search text. |
michael@0 | 2938 | U_CAPI void U_EXPORT2 usearch_setBreakIterator(UStringSearch *strsrch, |
michael@0 | 2939 |                                                UBreakIterator *breakiter, UErrorCode *status) |
michael@0 | 2940 | { |
michael@0 | 2941 |     if (U_FAILURE(*status) || strsrch == NULL) { |
michael@0 | 2942 |         return; |
michael@0 | 2943 |     } |
michael@0 | 2944 |     strsrch->search->breakIter = breakiter; |
michael@0 | 2945 |     if (breakiter != NULL) { |
michael@0 | 2946 |         ubrk_setText(breakiter, strsrch->search->text, strsrch->search->textLength, status); |
michael@0 | 2947 |     } |
michael@0 | 2948 | } |
michael@0 | 2949 | |
michael@0 | 2950 | // Returns the break iterator stored in the search object's breakIter field, |
michael@0 | 2951 | // which may be NULL. |
michael@0 | 2952 | // NULL is also returned for a NULL search object. |
michael@0 | 2953 | U_CAPI const UBreakIterator* U_EXPORT2 |
michael@0 | 2954 | usearch_getBreakIterator(const UStringSearch *strsrch) |
michael@0 | 2955 | { |
michael@0 | 2956 |     return (strsrch != NULL) ? strsrch->search->breakIter : NULL; |
michael@0 | 2957 | } |
michael@0 | 2958 | |
michael@0 | 2959 | #endif |
michael@0 | 2960 | |
michael@0 | 2961 | // Replaces the text to search in; a textlength of -1 is resolved with u_strlen. |
michael@0 | 2962 | // Clears the current match, sets the reset flag and re-points the break iterators. |
michael@0 | 2963 | U_CAPI void U_EXPORT2 usearch_setText(UStringSearch *strsrch, |
michael@0 | 2964 |                                       const UChar *text, |
michael@0 | 2965 |                                       int32_t textlength, UErrorCode *status) |
michael@0 | 2966 | { |
michael@0 | 2967 |     if (U_FAILURE(*status)) { |
michael@0 | 2968 |         return; |
michael@0 | 2969 |     } |
michael@0 | 2970 |     if (strsrch == NULL || text == NULL || textlength == 0 || textlength < -1) { |
michael@0 | 2971 |         *status = U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 2972 |         return; |
michael@0 | 2973 |     } |
michael@0 | 2974 |     if (textlength == -1) { |
michael@0 | 2975 |         textlength = u_strlen(text); |
michael@0 | 2976 |     } |
michael@0 | 2977 |     USearch *search = strsrch->search; |
michael@0 | 2978 |     search->text = text; |
michael@0 | 2979 |     search->textLength = textlength; |
michael@0 | 2980 |     ucol_setText(strsrch->textIter, text, textlength, status); |
michael@0 | 2981 |     search->matchedIndex = USEARCH_DONE; |
michael@0 | 2982 |     search->matchedLength = 0; |
michael@0 | 2983 |     search->reset = TRUE; |
michael@0 | 2984 | #if !UCONFIG_NO_BREAK_ITERATION |
michael@0 | 2985 |     if (search->breakIter != NULL) { |
michael@0 | 2986 |         ubrk_setText(search->breakIter, text, textlength, status); |
michael@0 | 2987 |     } |
michael@0 | 2988 |     ubrk_setText(search->internalBreakIter, text, textlength, status); |
michael@0 | 2989 | #endif |
michael@0 | 2990 | } |
michael@0 | 2991 | |
michael@0 | 2992 | // Returns the search text and stores its length in *length; NULL (and *length untouched) for a NULL object. |
michael@0 | 2993 | U_CAPI const UChar * U_EXPORT2 usearch_getText(const UStringSearch *strsrch, int32_t *length) |
michael@0 | 2994 | { |
michael@0 | 2995 |     if (strsrch == NULL) { |
michael@0 | 2996 |         return NULL; |
michael@0 | 2997 |     } |
michael@0 | 2998 |     *length = strsrch->search->textLength; |
michael@0 | 2999 |     return strsrch->search->text; |
michael@0 | 3000 | } |
michael@0 | 3001 | |
michael@0 | 3002 | // Switches the search object to `collator`. An owned collator is closed first |
michael@0 | 3003 | // (unless it is the same one); strength, CE mask, alternate handling and variable |
michael@0 | 3004 | // top are re-read, the internal break iterator is reopened, the pattern re-initialized. |
michael@0 | 3005 | U_CAPI void U_EXPORT2 usearch_setCollator(UStringSearch *strsrch, |
michael@0 | 3006 |                                           const UCollator *collator, UErrorCode *status) |
michael@0 | 3007 | { |
michael@0 | 3008 |     if (U_FAILURE(*status)) { |
michael@0 | 3009 |         return; |
michael@0 | 3010 |     } |
michael@0 | 3011 |     if (collator == NULL) { |
michael@0 | 3012 |         *status = U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 3013 |         return; |
michael@0 | 3014 |     } |
michael@0 | 3015 |     if (strsrch == NULL) { |
michael@0 | 3016 |         return; |
michael@0 | 3017 |     } |
michael@0 | 3018 |     if (strsrch->ownCollator && (strsrch->collator != collator)) { |
michael@0 | 3019 |         ucol_close((UCollator *)strsrch->collator); |
michael@0 | 3020 |         strsrch->ownCollator = FALSE; |
michael@0 | 3021 |     } |
michael@0 | 3022 |     strsrch->collator = collator; |
michael@0 | 3023 |     strsrch->strength = ucol_getStrength(collator); |
michael@0 | 3024 |     strsrch->ceMask = getMask(strsrch->strength); |
michael@0 | 3025 | #if !UCONFIG_NO_BREAK_ITERATION |
michael@0 | 3026 |     ubrk_close(strsrch->search->internalBreakIter); |
michael@0 | 3027 |     strsrch->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, |
michael@0 | 3028 |         ucol_getLocaleByType(collator, ULOC_VALID_LOCALE, status), |
michael@0 | 3029 |         strsrch->search->text, strsrch->search->textLength, status); |
michael@0 | 3030 | #endif |
michael@0 | 3031 |     // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT and ucol_getVariableTop returns 0 |
michael@0 | 3032 |     strsrch->toShift = (ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) == UCOL_SHIFTED); |
michael@0 | 3033 |     strsrch->variableTop = ucol_getVariableTop(collator, status); |
michael@0 | 3034 |     if (U_FAILURE(*status)) { |
michael@0 | 3035 |         return; |
michael@0 | 3036 |     } |
michael@0 | 3037 |     initialize(strsrch, status); |
michael@0 | 3038 |     if (U_FAILURE(*status)) { |
michael@0 | 3039 |         return; |
michael@0 | 3040 |     } |
michael@0 | 3041 |     /* free offset buffer to avoid memory leak before initializing. */ |
michael@0 | 3042 |     ucol_freeOffsetBuffer(&(strsrch->textIter->iteratordata_)); |
michael@0 | 3043 |     uprv_init_collIterate(collator, strsrch->search->text, strsrch->search->textLength, |
michael@0 | 3044 |                           &(strsrch->textIter->iteratordata_), status); |
michael@0 | 3045 |     strsrch->utilIter->iteratordata_.coll = collator; |
michael@0 | 3046 |     // **** are these calls needed? |
michael@0 | 3047 |     // **** we call uprv_init_pce in initializePatternPCETable |
michael@0 | 3048 |     // **** and the CEBuffer constructor... |
michael@0 | 3049 | #if 0 |
michael@0 | 3050 |     uprv_init_pce(strsrch->textIter); |
michael@0 | 3051 |     uprv_init_pce(strsrch->utilIter); |
michael@0 | 3052 | #endif |
michael@0 | 3053 | } |
michael@0 | 3054 | |
michael@0 | 3055 | // Returns the collator currently used by the search object, with its |
michael@0 | 3056 | // const qualifier cast away. |
michael@0 | 3057 | // NULL is returned for a NULL search object. |
michael@0 | 3058 | U_CAPI UCollator * U_EXPORT2 usearch_getCollator(const UStringSearch *strsrch) |
michael@0 | 3059 | { |
michael@0 | 3060 |     return (strsrch != NULL) ? (UCollator *)strsrch->collator : NULL; |
michael@0 | 3061 | } |
michael@0 | 3062 | |
michael@0 | 3063 | // Replaces the pattern and re-initializes the search data; -1 means NUL-terminated. A NULL argument, an |
michael@0 | 3064 | // empty pattern or any other negative length is an illegal argument, as in usearch_openFromCollator. |
michael@0 | 3065 | U_CAPI void U_EXPORT2 usearch_setPattern(UStringSearch *strsrch, const UChar *pattern, |
michael@0 | 3066 |                                          int32_t patternlength, UErrorCode *status) |
michael@0 | 3067 | { |
michael@0 | 3068 |     if (U_FAILURE(*status)) { |
michael@0 | 3069 |         return; |
michael@0 | 3070 |     } |
michael@0 | 3071 |     if (strsrch == NULL || pattern == NULL) { |
michael@0 | 3072 |         *status = U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 3073 |         return; |
michael@0 | 3074 |     } |
michael@0 | 3075 |     if (patternlength == -1) { |
michael@0 | 3076 |         patternlength = u_strlen(pattern); |
michael@0 | 3077 |     } |
michael@0 | 3078 |     if (patternlength <= 0) { |
michael@0 | 3079 |         *status = U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 3080 |         return; |
michael@0 | 3081 |     } |
michael@0 | 3082 |     strsrch->pattern.text = pattern; |
michael@0 | 3083 |     strsrch->pattern.textLength = patternlength; |
michael@0 | 3084 |     initialize(strsrch, status); |
michael@0 | 3085 | } |
michael@0 | 3086 | |
michael@0 | 3087 | // Returns the pattern and stores its length in *length; NULL (and *length untouched) for a NULL object. |
michael@0 | 3088 | U_CAPI const UChar* U_EXPORT2 |
michael@0 | 3089 | usearch_getPattern(const UStringSearch *strsrch, int32_t *length) |
michael@0 | 3090 | { |
michael@0 | 3091 |     if (strsrch == NULL) { |
michael@0 | 3092 |         return NULL; |
michael@0 | 3093 |     } |
michael@0 | 3094 |     *length = strsrch->pattern.textLength; |
michael@0 | 3095 |     return strsrch->pattern.text; |
michael@0 | 3096 | } |
michael@0 | 3097 | |
michael@0 | 3098 | // miscellaneous methods -------------------------------------------------- |
michael@0 | 3099 | |
michael@0 | 3100 | // Sets forward searching, moves to offset 0 and returns usearch_next(); USEARCH_DONE on failure. |
michael@0 | 3101 | U_CAPI int32_t U_EXPORT2 usearch_first(UStringSearch *strsrch, |
michael@0 | 3102 |                                        UErrorCode *status) |
michael@0 | 3103 | { |
michael@0 | 3104 |     if (strsrch == NULL || U_FAILURE(*status)) { |
michael@0 | 3105 |         return USEARCH_DONE; |
michael@0 | 3106 |     } |
michael@0 | 3107 |     strsrch->search->isForwardSearching = TRUE; |
michael@0 | 3108 |     usearch_setOffset(strsrch, 0, status); |
michael@0 | 3109 |     // only search if the offset could be set |
michael@0 | 3110 |     return U_SUCCESS(*status) ? usearch_next(strsrch, status) : USEARCH_DONE; |
michael@0 | 3111 | } |
michael@0 | 3112 | |
michael@0 | 3113 | // Sets forward searching, moves to `position` and returns usearch_next(). |
michael@0 | 3114 | // USEARCH_DONE is returned for a NULL object or when status is a failure. |
michael@0 | 3115 | U_CAPI int32_t U_EXPORT2 usearch_following(UStringSearch *strsrch, |
michael@0 | 3116 |                                            int32_t position, |
michael@0 | 3117 |                                            UErrorCode *status) |
michael@0 | 3118 | { |
michael@0 | 3119 |     if (strsrch == NULL || U_FAILURE(*status)) { |
michael@0 | 3120 |         return USEARCH_DONE; |
michael@0 | 3121 |     } |
michael@0 | 3122 |     strsrch->search->isForwardSearching = TRUE; |
michael@0 | 3123 |     // position checked in usearch_setOffset |
michael@0 | 3124 |     usearch_setOffset(strsrch, position, status); |
michael@0 | 3125 |     return U_SUCCESS(*status) ? usearch_next(strsrch, status) : USEARCH_DONE; |
michael@0 | 3126 | } |
michael@0 | 3127 | |
michael@0 | 3128 | // Sets backward searching, moves to the text length and returns usearch_previous(); USEARCH_DONE on failure. |
michael@0 | 3129 | U_CAPI int32_t U_EXPORT2 usearch_last(UStringSearch *strsrch, |
michael@0 | 3130 |                                       UErrorCode *status) |
michael@0 | 3131 | { |
michael@0 | 3132 |     if (strsrch == NULL || U_FAILURE(*status)) { |
michael@0 | 3133 |         return USEARCH_DONE; |
michael@0 | 3134 |     } |
michael@0 | 3135 |     strsrch->search->isForwardSearching = FALSE; |
michael@0 | 3136 |     usearch_setOffset(strsrch, strsrch->search->textLength, status); |
michael@0 | 3137 |     // only search if the offset could be set |
michael@0 | 3138 |     return U_SUCCESS(*status) ? usearch_previous(strsrch, status) : USEARCH_DONE; |
michael@0 | 3139 | } |
michael@0 | 3140 | |
michael@0 | 3141 | // Sets backward searching, moves to `position` and returns usearch_previous(). |
michael@0 | 3142 | // USEARCH_DONE is returned for a NULL object or when status is a failure. |
michael@0 | 3143 | U_CAPI int32_t U_EXPORT2 usearch_preceding(UStringSearch *strsrch, |
michael@0 | 3144 |                                            int32_t position, |
michael@0 | 3145 |                                            UErrorCode *status) |
michael@0 | 3146 | { |
michael@0 | 3147 |     if (strsrch == NULL || U_FAILURE(*status)) { |
michael@0 | 3148 |         return USEARCH_DONE; |
michael@0 | 3149 |     } |
michael@0 | 3150 |     strsrch->search->isForwardSearching = FALSE; |
michael@0 | 3151 |     // position checked in usearch_setOffset |
michael@0 | 3152 |     usearch_setOffset(strsrch, position, status); |
michael@0 | 3153 |     return U_SUCCESS(*status) ? usearch_previous(strsrch, status) : USEARCH_DONE; |
michael@0 | 3154 | } |
michael@0 | 3155 | |
michael@0 | 3156 | /** |
michael@0 | 3157 | * If a direction switch is required, we'll count the number of ces till the |
michael@0 | 3158 | * beginning of the collation element iterator and iterate forwards that |
michael@0 | 3159 | * number of times. This is so that we get to the correct point within the |
michael@0 | 3160 | * string to continue the search in. Imagine when we are in the middle of the |
michael@0 | 3161 | * normalization buffer when the change in direction is requested. arrrgghh.... |
michael@0 | 3162 | * After searching the offset within the collation element iterator will be |
michael@0 | 3163 | * shifted to the start of the match. If a match is not found, the offset would |
michael@0 | 3164 | * have been set to the end of the text string in the collation element |
michael@0 | 3165 | * iterator. |
michael@0 | 3166 | * Okay, here's my take on normalization buffer. The only time when there can |
michael@0 | 3167 | * be 2 matches within the same normalization is when the pattern is consists |
michael@0 | 3168 | * of all accents. But since the offset returned is from the text string, we |
michael@0 | 3169 | * should not confuse the caller by returning the second match within the |
michael@0 | 3170 | * same normalization buffer. If we do, the 2 results will have the same match |
michael@0 | 3171 | * offsets, and that'll be confusing. I'll return the next match that doesn't |
michael@0 | 3172 | * fall within the same normalization buffer. Note this does not affect the |
michael@0 | 3173 | * results of matches spanning the text and the normalization buffer. |
michael@0 | 3174 | * The position to start searching is taken from the collation element |
michael@0 | 3175 | * iterator. Callers of this API would have to set the offset in the collation |
michael@0 | 3176 | * element iterator before using this method. |
michael@0 | 3177 | */ |
michael@0 | 3178 | U_CAPI int32_t U_EXPORT2 usearch_next(UStringSearch *strsrch, |
michael@0 | 3179 | UErrorCode *status) |
michael@0 | 3180 | { |
michael@0 | 3181 | if (U_SUCCESS(*status) && strsrch) { |
michael@0 | 3182 | // note offset is either equivalent to the start of the previous match |
michael@0 | 3183 | // or is set by the user |
michael@0 | 3184 | int32_t offset = usearch_getOffset(strsrch); |
michael@0 | 3185 | USearch *search = strsrch->search; |
michael@0 | 3186 | search->reset = FALSE; |
michael@0 | 3187 | int32_t textlength = search->textLength; |
michael@0 | 3188 | if (search->isForwardSearching) { |
michael@0 | 3189 | #if BOYER_MOORE |
michael@0 | 3190 | if (offset == textlength |
michael@0 | 3191 | || (!search->isOverlap && |
michael@0 | 3192 | (offset + strsrch->pattern.defaultShiftSize > textlength || |
michael@0 | 3193 | (search->matchedIndex != USEARCH_DONE && |
michael@0 | 3194 | offset + search->matchedLength >= textlength)))) { |
michael@0 | 3195 | // not enough characters to match |
michael@0 | 3196 | setMatchNotFound(strsrch); |
michael@0 | 3197 | return USEARCH_DONE; |
michael@0 | 3198 | } |
michael@0 | 3199 | #else |
michael@0 | 3200 | if (offset == textlength || |
michael@0 | 3201 | (! search->isOverlap && |
michael@0 | 3202 | (search->matchedIndex != USEARCH_DONE && |
michael@0 | 3203 | offset + search->matchedLength > textlength))) { |
michael@0 | 3204 | // not enough characters to match |
michael@0 | 3205 | setMatchNotFound(strsrch); |
michael@0 | 3206 | return USEARCH_DONE; |
michael@0 | 3207 | } |
michael@0 | 3208 | #endif |
michael@0 | 3209 | } |
michael@0 | 3210 | else { |
michael@0 | 3211 | // switching direction. |
michael@0 | 3212 | // if matchedIndex == USEARCH_DONE, it means that either a |
michael@0 | 3213 | // setOffset has been called or that previous ran off the text |
michael@0 | 3214 | // string. the iterator would have been set to offset 0 if a |
michael@0 | 3215 | // match is not found. |
michael@0 | 3216 | search->isForwardSearching = TRUE; |
michael@0 | 3217 | if (search->matchedIndex != USEARCH_DONE) { |
michael@0 | 3218 | // there's no need to set the collation element iterator |
michael@0 | 3219 | // the next call to next will set the offset. |
michael@0 | 3220 | return search->matchedIndex; |
michael@0 | 3221 | } |
michael@0 | 3222 | } |
michael@0 | 3223 | |
michael@0 | 3224 | if (U_SUCCESS(*status)) { |
michael@0 | 3225 | if (strsrch->pattern.CELength == 0) { |
michael@0 | 3226 | if (search->matchedIndex == USEARCH_DONE) { |
michael@0 | 3227 | search->matchedIndex = offset; |
michael@0 | 3228 | } |
michael@0 | 3229 | else { // moves by codepoints |
michael@0 | 3230 | U16_FWD_1(search->text, search->matchedIndex, textlength); |
michael@0 | 3231 | } |
michael@0 | 3232 | |
michael@0 | 3233 | search->matchedLength = 0; |
michael@0 | 3234 | setColEIterOffset(strsrch->textIter, search->matchedIndex); |
michael@0 | 3235 | // status checked below |
michael@0 | 3236 | if (search->matchedIndex == textlength) { |
michael@0 | 3237 | search->matchedIndex = USEARCH_DONE; |
michael@0 | 3238 | } |
michael@0 | 3239 | } |
michael@0 | 3240 | else { |
michael@0 | 3241 | if (search->matchedLength > 0) { |
michael@0 | 3242 | // if matchlength is 0 we are at the start of the iteration |
michael@0 | 3243 | if (search->isOverlap) { |
michael@0 | 3244 | ucol_setOffset(strsrch->textIter, offset + 1, status); |
michael@0 | 3245 | } |
michael@0 | 3246 | else { |
michael@0 | 3247 | ucol_setOffset(strsrch->textIter, |
michael@0 | 3248 | offset + search->matchedLength, status); |
michael@0 | 3249 | } |
michael@0 | 3250 | } |
michael@0 | 3251 | else { |
michael@0 | 3252 | // for boundary check purposes. this will ensure that the |
michael@0 | 3253 | // next match will not preceed the current offset |
michael@0 | 3254 | // note search->matchedIndex will always be set to something |
michael@0 | 3255 | // in the code |
michael@0 | 3256 | search->matchedIndex = offset - 1; |
michael@0 | 3257 | } |
michael@0 | 3258 | |
michael@0 | 3259 | if (search->isCanonicalMatch) { |
michael@0 | 3260 | // can't use exact here since extra accents are allowed. |
michael@0 | 3261 | usearch_handleNextCanonical(strsrch, status); |
michael@0 | 3262 | } |
michael@0 | 3263 | else { |
michael@0 | 3264 | usearch_handleNextExact(strsrch, status); |
michael@0 | 3265 | } |
michael@0 | 3266 | } |
michael@0 | 3267 | |
michael@0 | 3268 | if (U_FAILURE(*status)) { |
michael@0 | 3269 | return USEARCH_DONE; |
michael@0 | 3270 | } |
michael@0 | 3271 | |
michael@0 | 3272 | #if !BOYER_MOORE |
michael@0 | 3273 | if (search->matchedIndex == USEARCH_DONE) { |
michael@0 | 3274 | ucol_setOffset(strsrch->textIter, search->textLength, status); |
michael@0 | 3275 | } else { |
michael@0 | 3276 | ucol_setOffset(strsrch->textIter, search->matchedIndex, status); |
michael@0 | 3277 | } |
michael@0 | 3278 | #endif |
michael@0 | 3279 | |
michael@0 | 3280 | return search->matchedIndex; |
michael@0 | 3281 | } |
michael@0 | 3282 | } |
michael@0 | 3283 | return USEARCH_DONE; |
michael@0 | 3284 | } |
michael@0 | 3285 | |
michael@0 | 3286 | U_CAPI int32_t U_EXPORT2 usearch_previous(UStringSearch *strsrch, |
michael@0 | 3287 | UErrorCode *status) |
michael@0 | 3288 | { |
michael@0 | 3289 | if (U_SUCCESS(*status) && strsrch) { |
michael@0 | 3290 | int32_t offset; |
michael@0 | 3291 | USearch *search = strsrch->search; |
michael@0 | 3292 | if (search->reset) { |
michael@0 | 3293 | offset = search->textLength; |
michael@0 | 3294 | search->isForwardSearching = FALSE; |
michael@0 | 3295 | search->reset = FALSE; |
michael@0 | 3296 | setColEIterOffset(strsrch->textIter, offset); |
michael@0 | 3297 | } |
michael@0 | 3298 | else { |
michael@0 | 3299 | offset = usearch_getOffset(strsrch); |
michael@0 | 3300 | } |
michael@0 | 3301 | |
michael@0 | 3302 | int32_t matchedindex = search->matchedIndex; |
michael@0 | 3303 | if (search->isForwardSearching == TRUE) { |
michael@0 | 3304 | // switching direction. |
michael@0 | 3305 | // if matchedIndex == USEARCH_DONE, it means that either a |
michael@0 | 3306 | // setOffset has been called or that next ran off the text |
michael@0 | 3307 | // string. the iterator would have been set to offset textLength if |
michael@0 | 3308 | // a match is not found. |
michael@0 | 3309 | search->isForwardSearching = FALSE; |
michael@0 | 3310 | if (matchedindex != USEARCH_DONE) { |
michael@0 | 3311 | return matchedindex; |
michael@0 | 3312 | } |
michael@0 | 3313 | } |
michael@0 | 3314 | else { |
michael@0 | 3315 | #if BOYER_MOORE |
michael@0 | 3316 | if (offset == 0 || matchedindex == 0 || |
michael@0 | 3317 | (!search->isOverlap && |
michael@0 | 3318 | (offset < strsrch->pattern.defaultShiftSize || |
michael@0 | 3319 | (matchedindex != USEARCH_DONE && |
michael@0 | 3320 | matchedindex < strsrch->pattern.defaultShiftSize)))) { |
michael@0 | 3321 | // not enough characters to match |
michael@0 | 3322 | setMatchNotFound(strsrch); |
michael@0 | 3323 | return USEARCH_DONE; |
michael@0 | 3324 | } |
michael@0 | 3325 | #else |
michael@0 | 3326 | // Could check pattern length, but the |
michael@0 | 3327 | // linear search will do the right thing |
michael@0 | 3328 | if (offset == 0 || matchedindex == 0) { |
michael@0 | 3329 | setMatchNotFound(strsrch); |
michael@0 | 3330 | return USEARCH_DONE; |
michael@0 | 3331 | } |
michael@0 | 3332 | #endif |
michael@0 | 3333 | } |
michael@0 | 3334 | |
michael@0 | 3335 | if (U_SUCCESS(*status)) { |
michael@0 | 3336 | if (strsrch->pattern.CELength == 0) { |
michael@0 | 3337 | search->matchedIndex = |
michael@0 | 3338 | (matchedindex == USEARCH_DONE ? offset : matchedindex); |
michael@0 | 3339 | if (search->matchedIndex == 0) { |
michael@0 | 3340 | setMatchNotFound(strsrch); |
michael@0 | 3341 | // status checked below |
michael@0 | 3342 | } |
michael@0 | 3343 | else { // move by codepoints |
michael@0 | 3344 | U16_BACK_1(search->text, 0, search->matchedIndex); |
michael@0 | 3345 | setColEIterOffset(strsrch->textIter, search->matchedIndex); |
michael@0 | 3346 | // status checked below |
michael@0 | 3347 | search->matchedLength = 0; |
michael@0 | 3348 | } |
michael@0 | 3349 | } |
michael@0 | 3350 | else { |
michael@0 | 3351 | if (strsrch->search->isCanonicalMatch) { |
michael@0 | 3352 | // can't use exact here since extra accents are allowed. |
michael@0 | 3353 | usearch_handlePreviousCanonical(strsrch, status); |
michael@0 | 3354 | // status checked below |
michael@0 | 3355 | } |
michael@0 | 3356 | else { |
michael@0 | 3357 | usearch_handlePreviousExact(strsrch, status); |
michael@0 | 3358 | // status checked below |
michael@0 | 3359 | } |
michael@0 | 3360 | } |
michael@0 | 3361 | |
michael@0 | 3362 | if (U_FAILURE(*status)) { |
michael@0 | 3363 | return USEARCH_DONE; |
michael@0 | 3364 | } |
michael@0 | 3365 | |
michael@0 | 3366 | return search->matchedIndex; |
michael@0 | 3367 | } |
michael@0 | 3368 | } |
michael@0 | 3369 | return USEARCH_DONE; |
michael@0 | 3370 | } |
michael@0 | 3371 | |
michael@0 | 3372 | |
michael@0 | 3373 | |
michael@0 | 3374 | U_CAPI void U_EXPORT2 usearch_reset(UStringSearch *strsrch) |
michael@0 | 3375 | { |
michael@0 | 3376 | /* |
michael@0 | 3377 | reset is setting the attributes that are already in |
michael@0 | 3378 | string search, hence all attributes in the collator should |
michael@0 | 3379 | be retrieved without any problems |
michael@0 | 3380 | */ |
michael@0 | 3381 | if (strsrch) { |
michael@0 | 3382 | UErrorCode status = U_ZERO_ERROR; |
michael@0 | 3383 | UBool sameCollAttribute = TRUE; |
michael@0 | 3384 | uint32_t ceMask; |
michael@0 | 3385 | UBool shift; |
michael@0 | 3386 | uint32_t varTop; |
michael@0 | 3387 | |
michael@0 | 3388 | // **** hack to deal w/ how processed CEs encode quaternary **** |
michael@0 | 3389 | UCollationStrength newStrength = ucol_getStrength(strsrch->collator); |
michael@0 | 3390 | if ((strsrch->strength < UCOL_QUATERNARY && newStrength >= UCOL_QUATERNARY) || |
michael@0 | 3391 | (strsrch->strength >= UCOL_QUATERNARY && newStrength < UCOL_QUATERNARY)) { |
michael@0 | 3392 | sameCollAttribute = FALSE; |
michael@0 | 3393 | } |
michael@0 | 3394 | |
michael@0 | 3395 | strsrch->strength = ucol_getStrength(strsrch->collator); |
michael@0 | 3396 | ceMask = getMask(strsrch->strength); |
michael@0 | 3397 | if (strsrch->ceMask != ceMask) { |
michael@0 | 3398 | strsrch->ceMask = ceMask; |
michael@0 | 3399 | sameCollAttribute = FALSE; |
michael@0 | 3400 | } |
michael@0 | 3401 | |
michael@0 | 3402 | // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT |
michael@0 | 3403 | shift = ucol_getAttribute(strsrch->collator, UCOL_ALTERNATE_HANDLING, |
michael@0 | 3404 | &status) == UCOL_SHIFTED; |
michael@0 | 3405 | if (strsrch->toShift != shift) { |
michael@0 | 3406 | strsrch->toShift = shift; |
michael@0 | 3407 | sameCollAttribute = FALSE; |
michael@0 | 3408 | } |
michael@0 | 3409 | |
michael@0 | 3410 | // if status is a failure, ucol_getVariableTop returns 0 |
michael@0 | 3411 | varTop = ucol_getVariableTop(strsrch->collator, &status); |
michael@0 | 3412 | if (strsrch->variableTop != varTop) { |
michael@0 | 3413 | strsrch->variableTop = varTop; |
michael@0 | 3414 | sameCollAttribute = FALSE; |
michael@0 | 3415 | } |
michael@0 | 3416 | if (!sameCollAttribute) { |
michael@0 | 3417 | initialize(strsrch, &status); |
michael@0 | 3418 | } |
michael@0 | 3419 | /* free offset buffer to avoid memory leak before initializing. */ |
michael@0 | 3420 | ucol_freeOffsetBuffer(&(strsrch->textIter->iteratordata_)); |
michael@0 | 3421 | uprv_init_collIterate(strsrch->collator, strsrch->search->text, |
michael@0 | 3422 | strsrch->search->textLength, |
michael@0 | 3423 | &(strsrch->textIter->iteratordata_), |
michael@0 | 3424 | &status); |
michael@0 | 3425 | strsrch->search->matchedLength = 0; |
michael@0 | 3426 | strsrch->search->matchedIndex = USEARCH_DONE; |
michael@0 | 3427 | strsrch->search->isOverlap = FALSE; |
michael@0 | 3428 | strsrch->search->isCanonicalMatch = FALSE; |
michael@0 | 3429 | strsrch->search->elementComparisonType = 0; |
michael@0 | 3430 | strsrch->search->isForwardSearching = TRUE; |
michael@0 | 3431 | strsrch->search->reset = TRUE; |
michael@0 | 3432 | } |
michael@0 | 3433 | } |
michael@0 | 3434 | |
//
//  CEI   Collation Element + source text index.
//        These structs are kept in the circular buffer (see CEBuffer).
//
struct CEI {
    int64_t ce;          // processed collation element value
    int32_t lowIndex;    // low source-text index reported with this CE
    int32_t highIndex;   // high source-text index reported with this CE
};
michael@0 | 3444 | |
michael@0 | 3445 | U_NAMESPACE_BEGIN |
michael@0 | 3446 | |
michael@0 | 3447 | |
//
//  CEBuffer   A circular buffer of CEs from the text being searched.
//

// Number of CEI slots held inline in a CEBuffer before heap allocation is used.
#define   DEFAULT_CEBUFFER_SIZE 96
// Slots added on top of the pattern's PCE count when sizing the buffer.
#define   CEBUFFER_EXTRA 32
// Some typical max values to make buffer size more reasonable for asymmetric search.
// #8694 is for a better long-term solution to allocation of this buffer.
#define   MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L 8
#define   MAX_TARGET_IGNORABLES_PER_PAT_OTHER 3
// TRUE when c lies in one of the code unit ranges 0x1100..0x115E,
// 0x3131..0x314E or 0x3165..0x3186. The argument is parenthesized so that an
// expression argument is not re-associated by operator precedence; note that
// it is still evaluated more than once, so it must not have side effects.
#define   MIGHT_BE_JAMO_L(c) (((c) >= 0x1100 && (c) <= 0x115E) || \
                              ((c) >= 0x3131 && (c) <= 0x314E) || \
                              ((c) >= 0x3165 && (c) <= 0x3186))
michael@0 | 3458 | struct CEBuffer { |
michael@0 | 3459 | CEI defBuf[DEFAULT_CEBUFFER_SIZE]; |
michael@0 | 3460 | CEI *buf; |
michael@0 | 3461 | int32_t bufSize; |
michael@0 | 3462 | int32_t firstIx; |
michael@0 | 3463 | int32_t limitIx; |
michael@0 | 3464 | UCollationElements *ceIter; |
michael@0 | 3465 | UStringSearch *strSearch; |
michael@0 | 3466 | |
michael@0 | 3467 | |
michael@0 | 3468 | |
michael@0 | 3469 | CEBuffer(UStringSearch *ss, UErrorCode *status); |
michael@0 | 3470 | ~CEBuffer(); |
michael@0 | 3471 | const CEI *get(int32_t index); |
michael@0 | 3472 | const CEI *getPrevious(int32_t index); |
michael@0 | 3473 | }; |
michael@0 | 3474 | |
michael@0 | 3475 | |
michael@0 | 3476 | CEBuffer::CEBuffer(UStringSearch *ss, UErrorCode *status) { |
michael@0 | 3477 | buf = defBuf; |
michael@0 | 3478 | strSearch = ss; |
michael@0 | 3479 | bufSize = ss->pattern.PCELength + CEBUFFER_EXTRA; |
michael@0 | 3480 | if (ss->search->elementComparisonType != 0) { |
michael@0 | 3481 | const UChar * patText = ss->pattern.text; |
michael@0 | 3482 | if (patText) { |
michael@0 | 3483 | const UChar * patTextLimit = patText + ss->pattern.textLength; |
michael@0 | 3484 | while ( patText < patTextLimit ) { |
michael@0 | 3485 | UChar c = *patText++; |
michael@0 | 3486 | if (MIGHT_BE_JAMO_L(c)) { |
michael@0 | 3487 | bufSize += MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L; |
michael@0 | 3488 | } else { |
michael@0 | 3489 | // No check for surrogates, we might allocate slightly more buffer than necessary. |
michael@0 | 3490 | bufSize += MAX_TARGET_IGNORABLES_PER_PAT_OTHER; |
michael@0 | 3491 | } |
michael@0 | 3492 | } |
michael@0 | 3493 | } |
michael@0 | 3494 | } |
michael@0 | 3495 | ceIter = ss->textIter; |
michael@0 | 3496 | firstIx = 0; |
michael@0 | 3497 | limitIx = 0; |
michael@0 | 3498 | |
michael@0 | 3499 | uprv_init_pce(ceIter); |
michael@0 | 3500 | |
michael@0 | 3501 | if (bufSize>DEFAULT_CEBUFFER_SIZE) { |
michael@0 | 3502 | buf = (CEI *)uprv_malloc(bufSize * sizeof(CEI)); |
michael@0 | 3503 | if (buf == NULL) { |
michael@0 | 3504 | *status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 3505 | } |
michael@0 | 3506 | } |
michael@0 | 3507 | } |
michael@0 | 3508 | |
michael@0 | 3509 | // TODO: add a reset or init function so that allocated |
michael@0 | 3510 | // buffers can be retained & reused. |
michael@0 | 3511 | |
michael@0 | 3512 | CEBuffer::~CEBuffer() { |
michael@0 | 3513 | if (buf != defBuf) { |
michael@0 | 3514 | uprv_free(buf); |
michael@0 | 3515 | } |
michael@0 | 3516 | } |
michael@0 | 3517 | |
michael@0 | 3518 | |
michael@0 | 3519 | // Get the CE with the specified index. |
michael@0 | 3520 | // Index must be in the range |
michael@0 | 3521 | // n-history_size < index < n+1 |
michael@0 | 3522 | // where n is the largest index to have been fetched by some previous call to this function. |
michael@0 | 3523 | // The CE value will be UCOL__PROCESSED_NULLORDER at end of input. |
michael@0 | 3524 | // |
michael@0 | 3525 | const CEI *CEBuffer::get(int32_t index) { |
michael@0 | 3526 | int i = index % bufSize; |
michael@0 | 3527 | |
michael@0 | 3528 | if (index>=firstIx && index<limitIx) { |
michael@0 | 3529 | // The request was for an entry already in our buffer. |
michael@0 | 3530 | // Just return it. |
michael@0 | 3531 | return &buf[i]; |
michael@0 | 3532 | } |
michael@0 | 3533 | |
michael@0 | 3534 | // Caller is requesting a new, never accessed before, CE. |
michael@0 | 3535 | // Verify that it is the next one in sequence, which is all |
michael@0 | 3536 | // that is allowed. |
michael@0 | 3537 | if (index != limitIx) { |
michael@0 | 3538 | U_ASSERT(FALSE); |
michael@0 | 3539 | |
michael@0 | 3540 | return NULL; |
michael@0 | 3541 | } |
michael@0 | 3542 | |
michael@0 | 3543 | // Manage the circular CE buffer indexing |
michael@0 | 3544 | limitIx++; |
michael@0 | 3545 | |
michael@0 | 3546 | if (limitIx - firstIx >= bufSize) { |
michael@0 | 3547 | // The buffer is full, knock out the lowest-indexed entry. |
michael@0 | 3548 | firstIx++; |
michael@0 | 3549 | } |
michael@0 | 3550 | |
michael@0 | 3551 | UErrorCode status = U_ZERO_ERROR; |
michael@0 | 3552 | |
michael@0 | 3553 | buf[i].ce = ucol_nextProcessed(ceIter, &buf[i].lowIndex, &buf[i].highIndex, &status); |
michael@0 | 3554 | |
michael@0 | 3555 | return &buf[i]; |
michael@0 | 3556 | } |
michael@0 | 3557 | |
michael@0 | 3558 | // Get the CE with the specified index. |
michael@0 | 3559 | // Index must be in the range |
michael@0 | 3560 | // n-history_size < index < n+1 |
michael@0 | 3561 | // where n is the largest index to have been fetched by some previous call to this function. |
michael@0 | 3562 | // The CE value will be UCOL__PROCESSED_NULLORDER at end of input. |
michael@0 | 3563 | // |
michael@0 | 3564 | const CEI *CEBuffer::getPrevious(int32_t index) { |
michael@0 | 3565 | int i = index % bufSize; |
michael@0 | 3566 | |
michael@0 | 3567 | if (index>=firstIx && index<limitIx) { |
michael@0 | 3568 | // The request was for an entry already in our buffer. |
michael@0 | 3569 | // Just return it. |
michael@0 | 3570 | return &buf[i]; |
michael@0 | 3571 | } |
michael@0 | 3572 | |
michael@0 | 3573 | // Caller is requesting a new, never accessed before, CE. |
michael@0 | 3574 | // Verify that it is the next one in sequence, which is all |
michael@0 | 3575 | // that is allowed. |
michael@0 | 3576 | if (index != limitIx) { |
michael@0 | 3577 | U_ASSERT(FALSE); |
michael@0 | 3578 | |
michael@0 | 3579 | return NULL; |
michael@0 | 3580 | } |
michael@0 | 3581 | |
michael@0 | 3582 | // Manage the circular CE buffer indexing |
michael@0 | 3583 | limitIx++; |
michael@0 | 3584 | |
michael@0 | 3585 | if (limitIx - firstIx >= bufSize) { |
michael@0 | 3586 | // The buffer is full, knock out the lowest-indexed entry. |
michael@0 | 3587 | firstIx++; |
michael@0 | 3588 | } |
michael@0 | 3589 | |
michael@0 | 3590 | UErrorCode status = U_ZERO_ERROR; |
michael@0 | 3591 | |
michael@0 | 3592 | buf[i].ce = ucol_previousProcessed(ceIter, &buf[i].lowIndex, &buf[i].highIndex, &status); |
michael@0 | 3593 | |
michael@0 | 3594 | return &buf[i]; |
michael@0 | 3595 | } |
michael@0 | 3596 | |
michael@0 | 3597 | U_NAMESPACE_END |
michael@0 | 3598 | |
michael@0 | 3599 | |
michael@0 | 3600 | // #define USEARCH_DEBUG |
michael@0 | 3601 | |
michael@0 | 3602 | #ifdef USEARCH_DEBUG |
michael@0 | 3603 | #include <stdio.h> |
michael@0 | 3604 | #include <stdlib.h> |
michael@0 | 3605 | #endif |
michael@0 | 3606 | |
michael@0 | 3607 | /* |
michael@0 | 3608 | * Find the next break boundary after startIndex. If the UStringSearch object |
michael@0 | 3609 | * has an external break iterator, use that. Otherwise use the internal character |
michael@0 | 3610 | * break iterator. |
michael@0 | 3611 | */ |
michael@0 | 3612 | static int32_t nextBoundaryAfter(UStringSearch *strsrch, int32_t startIndex) { |
michael@0 | 3613 | #if 0 |
michael@0 | 3614 | const UChar *text = strsrch->search->text; |
michael@0 | 3615 | int32_t textLen = strsrch->search->textLength; |
michael@0 | 3616 | |
michael@0 | 3617 | U_ASSERT(startIndex>=0); |
michael@0 | 3618 | U_ASSERT(startIndex<=textLen); |
michael@0 | 3619 | |
michael@0 | 3620 | if (startIndex >= textLen) { |
michael@0 | 3621 | return startIndex; |
michael@0 | 3622 | } |
michael@0 | 3623 | |
michael@0 | 3624 | UChar32 c; |
michael@0 | 3625 | int32_t i = startIndex; |
michael@0 | 3626 | U16_NEXT(text, i, textLen, c); |
michael@0 | 3627 | |
michael@0 | 3628 | // If we are on a control character, stop without looking for combining marks. |
michael@0 | 3629 | // Control characters do not combine. |
michael@0 | 3630 | int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK); |
michael@0 | 3631 | if (gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR) { |
michael@0 | 3632 | return i; |
michael@0 | 3633 | } |
michael@0 | 3634 | |
michael@0 | 3635 | // The initial character was not a control, and can thus accept trailing |
michael@0 | 3636 | // combining characters. Advance over however many of them there are. |
michael@0 | 3637 | int32_t indexOfLastCharChecked; |
michael@0 | 3638 | for (;;) { |
michael@0 | 3639 | indexOfLastCharChecked = i; |
michael@0 | 3640 | if (i>=textLen) { |
michael@0 | 3641 | break; |
michael@0 | 3642 | } |
michael@0 | 3643 | U16_NEXT(text, i, textLen, c); |
michael@0 | 3644 | gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK); |
michael@0 | 3645 | if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) { |
michael@0 | 3646 | break; |
michael@0 | 3647 | } |
michael@0 | 3648 | } |
michael@0 | 3649 | return indexOfLastCharChecked; |
michael@0 | 3650 | #elif !UCONFIG_NO_BREAK_ITERATION |
michael@0 | 3651 | UBreakIterator *breakiterator = strsrch->search->breakIter; |
michael@0 | 3652 | |
michael@0 | 3653 | if (breakiterator == NULL) { |
michael@0 | 3654 | breakiterator = strsrch->search->internalBreakIter; |
michael@0 | 3655 | } |
michael@0 | 3656 | |
michael@0 | 3657 | if (breakiterator != NULL) { |
michael@0 | 3658 | return ubrk_following(breakiterator, startIndex); |
michael@0 | 3659 | } |
michael@0 | 3660 | |
michael@0 | 3661 | return startIndex; |
michael@0 | 3662 | #else |
michael@0 | 3663 | // **** or should we use the original code? **** |
michael@0 | 3664 | return startIndex; |
michael@0 | 3665 | #endif |
michael@0 | 3666 | |
michael@0 | 3667 | } |
michael@0 | 3668 | |
michael@0 | 3669 | /* |
michael@0 | 3670 | * Returns TRUE if index is on a break boundary. If the UStringSearch |
michael@0 | 3671 | * has an external break iterator, test using that, otherwise test |
michael@0 | 3672 | * using the internal character break iterator. |
michael@0 | 3673 | */ |
michael@0 | 3674 | static UBool isBreakBoundary(UStringSearch *strsrch, int32_t index) { |
michael@0 | 3675 | #if 0 |
michael@0 | 3676 | const UChar *text = strsrch->search->text; |
michael@0 | 3677 | int32_t textLen = strsrch->search->textLength; |
michael@0 | 3678 | |
michael@0 | 3679 | U_ASSERT(index>=0); |
michael@0 | 3680 | U_ASSERT(index<=textLen); |
michael@0 | 3681 | |
michael@0 | 3682 | if (index>=textLen || index<=0) { |
michael@0 | 3683 | return TRUE; |
michael@0 | 3684 | } |
michael@0 | 3685 | |
michael@0 | 3686 | // If the character at the current index is not a GRAPHEME_EXTEND |
michael@0 | 3687 | // then we can not be within a combining sequence. |
michael@0 | 3688 | UChar32 c; |
michael@0 | 3689 | U16_GET(text, 0, index, textLen, c); |
michael@0 | 3690 | int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK); |
michael@0 | 3691 | if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) { |
michael@0 | 3692 | return TRUE; |
michael@0 | 3693 | } |
michael@0 | 3694 | |
michael@0 | 3695 | // We are at a combining mark. If the preceding character is anything |
michael@0 | 3696 | // except a CONTROL, CR or LF, we are in a combining sequence. |
michael@0 | 3697 | U16_PREV(text, 0, index, c); |
michael@0 | 3698 | gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK); |
michael@0 | 3699 | UBool combining = !(gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR); |
michael@0 | 3700 | return !combining; |
michael@0 | 3701 | #elif !UCONFIG_NO_BREAK_ITERATION |
michael@0 | 3702 | UBreakIterator *breakiterator = strsrch->search->breakIter; |
michael@0 | 3703 | |
michael@0 | 3704 | if (breakiterator == NULL) { |
michael@0 | 3705 | breakiterator = strsrch->search->internalBreakIter; |
michael@0 | 3706 | } |
michael@0 | 3707 | |
michael@0 | 3708 | return (breakiterator != NULL && ubrk_isBoundary(breakiterator, index)); |
michael@0 | 3709 | #else |
michael@0 | 3710 | // **** or use the original code? **** |
michael@0 | 3711 | return TRUE; |
michael@0 | 3712 | #endif |
michael@0 | 3713 | } |
michael@0 | 3714 | |
#if 0
// Disabled helper, kept for reference only (never compiled).
// Returns TRUE when both start and end are boundaries of the external break
// iterator, or when there is no external break iterator to consult.
static UBool onBreakBoundaries(const UStringSearch *strsrch, int32_t start, int32_t end)
{
#if !UCONFIG_NO_BREAK_ITERATION
    UBreakIterator *breakiterator = strsrch->search->breakIter;

    if (breakiterator != NULL) {
        const int32_t firstBoundary = ubrk_first(breakiterator);
        const int32_t lastBoundary  = ubrk_last(breakiterator);

        // out-of-range indexes are never boundary positions
        const UBool startInRange = !(start < firstBoundary || start > lastBoundary);
        const UBool endInRange   = !(end < firstBoundary || end > lastBoundary);
        if (!startInRange || !endInRange) {
            return FALSE;
        }

        return ubrk_isBoundary(breakiterator, start) &&
               ubrk_isBoundary(breakiterator, end);
    }
#endif

    return TRUE;
}
#endif
michael@0 | 3739 | |
// Outcome of comparing one target CE with one pattern CE (see compareCE64s).
typedef enum {
    U_CE_MATCH     = -1,  // the CEs are considered equal
    U_CE_NO_MATCH  =  0,  // the CEs differ
    U_CE_SKIP_TARG =  1,  // target weight is 0 where the pattern's is not
    U_CE_SKIP_PATN =  2   // pattern weight is 0 where the target's is not (wildcard mode only)
} UCompareCEsResult;

// Weights that compareCE64s treats as the "base" value at the second and
// third comparison levels. Each is written in the bit position of the 32-bit
// half it is compared against.
#define U_CE_LEVEL2_BASE 0x00000005
#define U_CE_LEVEL3_BASE 0x00050000
michael@0 | 3748 | |
michael@0 | 3749 | static UCompareCEsResult compareCE64s(int64_t targCE, int64_t patCE, int16_t compareType) { |
michael@0 | 3750 | if (targCE == patCE) { |
michael@0 | 3751 | return U_CE_MATCH; |
michael@0 | 3752 | } |
michael@0 | 3753 | if (compareType == 0) { |
michael@0 | 3754 | return U_CE_NO_MATCH; |
michael@0 | 3755 | } |
michael@0 | 3756 | |
michael@0 | 3757 | int64_t targCEshifted = targCE >> 32; |
michael@0 | 3758 | int64_t patCEshifted = patCE >> 32; |
michael@0 | 3759 | int64_t mask; |
michael@0 | 3760 | |
michael@0 | 3761 | mask = 0xFFFF0000; |
michael@0 | 3762 | int32_t targLev1 = (int32_t)(targCEshifted & mask); |
michael@0 | 3763 | int32_t patLev1 = (int32_t)(patCEshifted & mask); |
michael@0 | 3764 | if ( targLev1 != patLev1 ) { |
michael@0 | 3765 | if ( targLev1 == 0 ) { |
michael@0 | 3766 | return U_CE_SKIP_TARG; |
michael@0 | 3767 | } |
michael@0 | 3768 | if ( patLev1 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) { |
michael@0 | 3769 | return U_CE_SKIP_PATN; |
michael@0 | 3770 | } |
michael@0 | 3771 | return U_CE_NO_MATCH; |
michael@0 | 3772 | } |
michael@0 | 3773 | |
michael@0 | 3774 | mask = 0x0000FFFF; |
michael@0 | 3775 | int32_t targLev2 = (int32_t)(targCEshifted & mask); |
michael@0 | 3776 | int32_t patLev2 = (int32_t)(patCEshifted & mask); |
michael@0 | 3777 | if ( targLev2 != patLev2 ) { |
michael@0 | 3778 | if ( targLev2 == 0 ) { |
michael@0 | 3779 | return U_CE_SKIP_TARG; |
michael@0 | 3780 | } |
michael@0 | 3781 | if ( patLev2 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) { |
michael@0 | 3782 | return U_CE_SKIP_PATN; |
michael@0 | 3783 | } |
michael@0 | 3784 | return (patLev2 == U_CE_LEVEL2_BASE || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev2 == U_CE_LEVEL2_BASE) )? |
michael@0 | 3785 | U_CE_MATCH: U_CE_NO_MATCH; |
michael@0 | 3786 | } |
michael@0 | 3787 | |
michael@0 | 3788 | mask = 0xFFFF0000; |
michael@0 | 3789 | int32_t targLev3 = (int32_t)(targCE & mask); |
michael@0 | 3790 | int32_t patLev3 = (int32_t)(patCE & mask); |
michael@0 | 3791 | if ( targLev3 != patLev3 ) { |
michael@0 | 3792 | return (patLev3 == U_CE_LEVEL3_BASE || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev3 == U_CE_LEVEL3_BASE) )? |
michael@0 | 3793 | U_CE_MATCH: U_CE_NO_MATCH; |
michael@0 | 3794 | } |
michael@0 | 3795 | |
michael@0 | 3796 | return U_CE_MATCH; |
michael@0 | 3797 | } |
michael@0 | 3798 | |
michael@0 | 3799 | #if BOYER_MOORE |
michael@0 | 3800 | // TODO: #if BOYER_MOORE, need 32-bit version of compareCE64s |
michael@0 | 3801 | #endif |
michael@0 | 3802 | |
// usearch_search: forward search for the pattern in collation element (CE) space.
//
// The text iterator is positioned at startIdx, target CEs are pulled through a CEBuffer,
// and each target CE index is tried in turn as the start of a match against pattern.PCE.
// A match in CE space is then mapped back to string indices and is rejected unless both
// ends fall on break boundaries (isBreakBoundary), neither end splits an expansion, and
// checkIdentical() accepts the range.
//
// Parameters:
//   strsrch    - search object supplying the pattern CE tables, the text iterator and
//                the element comparison type.
//   startIdx   - text offset handed to ucol_setOffset(); must lie in [0, textLength].
//   matchStart - optional output (may be NULL): start index of the match, or -1 if none.
//   matchLimit - optional output (may be NULL): limit index of the match, or -1 if none.
//   status     - in/out error code. If it already indicates failure the function returns
//                FALSE at once. It is set to U_ILLEGAL_ARGUMENT_ERROR when the pattern has
//                no CEs, pattern.CE is NULL or startIdx is out of range, and to
//                U_INTERNAL_PROGRAM_ERROR when ceb.get(targetIx) returns NULL.
// Returns TRUE if a match was found, FALSE otherwise.
//
// NOTE(review): status is not re-checked after initializePatternPCETable() and the CEBuffer
//   construction - confirm whether either can fail (for example on allocation) and whether
//   an early return is needed before pattern.PCE is read.
// NOTE(review): only the first ceb.get() per outer iteration is NULL-checked; the later
//   results (targetCEI, lastCEI, nextCEI) are dereferenced directly - confirm this is safe.
// NOTE(review): one assignment below uses lowercase false while every other uses FALSE.
michael@0 | 3803 | U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch, |
michael@0 | 3804 | int32_t startIdx, |
michael@0 | 3805 | int32_t *matchStart, |
michael@0 | 3806 | int32_t *matchLimit, |
michael@0 | 3807 | UErrorCode *status) |
michael@0 | 3808 | { |
michael@0 | 3809 | if (U_FAILURE(*status)) { |
michael@0 | 3810 | return FALSE; |
michael@0 | 3811 | } |
michael@0 | 3812 | |
michael@0 | 3813 | // TODO: reject search patterns beginning with a combining char. |
michael@0 | 3814 | |
michael@0 | 3815 | #ifdef USEARCH_DEBUG |
michael@0 | 3816 | if (getenv("USEARCH_DEBUG") != NULL) { |
michael@0 | 3817 | printf("Pattern CEs\n"); |
michael@0 | 3818 | for (int ii=0; ii<strsrch->pattern.CELength; ii++) { |
michael@0 | 3819 | printf(" %8x", strsrch->pattern.CE[ii]); |
michael@0 | 3820 | } |
michael@0 | 3821 | printf("\n"); |
michael@0 | 3822 | } |
michael@0 | 3823 | |
michael@0 | 3824 | #endif |
michael@0 | 3825 | // Input parameter sanity check. |
michael@0 | 3826 | // TODO: should input indices clip to the text length |
michael@0 | 3827 | // in the same way that UText does. |
michael@0 | 3828 | if(strsrch->pattern.CELength == 0 || |
michael@0 | 3829 | startIdx < 0 || |
michael@0 | 3830 | startIdx > strsrch->search->textLength || |
michael@0 | 3831 | strsrch->pattern.CE == NULL) { |
michael@0 | 3832 | *status = U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 3833 | return FALSE; |
michael@0 | 3834 | } |
michael@0 | 3835 | |
michael@0 | 3836 | if (strsrch->pattern.PCE == NULL) { |
michael@0 | 3837 | initializePatternPCETable(strsrch, status); |
michael@0 | 3838 | } |
michael@0 | 3839 | |
michael@0 | 3840 | ucol_setOffset(strsrch->textIter, startIdx, status); |
michael@0 | 3841 | CEBuffer ceb(strsrch, status); |
michael@0 | 3842 | |
michael@0 | 3843 | |
michael@0 | 3844 | int32_t targetIx = 0; |
michael@0 | 3845 | const CEI *targetCEI = NULL; |
michael@0 | 3846 | int32_t patIx; |
michael@0 | 3847 | UBool found; |
michael@0 | 3848 | |
michael@0 | 3849 | int32_t mStart = -1; |
michael@0 | 3850 | int32_t mLimit = -1; |
michael@0 | 3851 | int32_t minLimit; |
michael@0 | 3852 | int32_t maxLimit; |
michael@0 | 3853 | |
michael@0 | 3854 | |
michael@0 | 3855 | |
michael@0 | 3856 | // Outer loop moves over match starting positions in the |
michael@0 | 3857 | // target CE space. |
michael@0 | 3858 | // Here we see the target as a sequence of collation elements, resulting from the following: |
michael@0 | 3859 | // 1. Target characters were decomposed, and (if appropriate) other compressions and expansions are applied |
michael@0 | 3860 | // (for example, digraphs such as IJ may be broken into two characters). |
michael@0 | 3861 | // 2. An int64_t CE weight is determined for each resulting unit (high 16 bits are primary strength, next |
michael@0 | 3862 | // 16 bits are secondary, next 16 (the high 16 bits of the low 32-bit half) are tertiary). Any of these |
michael@0 | 3863 | // fields that are for strengths below that of the collator are set to 0. If this makes the int64_t |
michael@0 | 3864 | // CE weight 0 (as for a combining diacritic with secondary weight when the collator strength is primary), |
michael@0 | 3865 | // then the CE is deleted, so the following code sees only CEs that are relevant. |
michael@0 | 3866 | // For each CE, the lowIndex and highIndex correspond to where this CE begins and ends in the original text. |
michael@0 | 3867 | // If lowIndex==highIndex, either the CE resulted from an expansion/decomposition of one of the original text |
michael@0 | 3868 | // characters, or the CE marks the limit of the target text (in which case the CE weight is UCOL_PROCESSED_NULLORDER). |
michael@0 | 3869 | // |
michael@0 | 3870 | for(targetIx=0; ; targetIx++) |
michael@0 | 3871 | { |
michael@0 | 3872 | found = TRUE; |
michael@0 | 3873 | // Inner loop checks for a match beginning at each |
michael@0 | 3874 | // position from the outer loop. |
michael@0 | 3875 | int32_t targetIxOffset = 0; |
michael@0 | 3876 | int64_t patCE = 0; |
michael@0 | 3877 | // For targetIx > 0, this ceb.get gets a CE that is as far back in the ring buffer |
michael@0 | 3878 | // (compared to the last CE fetched for the previous targetIx value) as we need to go |
michael@0 | 3879 | // for this targetIx value, so if it is non-NULL then other ceb.get calls should be OK. |
michael@0 | 3880 | const CEI *firstCEI = ceb.get(targetIx); |
michael@0 | 3881 | if (firstCEI == NULL) { |
michael@0 | 3882 | *status = U_INTERNAL_PROGRAM_ERROR; |
michael@0 | 3883 | found = FALSE; |
michael@0 | 3884 | break; |
michael@0 | 3885 | } |
michael@0 | 3886 | |
michael@0 | 3887 | for (patIx=0; patIx<strsrch->pattern.PCELength; patIx++) { |
michael@0 | 3888 | patCE = strsrch->pattern.PCE[patIx]; |
michael@0 | 3889 | targetCEI = ceb.get(targetIx+patIx+targetIxOffset); |
michael@0 | 3890 | // Compare CE from target string with CE from the pattern. |
michael@0 | 3891 | // Note that the target CE will be UCOL_PROCESSED_NULLORDER if we reach the end of input, |
michael@0 | 3892 | // which will fail the compare, below. |
michael@0 | 3893 | UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType); |
michael@0 | 3894 | if ( ceMatch == U_CE_NO_MATCH ) { |
michael@0 | 3895 | found = FALSE; |
michael@0 | 3896 | break; |
michael@0 | 3897 | } else if ( ceMatch > U_CE_NO_MATCH ) { |
michael@0 | 3898 | if ( ceMatch == U_CE_SKIP_TARG ) { |
michael@0 | 3899 | // redo with same patCE, next targCE |
michael@0 | 3900 | patIx--; |
michael@0 | 3901 | targetIxOffset++; |
michael@0 | 3902 | } else { // ceMatch == U_CE_SKIP_PATN |
michael@0 | 3903 | // redo with same targCE, next patCE |
michael@0 | 3904 | targetIxOffset--; |
michael@0 | 3905 | } |
michael@0 | 3906 | } |
michael@0 | 3907 | } |
michael@0 | 3908 | targetIxOffset += strsrch->pattern.PCELength; // this is now the offset in target CE space to end of the match so far |
michael@0 | 3909 | |
michael@0 | 3910 | if (!found && ((targetCEI == NULL) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) { |
michael@0 | 3911 | // No match at this targetIx. Try again at the next. |
michael@0 | 3912 | continue; |
michael@0 | 3913 | } |
michael@0 | 3914 | |
michael@0 | 3915 | if (!found) { |
michael@0 | 3916 | // No match at all, we have run off the end of the target text. |
michael@0 | 3917 | break; |
michael@0 | 3918 | } |
michael@0 | 3919 | |
michael@0 | 3920 | |
michael@0 | 3921 | // We have found a match in CE space. |
michael@0 | 3922 | // Now determine the bounds in string index space. |
michael@0 | 3923 | // There still is a chance of match failure if the CE range does not correspond to |
michael@0 | 3924 | // an acceptable character range. |
michael@0 | 3925 | // |
michael@0 | 3926 | const CEI *lastCEI = ceb.get(targetIx + targetIxOffset - 1); |
michael@0 | 3927 | |
michael@0 | 3928 | mStart = firstCEI->lowIndex; |
michael@0 | 3929 | minLimit = lastCEI->lowIndex; |
michael@0 | 3930 | |
michael@0 | 3931 | // Look at the CE following the match. If it is UCOL_PROCESSED_NULLORDER the match |
michael@0 | 3932 | // extended to the end of input, and the match is good. |
michael@0 | 3933 | |
michael@0 | 3934 | // Look at the high and low indices of the CE following the match. If |
michael@0 | 3935 | // they are the same it means one of two things: |
michael@0 | 3936 | // 1. The match extended to the last CE from the target text, which is OK, or |
michael@0 | 3937 | // 2. The last CE that was part of the match is in an expansion that extends |
michael@0 | 3938 | // to the first CE after the match. In this case, we reject the match. |
michael@0 | 3939 | const CEI *nextCEI = 0; |
michael@0 | 3940 | if (strsrch->search->elementComparisonType == 0) { |
michael@0 | 3941 | nextCEI = ceb.get(targetIx + targetIxOffset); |
michael@0 | 3942 | maxLimit = nextCEI->lowIndex; |
michael@0 | 3943 | if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) { |
michael@0 | 3944 | found = FALSE; |
michael@0 | 3945 | } |
michael@0 | 3946 | } else { |
michael@0 | 3947 | for ( ; ; ++targetIxOffset ) { |
michael@0 | 3948 | nextCEI = ceb.get(targetIx + targetIxOffset); |
michael@0 | 3949 | maxLimit = nextCEI->lowIndex; |
michael@0 | 3950 | // If we are at the end of the target too, match succeeds |
michael@0 | 3951 | if ( nextCEI->ce == UCOL_PROCESSED_NULLORDER ) { |
michael@0 | 3952 | break; |
michael@0 | 3953 | } |
michael@0 | 3954 | // As long as the next CE has primary weight of 0, |
michael@0 | 3955 | // it is part of the last target element matched by the pattern; |
michael@0 | 3956 | // make sure it can be part of a match with the last patCE |
michael@0 | 3957 | if ( (((nextCEI->ce) >> 32) & 0xFFFF0000UL) == 0 ) { |
michael@0 | 3958 | UCompareCEsResult ceMatch = compareCE64s(nextCEI->ce, patCE, strsrch->search->elementComparisonType); |
michael@0 | 3959 | if ( ceMatch == U_CE_NO_MATCH || ceMatch == U_CE_SKIP_PATN ) { |
michael@0 | 3960 | found = FALSE; |
michael@0 | 3961 | break; |
michael@0 | 3962 | } |
michael@0 | 3963 | // If lowIndex == highIndex, this target CE is part of an expansion of the last matched |
michael@0 | 3964 | // target element, but it has non-zero primary weight => match fails |
michael@0 | 3965 | } else if ( nextCEI->lowIndex == nextCEI->highIndex ) { |
michael@0 | 3966 | found = false; |
michael@0 | 3967 | break; |
michael@0 | 3968 | // Else the target CE is not part of an expansion of the last matched element, match succeeds |
michael@0 | 3969 | } else { |
michael@0 | 3970 | break; |
michael@0 | 3971 | } |
michael@0 | 3972 | } |
michael@0 | 3973 | } |
michael@0 | 3974 | |
michael@0 | 3975 | |
michael@0 | 3976 | // Check for the start of the match being within a combining sequence. |
michael@0 | 3977 | // This can happen if the pattern itself begins with a combining char, and |
michael@0 | 3978 | // the match found combining marks in the target text that were attached |
michael@0 | 3979 | // to something else. |
michael@0 | 3980 | // This type of match should be rejected for not completely consuming a |
michael@0 | 3981 | // combining sequence. |
michael@0 | 3982 | if (!isBreakBoundary(strsrch, mStart)) { |
michael@0 | 3983 | found = FALSE; |
michael@0 | 3984 | } |
michael@0 | 3985 | |
michael@0 | 3986 | // Check for the start of the match being within a Collation Element Expansion, |
michael@0 | 3987 | // meaning that the first char of the match is only partially matched. |
michael@0 | 3988 | // With expansions, the first CE will report the index of the source |
michael@0 | 3989 | // character, and all subsequent (expansions) CEs will report the source index of the |
michael@0 | 3990 | // _following_ character. |
michael@0 | 3991 | int32_t secondIx = firstCEI->highIndex; |
michael@0 | 3992 | if (mStart == secondIx) { |
michael@0 | 3993 | found = FALSE; |
michael@0 | 3994 | } |
michael@0 | 3995 | |
michael@0 | 3996 | // Advance the match end position to the first acceptable match boundary. |
michael@0 | 3997 | // This advances the index over any combining characters. |
michael@0 | 3998 | mLimit = maxLimit; |
michael@0 | 3999 | if (minLimit < maxLimit) { |
michael@0 | 4000 | // When the last CE's low index is the same as its high index, the CE is likely |
michael@0 | 4001 | // part of an expansion. In this case, the index is located just after the |
michael@0 | 4002 | // character corresponding to the CEs compared above. If the index is right |
michael@0 | 4003 | // at a break boundary, moving the position to the next boundary would give an |
michael@0 | 4004 | // incorrect match length when ignorable characters exist between |
michael@0 | 4005 | // the position and the next character that produces CE(s). See ticket#8482. |
michael@0 | 4006 | if (minLimit == lastCEI->highIndex && isBreakBoundary(strsrch, minLimit)) { |
michael@0 | 4007 | mLimit = minLimit; |
michael@0 | 4008 | } else { |
michael@0 | 4009 | int32_t nba = nextBoundaryAfter(strsrch, minLimit); |
michael@0 | 4010 | if (nba >= lastCEI->highIndex) { |
michael@0 | 4011 | mLimit = nba; |
michael@0 | 4012 | } |
michael@0 | 4013 | } |
michael@0 | 4014 | } |
michael@0 | 4015 | |
michael@0 | 4016 | #ifdef USEARCH_DEBUG |
michael@0 | 4017 | if (getenv("USEARCH_DEBUG") != NULL) { |
michael@0 | 4018 | printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit); |
michael@0 | 4019 | } |
michael@0 | 4020 | #endif |
michael@0 | 4021 | |
michael@0 | 4022 | // If advancing to the end of a combining sequence in character indexing space |
michael@0 | 4023 | // advanced us beyond the end of the match in CE space, reject this match. |
michael@0 | 4024 | if (mLimit > maxLimit) { |
michael@0 | 4025 | found = FALSE; |
michael@0 | 4026 | } |
michael@0 | 4027 | |
michael@0 | 4028 | if (!isBreakBoundary(strsrch, mLimit)) { |
michael@0 | 4029 | found = FALSE; |
michael@0 | 4030 | } |
michael@0 | 4031 | |
michael@0 | 4032 | if (! checkIdentical(strsrch, mStart, mLimit)) { |
michael@0 | 4033 | found = FALSE; |
michael@0 | 4034 | } |
michael@0 | 4035 | |
michael@0 | 4036 | if (found) { |
michael@0 | 4037 | break; |
michael@0 | 4038 | } |
michael@0 | 4039 | } |
michael@0 | 4040 | |
michael@0 | 4041 | #ifdef USEARCH_DEBUG |
michael@0 | 4042 | if (getenv("USEARCH_DEBUG") != NULL) { |
michael@0 | 4043 | printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx); |
michael@0 | 4044 | int32_t lastToPrint = ceb.limitIx+2; |
michael@0 | 4045 | for (int ii=ceb.firstIx; ii<lastToPrint; ii++) { |
michael@0 | 4046 | printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex); |
michael@0 | 4047 | } |
michael@0 | 4048 | printf("\n%s\n", found? "match found" : "no match"); |
michael@0 | 4049 | } |
michael@0 | 4050 | #endif |
michael@0 | 4051 | |
michael@0 | 4052 | // All Done. Store back the match bounds to the caller. |
michael@0 | 4053 | // |
michael@0 | 4054 | if (found==FALSE) { |
michael@0 | 4055 | mLimit = -1; |
michael@0 | 4056 | mStart = -1; |
michael@0 | 4057 | } |
michael@0 | 4058 | |
michael@0 | 4059 | if (matchStart != NULL) { |
michael@0 | 4060 | *matchStart= mStart; |
michael@0 | 4061 | } |
michael@0 | 4062 | |
michael@0 | 4063 | if (matchLimit != NULL) { |
michael@0 | 4064 | *matchLimit = mLimit; |
michael@0 | 4065 | } |
michael@0 | 4066 | |
michael@0 | 4067 | return found; |
michael@0 | 4068 | } |
michael@0 | 4069 | |
// usearch_searchBackwards: backward search for the pattern in collation element (CE) space.
//
// Target CEs are fetched with ceb.getPrevious(), so a larger targetIx is closer to the
// start of the text. The pattern is compared from its last PCE toward its first. A match
// in CE space is mapped back to string indices and rejected unless the start is on a break
// boundary, is not inside an expansion, and checkIdentical() accepts the range; when a CE
// follows the match (targetIx > 0) the limit must also be on a break boundary.
//
// Parameters:
//   strsrch    - search object supplying the pattern CE tables, the text iterator, the
//                internal break iterator and the element comparison type.
//   startIdx   - must lie in [0, textLength]. When it is below textLength the iterator is
//                positioned at the boundary returned by ubrk_following(startIdx) and CEs
//                are pre-loaded until one whose lowIndex is below startIdx is seen;
//                otherwise the iterator is positioned at startIdx itself.
//   matchStart - optional output (may be NULL): start index of the match, or -1 if none.
//   matchLimit - optional output (may be NULL): limit index of the match, or -1 if none.
//   status     - in/out error code. If it already indicates failure the function returns
//                FALSE at once. It is set to U_ILLEGAL_ARGUMENT_ERROR when the pattern has
//                no CEs, pattern.CE is NULL or startIdx is out of range, and to
//                U_INTERNAL_PROGRAM_ERROR when ceb.getPrevious(targetIx) returns NULL in
//                the main loop.
// Returns TRUE if a match was found, FALSE otherwise.
//
// NOTE(review): the pre-load loop dereferences ceb.getPrevious() without a NULL check and
//   has no exit other than finding lowIndex < startIdx - confirm it always terminates.
// NOTE(review): status is not re-checked after initializePatternPCETable(), the CEBuffer
//   construction or ucol_setOffset() - confirm whether an early return is needed.
michael@0 | 4070 | U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch *strsrch, |
michael@0 | 4071 | int32_t startIdx, |
michael@0 | 4072 | int32_t *matchStart, |
michael@0 | 4073 | int32_t *matchLimit, |
michael@0 | 4074 | UErrorCode *status) |
michael@0 | 4075 | { |
michael@0 | 4076 | if (U_FAILURE(*status)) { |
michael@0 | 4077 | return FALSE; |
michael@0 | 4078 | } |
michael@0 | 4079 | |
michael@0 | 4080 | // TODO: reject search patterns beginning with a combining char. |
michael@0 | 4081 | |
michael@0 | 4082 | #ifdef USEARCH_DEBUG |
michael@0 | 4083 | if (getenv("USEARCH_DEBUG") != NULL) { |
michael@0 | 4084 | printf("Pattern CEs\n"); |
michael@0 | 4085 | for (int ii=0; ii<strsrch->pattern.CELength; ii++) { |
michael@0 | 4086 | printf(" %8x", strsrch->pattern.CE[ii]); |
michael@0 | 4087 | } |
michael@0 | 4088 | printf("\n"); |
michael@0 | 4089 | } |
michael@0 | 4090 | |
michael@0 | 4091 | #endif |
michael@0 | 4092 | // Input parameter sanity check. |
michael@0 | 4093 | // TODO: should input indices clip to the text length |
michael@0 | 4094 | // in the same way that UText does. |
michael@0 | 4095 | if(strsrch->pattern.CELength == 0 || |
michael@0 | 4096 | startIdx < 0 || |
michael@0 | 4097 | startIdx > strsrch->search->textLength || |
michael@0 | 4098 | strsrch->pattern.CE == NULL) { |
michael@0 | 4099 | *status = U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 4100 | return FALSE; |
michael@0 | 4101 | } |
michael@0 | 4102 | |
michael@0 | 4103 | if (strsrch->pattern.PCE == NULL) { |
michael@0 | 4104 | initializePatternPCETable(strsrch, status); |
michael@0 | 4105 | } |
michael@0 | 4106 | |
michael@0 | 4107 | CEBuffer ceb(strsrch, status); |
michael@0 | 4108 | int32_t targetIx = 0; |
michael@0 | 4109 | |
michael@0 | 4110 | /* |
michael@0 | 4111 | * Pre-load the buffer with the CE's for the grapheme |
michael@0 | 4112 | * after our starting position so that we're sure that |
michael@0 | 4113 | * we can look at the CE following the match when we |
michael@0 | 4114 | * check the match boundaries. |
michael@0 | 4115 | * |
michael@0 | 4116 | * This will also pre-fetch the first CE that we'll |
michael@0 | 4117 | * consider for the match. |
michael@0 | 4118 | */ |
michael@0 | 4119 | if (startIdx < strsrch->search->textLength) { |
michael@0 | 4120 | UBreakIterator *bi = strsrch->search->internalBreakIter; |
michael@0 | 4121 | int32_t next = ubrk_following(bi, startIdx); |
michael@0 | 4122 | |
michael@0 | 4123 | ucol_setOffset(strsrch->textIter, next, status); |
michael@0 | 4124 | |
michael@0 | 4125 | for (targetIx = 0; ; targetIx += 1) { |
michael@0 | 4126 | if (ceb.getPrevious(targetIx)->lowIndex < startIdx) { |
michael@0 | 4127 | break; |
michael@0 | 4128 | } |
michael@0 | 4129 | } |
michael@0 | 4130 | } else { |
michael@0 | 4131 | ucol_setOffset(strsrch->textIter, startIdx, status); |
michael@0 | 4132 | } |
michael@0 | 4133 | |
michael@0 | 4134 | |
michael@0 | 4135 | const CEI *targetCEI = NULL; |
michael@0 | 4136 | int32_t patIx; |
michael@0 | 4137 | UBool found; |
michael@0 | 4138 | |
michael@0 | 4139 | int32_t limitIx = targetIx; |
michael@0 | 4140 | int32_t mStart = -1; |
michael@0 | 4141 | int32_t mLimit = -1; |
michael@0 | 4142 | int32_t minLimit; |
michael@0 | 4143 | int32_t maxLimit; |
michael@0 | 4144 | |
michael@0 | 4145 | |
michael@0 | 4146 | |
michael@0 | 4147 | // Outer loop moves over match starting positions in the |
michael@0 | 4148 | // target CE space. |
michael@0 | 4149 | // Here, targetIx values increase toward the beginning of the base text (i.e. we get the text CEs in reverse order). |
michael@0 | 4150 | // But patIx is 0 at the beginning of the pattern and increases toward the end. |
michael@0 | 4151 | // So this loop performs a comparison starting with the end of pattern, and proceeds toward the beginning of the pattern |
michael@0 | 4152 | // and the beginning of the base text. |
michael@0 | 4153 | for(targetIx = limitIx; ; targetIx += 1) |
michael@0 | 4154 | { |
michael@0 | 4155 | found = TRUE; |
michael@0 | 4156 | // For targetIx > limitIx, this ceb.getPrevious gets a CE that is as far back in the ring buffer |
michael@0 | 4157 | // (compared to the last CE fetched for the previous targetIx value) as we need to go |
michael@0 | 4158 | // for this targetIx value, so if it is non-NULL then other ceb.getPrevious calls should be OK. |
michael@0 | 4159 | const CEI *lastCEI = ceb.getPrevious(targetIx); |
michael@0 | 4160 | if (lastCEI == NULL) { |
michael@0 | 4161 | *status = U_INTERNAL_PROGRAM_ERROR; |
michael@0 | 4162 | found = FALSE; |
michael@0 | 4163 | break; |
michael@0 | 4164 | } |
michael@0 | 4165 | // Inner loop checks for a match beginning at each |
michael@0 | 4166 | // position from the outer loop. |
michael@0 | 4167 | int32_t targetIxOffset = 0; |
michael@0 | 4168 | for (patIx = strsrch->pattern.PCELength - 1; patIx >= 0; patIx -= 1) { |
michael@0 | 4169 | int64_t patCE = strsrch->pattern.PCE[patIx]; |
michael@0 | 4170 | |
michael@0 | 4171 | targetCEI = ceb.getPrevious(targetIx + strsrch->pattern.PCELength - 1 - patIx + targetIxOffset); |
michael@0 | 4172 | // Compare CE from target string with CE from the pattern. |
michael@0 | 4173 | // Note that the target CE will be UCOL_PROCESSED_NULLORDER if we reach the end of input, |
michael@0 | 4174 | // which will fail the compare, below. |
michael@0 | 4175 | UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType); |
michael@0 | 4176 | if ( ceMatch == U_CE_NO_MATCH ) { |
michael@0 | 4177 | found = FALSE; |
michael@0 | 4178 | break; |
michael@0 | 4179 | } else if ( ceMatch > U_CE_NO_MATCH ) { |
michael@0 | 4180 | if ( ceMatch == U_CE_SKIP_TARG ) { |
michael@0 | 4181 | // redo with same patCE, next targCE |
michael@0 | 4182 | patIx++; |
michael@0 | 4183 | targetIxOffset++; |
michael@0 | 4184 | } else { // ceMatch == U_CE_SKIP_PATN |
michael@0 | 4185 | // redo with same targCE, next patCE |
michael@0 | 4186 | targetIxOffset--; |
michael@0 | 4187 | } |
michael@0 | 4188 | } |
michael@0 | 4189 | } |
michael@0 | 4190 | |
michael@0 | 4191 | if (!found && ((targetCEI == NULL) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) { |
michael@0 | 4192 | // No match at this targetIx. Try again at the next. |
michael@0 | 4193 | continue; |
michael@0 | 4194 | } |
michael@0 | 4195 | |
michael@0 | 4196 | if (!found) { |
michael@0 | 4197 | // No match at all, we have run off the end of the target text. |
michael@0 | 4198 | break; |
michael@0 | 4199 | } |
michael@0 | 4200 | |
michael@0 | 4201 | |
michael@0 | 4202 | // We have found a match in CE space. |
michael@0 | 4203 | // Now determine the bounds in string index space. |
michael@0 | 4204 | // There still is a chance of match failure if the CE range does not correspond to |
michael@0 | 4205 | // an acceptable character range. |
michael@0 | 4206 | // |
michael@0 | 4207 | const CEI *firstCEI = ceb.getPrevious(targetIx + strsrch->pattern.PCELength - 1 + targetIxOffset); |
michael@0 | 4208 | mStart = firstCEI->lowIndex; |
michael@0 | 4209 | |
michael@0 | 4210 | // Check for the start of the match being within a combining sequence. |
michael@0 | 4211 | // This can happen if the pattern itself begins with a combining char, and |
michael@0 | 4212 | // the match found combining marks in the target text that were attached |
michael@0 | 4213 | // to something else. |
michael@0 | 4214 | // This type of match should be rejected for not completely consuming a |
michael@0 | 4215 | // combining sequence. |
michael@0 | 4216 | if (!isBreakBoundary(strsrch, mStart)) { |
michael@0 | 4217 | found = FALSE; |
michael@0 | 4218 | } |
michael@0 | 4219 | |
michael@0 | 4220 | // Look at the high index of the first CE in the match. If it's the same as the |
michael@0 | 4221 | // low index, the first CE in the match is in the middle of an expansion. |
michael@0 | 4222 | if (mStart == firstCEI->highIndex) { |
michael@0 | 4223 | found = FALSE; |
michael@0 | 4224 | } |
michael@0 | 4225 | |
michael@0 | 4226 | |
michael@0 | 4227 | minLimit = lastCEI->lowIndex; |
michael@0 | 4228 | |
michael@0 | 4229 | if (targetIx > 0) { |
michael@0 | 4230 | // Look at the CE following the match. If it is UCOL_PROCESSED_NULLORDER the match |
michael@0 | 4231 | // extended to the end of input, and the match is good. |
michael@0 | 4232 | |
michael@0 | 4233 | // Look at the high and low indices of the CE following the match. If |
michael@0 | 4234 | // they are the same it means one of two things: |
michael@0 | 4235 | // 1. The match extended to the last CE from the target text, which is OK, or |
michael@0 | 4236 | // 2. The last CE that was part of the match is in an expansion that extends |
michael@0 | 4237 | // to the first CE after the match. In this case, we reject the match. |
michael@0 | 4238 | const CEI *nextCEI = ceb.getPrevious(targetIx - 1); |
michael@0 | 4239 | |
michael@0 | 4240 | if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) { |
michael@0 | 4241 | found = FALSE; |
michael@0 | 4242 | } |
michael@0 | 4243 | |
michael@0 | 4244 | mLimit = maxLimit = nextCEI->lowIndex; |
michael@0 | 4245 | |
michael@0 | 4246 | // Advance the match end position to the first acceptable match boundary. |
michael@0 | 4247 | // This advances the index over any combining characters. |
michael@0 | 4248 | if (minLimit < maxLimit) { |
michael@0 | 4249 | int32_t nba = nextBoundaryAfter(strsrch, minLimit); |
michael@0 | 4250 | |
michael@0 | 4251 | if (nba >= lastCEI->highIndex) { |
michael@0 | 4252 | mLimit = nba; |
michael@0 | 4253 | } |
michael@0 | 4254 | } |
michael@0 | 4255 | |
michael@0 | 4256 | // If advancing to the end of a combining sequence in character indexing space |
michael@0 | 4257 | // advanced us beyond the end of the match in CE space, reject this match. |
michael@0 | 4258 | if (mLimit > maxLimit) { |
michael@0 | 4259 | found = FALSE; |
michael@0 | 4260 | } |
michael@0 | 4261 | |
michael@0 | 4262 | // Make sure the end of the match is on a break boundary |
michael@0 | 4263 | if (!isBreakBoundary(strsrch, mLimit)) { |
michael@0 | 4264 | found = FALSE; |
michael@0 | 4265 | } |
michael@0 | 4266 | |
michael@0 | 4267 | } else { |
michael@0 | 4268 | // No non-ignorable CEs after this point. |
michael@0 | 4269 | // The maximum position is detected by boundary after |
michael@0 | 4270 | // the last non-ignorable CE. Combining sequence |
michael@0 | 4271 | // across the start index will be truncated. |
michael@0 | 4272 | int32_t nba = nextBoundaryAfter(strsrch, minLimit); |
michael@0 | 4273 | mLimit = maxLimit = (nba > 0) && (startIdx > nba) ? nba : startIdx; |
michael@0 | 4274 | } |
michael@0 | 4275 | |
michael@0 | 4276 | #ifdef USEARCH_DEBUG |
michael@0 | 4277 | if (getenv("USEARCH_DEBUG") != NULL) { |
michael@0 | 4278 | printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit); |
michael@0 | 4279 | } |
michael@0 | 4280 | #endif |
michael@0 | 4281 | |
michael@0 | 4282 | |
michael@0 | 4283 | if (! checkIdentical(strsrch, mStart, mLimit)) { |
michael@0 | 4284 | found = FALSE; |
michael@0 | 4285 | } |
michael@0 | 4286 | |
michael@0 | 4287 | if (found) { |
michael@0 | 4288 | break; |
michael@0 | 4289 | } |
michael@0 | 4290 | } |
michael@0 | 4291 | |
michael@0 | 4292 | #ifdef USEARCH_DEBUG |
michael@0 | 4293 | if (getenv("USEARCH_DEBUG") != NULL) { |
michael@0 | 4294 | printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx); |
michael@0 | 4295 | int32_t lastToPrint = ceb.limitIx+2; |
michael@0 | 4296 | for (int ii=ceb.firstIx; ii<lastToPrint; ii++) { |
michael@0 | 4297 | printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex); |
michael@0 | 4298 | } |
michael@0 | 4299 | printf("\n%s\n", found? "match found" : "no match"); |
michael@0 | 4300 | } |
michael@0 | 4301 | #endif |
michael@0 | 4302 | |
michael@0 | 4303 | // All Done. Store back the match bounds to the caller. |
michael@0 | 4304 | // |
michael@0 | 4305 | if (found==FALSE) { |
michael@0 | 4306 | mLimit = -1; |
michael@0 | 4307 | mStart = -1; |
michael@0 | 4308 | } |
michael@0 | 4309 | |
michael@0 | 4310 | if (matchStart != NULL) { |
michael@0 | 4311 | *matchStart= mStart; |
michael@0 | 4312 | } |
michael@0 | 4313 | |
michael@0 | 4314 | if (matchLimit != NULL) { |
michael@0 | 4315 | *matchLimit = mLimit; |
michael@0 | 4316 | } |
michael@0 | 4317 | |
michael@0 | 4318 | return found; |
michael@0 | 4319 | } |
michael@0 | 4320 | |
michael@0 | 4321 | // internal use methods declared in usrchimp.h ----------------------------- |
michael@0 | 4322 | |
// usearch_handleNextExact: finds the next exact match after the current text iterator
// offset and records the result in strsrch->search.
//
// If status already indicates failure, the match is cleared with setMatchNotFound() and
// FALSE is returned. BOYER_MOORE is defined as 0 near the top of this file, so only the
// #else branch at the end of the body is compiled: it reads the current offset with
// ucol_getOffset(), calls usearch_search() from there, and on success stores the match
// start in matchedIndex and (end - start) in matchedLength.
//
// Returns TRUE when a match was found; otherwise calls setMatchNotFound() and returns FALSE.
//
// The #if BOYER_MOORE branch is the disabled shift-table implementation; its TODO comments
// say it still needs a 32-bit version of compareCE64s before it could be enabled again.
michael@0 | 4323 | UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status) |
michael@0 | 4324 | { |
michael@0 | 4325 | if (U_FAILURE(*status)) { |
michael@0 | 4326 | setMatchNotFound(strsrch); |
michael@0 | 4327 | return FALSE; |
michael@0 | 4328 | } |
michael@0 | 4329 | |
michael@0 | 4330 | #if BOYER_MOORE |
michael@0 | 4331 | UCollationElements *coleiter = strsrch->textIter; |
michael@0 | 4332 | int32_t textlength = strsrch->search->textLength; |
michael@0 | 4333 | int32_t *patternce = strsrch->pattern.CE; |
michael@0 | 4334 | int32_t patterncelength = strsrch->pattern.CELength; |
michael@0 | 4335 | int32_t textoffset = ucol_getOffset(coleiter); |
michael@0 | 4336 | |
michael@0 | 4337 | // status used in setting coleiter offset, since offset is checked in |
michael@0 | 4338 | // shiftForward before setting the coleiter offset, status is never |
michael@0 | 4339 | // a failure |
michael@0 | 4340 | textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER, |
michael@0 | 4341 | patterncelength); |
michael@0 | 4342 | while (textoffset <= textlength) |
michael@0 | 4343 | { |
michael@0 | 4344 | uint32_t patternceindex = patterncelength - 1; |
michael@0 | 4345 | int32_t targetce; |
michael@0 | 4346 | UBool found = FALSE; |
michael@0 | 4347 | int32_t lastce = UCOL_NULLORDER; |
michael@0 | 4348 | |
michael@0 | 4349 | setColEIterOffset(coleiter, textoffset); |
michael@0 | 4350 | |
michael@0 | 4351 | for (;;) { |
michael@0 | 4352 | // finding the last pattern ce match, imagine composite characters |
michael@0 | 4353 | // for example: search for pattern A in text \u00C0 |
michael@0 | 4354 | // we'll have to skip \u0300 the grave first before we get to A |
michael@0 | 4355 | targetce = ucol_previous(coleiter, status); |
michael@0 | 4356 | if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) { |
michael@0 | 4357 | found = FALSE; |
michael@0 | 4358 | break; |
michael@0 | 4359 | } |
michael@0 | 4360 | targetce = getCE(strsrch, targetce); |
michael@0 | 4361 | if (targetce == UCOL_IGNORABLE && inNormBuf(coleiter)) { |
michael@0 | 4362 | // this is for the text \u0315\u0300 that requires |
michael@0 | 4363 | // normalization and pattern \u0300, where \u0315 is ignorable |
michael@0 | 4364 | continue; |
michael@0 | 4365 | } |
michael@0 | 4366 | if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) { |
michael@0 | 4367 | lastce = targetce; |
michael@0 | 4368 | } |
michael@0 | 4369 | // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s |
michael@0 | 4370 | if (targetce == patternce[patternceindex]) { |
michael@0 | 4371 | // the first ce can be a contraction |
michael@0 | 4372 | found = TRUE; |
michael@0 | 4373 | break; |
michael@0 | 4374 | } |
michael@0 | 4375 | if (!hasExpansion(coleiter)) { |
michael@0 | 4376 | found = FALSE; |
michael@0 | 4377 | break; |
michael@0 | 4378 | } |
michael@0 | 4379 | } |
michael@0 | 4380 | |
michael@0 | 4381 | //targetce = lastce; |
michael@0 | 4382 | |
michael@0 | 4383 | while (found && patternceindex > 0) { |
michael@0 | 4384 | lastce = targetce; |
michael@0 | 4385 | targetce = ucol_previous(coleiter, status); |
michael@0 | 4386 | if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) { |
michael@0 | 4387 | found = FALSE; |
michael@0 | 4388 | break; |
michael@0 | 4389 | } |
michael@0 | 4390 | targetce = getCE(strsrch, targetce); |
michael@0 | 4391 | if (targetce == UCOL_IGNORABLE) { |
michael@0 | 4392 | continue; |
michael@0 | 4393 | } |
michael@0 | 4394 | |
michael@0 | 4395 | patternceindex --; |
michael@0 | 4396 | // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s |
michael@0 | 4397 | found = found && targetce == patternce[patternceindex]; |
michael@0 | 4398 | } |
michael@0 | 4399 | |
michael@0 | 4400 | targetce = lastce; |
michael@0 | 4401 | |
michael@0 | 4402 | if (!found) { |
michael@0 | 4403 | if (U_FAILURE(*status)) { |
michael@0 | 4404 | break; |
michael@0 | 4405 | } |
michael@0 | 4406 | textoffset = shiftForward(strsrch, textoffset, lastce, |
michael@0 | 4407 | patternceindex); |
michael@0 | 4408 | // status checked at loop. |
michael@0 | 4409 | patternceindex = patterncelength; |
michael@0 | 4410 | continue; |
michael@0 | 4411 | } |
michael@0 | 4412 | |
michael@0 | 4413 | if (checkNextExactMatch(strsrch, &textoffset, status)) { |
michael@0 | 4414 | // status checked in ucol_setOffset |
michael@0 | 4415 | setColEIterOffset(coleiter, strsrch->search->matchedIndex); |
michael@0 | 4416 | return TRUE; |
michael@0 | 4417 | } |
michael@0 | 4418 | } |
michael@0 | 4419 | setMatchNotFound(strsrch); |
michael@0 | 4420 | return FALSE; |
michael@0 | 4421 | #else |
michael@0 | 4422 | int32_t textOffset = ucol_getOffset(strsrch->textIter); |
michael@0 | 4423 | int32_t start = -1; |
michael@0 | 4424 | int32_t end = -1; |
michael@0 | 4425 | |
michael@0 | 4426 | if (usearch_search(strsrch, textOffset, &start, &end, status)) { |
michael@0 | 4427 | strsrch->search->matchedIndex = start; |
michael@0 | 4428 | strsrch->search->matchedLength = end - start; |
michael@0 | 4429 | return TRUE; |
michael@0 | 4430 | } else { |
michael@0 | 4431 | setMatchNotFound(strsrch); |
michael@0 | 4432 | return FALSE; |
michael@0 | 4433 | } |
michael@0 | 4434 | #endif |
michael@0 | 4435 | } |
michael@0 | 4436 | |
// Handles one "next" step of a canonical search on strsrch.
//
// If *status already holds a failure, the match state is reset with
// setMatchNotFound() and FALSE is returned.
//
// BOYER_MOORE is defined as 0 near the top of this file, so the
// "#if BOYER_MOORE" branch below is compiled out; only the "#else" branch
// is built. That branch reads the current offset of strsrch->textIter,
// hands it to usearch_search(), and on success stores the reported start
// in search->matchedIndex and (end - start) in search->matchedLength.
//
// @param strsrch search object whose text iterator, pattern data and match
//                fields are read and updated
// @param status  in/out error code
// @return TRUE when a match was recorded; FALSE otherwise (the match state
//         has then been reset via setMatchNotFound())
michael@0 | 4437 | UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status) |
michael@0 | 4438 | { |
michael@0 | 4439 | if (U_FAILURE(*status)) { |
michael@0 | 4440 | setMatchNotFound(strsrch); |
michael@0 | 4441 | return FALSE; |
michael@0 | 4442 | } |
michael@0 | 4443 | |
// ---- disabled Boyer-Moore implementation (BOYER_MOORE == 0) ----
michael@0 | 4444 | #if BOYER_MOORE |
michael@0 | 4445 | UCollationElements *coleiter = strsrch->textIter; |
michael@0 | 4446 | int32_t textlength = strsrch->search->textLength; |
michael@0 | 4447 | int32_t *patternce = strsrch->pattern.CE; |
michael@0 | 4448 | int32_t patterncelength = strsrch->pattern.CELength; |
michael@0 | 4449 | int32_t textoffset = ucol_getOffset(coleiter); |
michael@0 | 4450 | UBool hasPatternAccents = |
michael@0 | 4451 | strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents; |
michael@0 | 4452 | |
// first candidate offset: no text CE has been read yet, hence UCOL_NULLORDER
michael@0 | 4453 | textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER, |
michael@0 | 4454 | patterncelength); |
// reset both rearranged-accent buffers to empty strings
michael@0 | 4455 | strsrch->canonicalPrefixAccents[0] = 0; |
michael@0 | 4456 | strsrch->canonicalSuffixAccents[0] = 0; |
michael@0 | 4457 | |
michael@0 | 4458 | while (textoffset <= textlength) |
michael@0 | 4459 | { |
michael@0 | 4460 | int32_t patternceindex = patterncelength - 1; |
michael@0 | 4461 | int32_t targetce; |
michael@0 | 4462 | UBool found = FALSE; |
michael@0 | 4463 | int32_t lastce = UCOL_NULLORDER; |
michael@0 | 4464 | |
michael@0 | 4465 | setColEIterOffset(coleiter, textoffset); |
michael@0 | 4466 | |
michael@0 | 4467 | for (;;) { |
michael@0 | 4468 | // finding the last pattern ce match, imagine composite characters |
michael@0 | 4469 | // for example: search for pattern A in text \u00C0 |
michael@0 | 4470 | // we'll have to skip \u0300 the grave first before we get to A |
michael@0 | 4471 | targetce = ucol_previous(coleiter, status); |
michael@0 | 4472 | if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) { |
michael@0 | 4473 | found = FALSE; |
michael@0 | 4474 | break; |
michael@0 | 4475 | } |
michael@0 | 4476 | targetce = getCE(strsrch, targetce); |
// remember the first CE read here that is neither null nor ignorable;
// it is later passed to shiftForward() on a mismatch
michael@0 | 4477 | if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) { |
michael@0 | 4478 | lastce = targetce; |
michael@0 | 4479 | } |
michael@0 | 4480 | // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s |
michael@0 | 4481 | if (targetce == patternce[patternceindex]) { |
michael@0 | 4482 | // the first ce can be a contraction |
michael@0 | 4483 | found = TRUE; |
michael@0 | 4484 | break; |
michael@0 | 4485 | } |
michael@0 | 4486 | if (!hasExpansion(coleiter)) { |
michael@0 | 4487 | found = FALSE; |
michael@0 | 4488 | break; |
michael@0 | 4489 | } |
michael@0 | 4490 | } |
michael@0 | 4491 | |
// compare the remaining pattern CEs from last to first against the text
// CEs read backwards, skipping ignorable text CEs
michael@0 | 4492 | while (found && patternceindex > 0) { |
michael@0 | 4493 | targetce = ucol_previous(coleiter, status); |
michael@0 | 4494 | if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) { |
michael@0 | 4495 | found = FALSE; |
michael@0 | 4496 | break; |
michael@0 | 4497 | } |
michael@0 | 4498 | targetce = getCE(strsrch, targetce); |
michael@0 | 4499 | if (targetce == UCOL_IGNORABLE) { |
michael@0 | 4500 | continue; |
michael@0 | 4501 | } |
michael@0 | 4502 | |
michael@0 | 4503 | patternceindex --; |
michael@0 | 4504 | // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s |
michael@0 | 4505 | found = found && targetce == patternce[patternceindex]; |
michael@0 | 4506 | } |
michael@0 | 4507 | |
michael@0 | 4508 | // initializing the rearranged accent array |
michael@0 | 4509 | if (hasPatternAccents && !found) { |
michael@0 | 4510 | strsrch->canonicalPrefixAccents[0] = 0; |
michael@0 | 4511 | strsrch->canonicalSuffixAccents[0] = 0; |
michael@0 | 4512 | if (U_FAILURE(*status)) { |
michael@0 | 4513 | break; |
michael@0 | 4514 | } |
// plain CE comparison failed; give the canonical matcher a chance
michael@0 | 4515 | found = doNextCanonicalMatch(strsrch, textoffset, status); |
michael@0 | 4516 | } |
michael@0 | 4517 | |
michael@0 | 4518 | if (!found) { |
michael@0 | 4519 | if (U_FAILURE(*status)) { |
michael@0 | 4520 | break; |
michael@0 | 4521 | } |
michael@0 | 4522 | textoffset = shiftForward(strsrch, textoffset, lastce, |
michael@0 | 4523 | patternceindex); |
michael@0 | 4524 | // status checked at loop |
// NOTE(review): patternceindex is declared afresh at the top of the loop
// body, so the store below appears to have no effect - confirm before
// re-enabling this path.
michael@0 | 4525 | patternceindex = patterncelength; |
michael@0 | 4526 | continue; |
michael@0 | 4527 | } |
michael@0 | 4528 | |
// a FALSE result falls through to the next loop iteration; textoffset is
// passed by pointer, so the check may have changed it
michael@0 | 4529 | if (checkNextCanonicalMatch(strsrch, &textoffset, status)) { |
michael@0 | 4530 | setColEIterOffset(coleiter, strsrch->search->matchedIndex); |
michael@0 | 4531 | return TRUE; |
michael@0 | 4532 | } |
michael@0 | 4533 | } |
michael@0 | 4534 | setMatchNotFound(strsrch); |
michael@0 | 4535 | return FALSE; |
// ---- implementation that is actually built ----
michael@0 | 4536 | #else |
michael@0 | 4537 | int32_t textOffset = ucol_getOffset(strsrch->textIter); |
michael@0 | 4538 | int32_t start = -1; |
michael@0 | 4539 | int32_t end = -1; |
michael@0 | 4540 | |
// search from the text iterator's current offset; start/end receive the
// match bounds on success
michael@0 | 4541 | if (usearch_search(strsrch, textOffset, &start, &end, status)) { |
michael@0 | 4542 | strsrch->search->matchedIndex = start; |
michael@0 | 4543 | strsrch->search->matchedLength = end - start; |
michael@0 | 4544 | return TRUE; |
michael@0 | 4545 | } else { |
michael@0 | 4546 | setMatchNotFound(strsrch); |
michael@0 | 4547 | return FALSE; |
michael@0 | 4548 | } |
michael@0 | 4549 | #endif |
michael@0 | 4550 | } |
michael@0 | 4551 | |
// Handles one "previous" step of an exact search on strsrch.
//
// If *status already holds a failure, the match state is reset with
// setMatchNotFound() and FALSE is returned.
//
// BOYER_MOORE is defined as 0 near the top of this file, so the
// "#if BOYER_MOORE" branch below is compiled out; only the "#else" branch
// is built. That branch picks a starting offset and passes it to
// usearch_searchBackwards():
//   - overlap mode with a previous match:
//       matchedIndex + matchedLength - 1
//   - overlap mode without a previous match: the text iterator is advanced
//       by up to (pattern.PCELength - 1) processed CEs and its resulting
//       offset is used
//   - otherwise: the text iterator's current offset
// On success the reported start is stored in search->matchedIndex and
// (end - start) in search->matchedLength.
//
// @param strsrch search object whose text iterator, pattern data and match
//                fields are read and updated
// @param status  in/out error code
// @return TRUE when a match was recorded; FALSE otherwise (the match state
//         has then been reset via setMatchNotFound())
michael@0 | 4552 | UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status) |
michael@0 | 4553 | { |
michael@0 | 4554 | if (U_FAILURE(*status)) { |
michael@0 | 4555 | setMatchNotFound(strsrch); |
michael@0 | 4556 | return FALSE; |
michael@0 | 4557 | } |
michael@0 | 4558 | |
// ---- disabled Boyer-Moore implementation (BOYER_MOORE == 0) ----
michael@0 | 4559 | #if BOYER_MOORE |
michael@0 | 4560 | UCollationElements *coleiter = strsrch->textIter; |
michael@0 | 4561 | int32_t *patternce = strsrch->pattern.CE; |
michael@0 | 4562 | int32_t patterncelength = strsrch->pattern.CELength; |
michael@0 | 4563 | int32_t textoffset = ucol_getOffset(coleiter); |
michael@0 | 4564 | |
michael@0 | 4565 | // shifting it check for setting offset |
michael@0 | 4566 | // if setOffset is called previously or there was no previous match, we |
michael@0 | 4567 | // leave the offset as it is. |
michael@0 | 4568 | if (strsrch->search->matchedIndex != USEARCH_DONE) { |
michael@0 | 4569 | textoffset = strsrch->search->matchedIndex; |
michael@0 | 4570 | } |
michael@0 | 4571 | |
// first candidate offset: no text CE has been read yet, hence UCOL_NULLORDER
michael@0 | 4572 | textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER, |
michael@0 | 4573 | patterncelength); |
michael@0 | 4574 | |
michael@0 | 4575 | while (textoffset >= 0) |
michael@0 | 4576 | { |
michael@0 | 4577 | int32_t patternceindex = 1; |
michael@0 | 4578 | int32_t targetce; |
michael@0 | 4579 | UBool found = FALSE; |
michael@0 | 4580 | int32_t firstce = UCOL_NULLORDER; |
michael@0 | 4581 | |
michael@0 | 4582 | // if status is a failure, ucol_setOffset does nothing |
michael@0 | 4583 | setColEIterOffset(coleiter, textoffset); |
michael@0 | 4584 | |
michael@0 | 4585 | for (;;) { |
michael@0 | 4586 | // finding the first pattern ce match, imagine composite |
michael@0 | 4587 | // characters. for example: search for pattern \u0300 in text |
michael@0 | 4588 | // \u00C0, we'll have to skip A first before we get to |
michael@0 | 4589 | // \u0300 the grave accent |
michael@0 | 4590 | targetce = ucol_next(coleiter, status); |
michael@0 | 4591 | if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) { |
michael@0 | 4592 | found = FALSE; |
michael@0 | 4593 | break; |
michael@0 | 4594 | } |
michael@0 | 4595 | targetce = getCE(strsrch, targetce); |
// keep the first CE read here that is neither null nor ignorable
michael@0 | 4596 | if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) { |
michael@0 | 4597 | firstce = targetce; |
michael@0 | 4598 | } |
// ignorable text CEs are skipped here unless the strength is primary
michael@0 | 4599 | if (targetce == UCOL_IGNORABLE && strsrch->strength != UCOL_PRIMARY) { |
michael@0 | 4600 | continue; |
michael@0 | 4601 | } |
michael@0 | 4602 | // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s |
michael@0 | 4603 | if (targetce == patternce[0]) { |
michael@0 | 4604 | found = TRUE; |
michael@0 | 4605 | break; |
michael@0 | 4606 | } |
michael@0 | 4607 | if (!hasExpansion(coleiter)) { |
michael@0 | 4608 | // checking for accents in composite character |
michael@0 | 4609 | found = FALSE; |
michael@0 | 4610 | break; |
michael@0 | 4611 | } |
michael@0 | 4612 | } |
michael@0 | 4613 | |
michael@0 | 4614 | //targetce = firstce; |
michael@0 | 4615 | |
// compare pattern CEs 1..patterncelength-1 against the following text
// CEs, skipping ignorable text CEs; firstce trails targetce by one read
michael@0 | 4616 | while (found && (patternceindex < patterncelength)) { |
michael@0 | 4617 | firstce = targetce; |
michael@0 | 4618 | targetce = ucol_next(coleiter, status); |
michael@0 | 4619 | if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) { |
michael@0 | 4620 | found = FALSE; |
michael@0 | 4621 | break; |
michael@0 | 4622 | } |
michael@0 | 4623 | targetce = getCE(strsrch, targetce); |
michael@0 | 4624 | if (targetce == UCOL_IGNORABLE) { |
michael@0 | 4625 | continue; |
michael@0 | 4626 | } |
michael@0 | 4627 | |
michael@0 | 4628 | // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s |
michael@0 | 4629 | found = found && targetce == patternce[patternceindex]; |
michael@0 | 4630 | patternceindex ++; |
michael@0 | 4631 | } |
michael@0 | 4632 | |
// the CE handed to reverseShift() below is firstce, not the last CE read
michael@0 | 4633 | targetce = firstce; |
michael@0 | 4634 | |
michael@0 | 4635 | if (!found) { |
michael@0 | 4636 | if (U_FAILURE(*status)) { |
michael@0 | 4637 | break; |
michael@0 | 4638 | } |
michael@0 | 4639 | |
michael@0 | 4640 | textoffset = reverseShift(strsrch, textoffset, targetce, |
michael@0 | 4641 | patternceindex); |
// NOTE(review): patternceindex is declared afresh at the top of the loop
// body, so the store below appears to have no effect - confirm before
// re-enabling this path.
michael@0 | 4642 | patternceindex = 0; |
michael@0 | 4643 | continue; |
michael@0 | 4644 | } |
michael@0 | 4645 | |
// a FALSE result falls through to the next loop iteration; textoffset is
// passed by pointer, so the check may have changed it
michael@0 | 4646 | if (checkPreviousExactMatch(strsrch, &textoffset, status)) { |
michael@0 | 4647 | setColEIterOffset(coleiter, textoffset); |
michael@0 | 4648 | return TRUE; |
michael@0 | 4649 | } |
michael@0 | 4650 | } |
michael@0 | 4651 | setMatchNotFound(strsrch); |
michael@0 | 4652 | return FALSE; |
// ---- implementation that is actually built ----
michael@0 | 4653 | #else |
michael@0 | 4654 | int32_t textOffset; |
michael@0 | 4655 | |
michael@0 | 4656 | if (strsrch->search->isOverlap) { |
michael@0 | 4657 | if (strsrch->search->matchedIndex != USEARCH_DONE) { |
// start one position before the end of the previous match - presumably
// so that a match overlapping the previous one can be found
michael@0 | 4658 | textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - 1; |
michael@0 | 4659 | } else { |
michael@0 | 4660 | // move the start position at the end of possible match |
michael@0 | 4661 | initializePatternPCETable(strsrch, status); |
// step the text iterator over at most PCELength - 1 processed CEs; the
// CE values themselves are only used to detect the end of the text
michael@0 | 4662 | for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.PCELength - 1; nPCEs++) { |
michael@0 | 4663 | int64_t pce = ucol_nextProcessed(strsrch->textIter, NULL, NULL, status); |
michael@0 | 4664 | if (pce == UCOL_PROCESSED_NULLORDER) { |
michael@0 | 4665 | // at the end of the text |
michael@0 | 4666 | break; |
michael@0 | 4667 | } |
michael@0 | 4668 | } |
// status is checked once here, after the table setup and the stepping loop
michael@0 | 4669 | if (U_FAILURE(*status)) { |
michael@0 | 4670 | setMatchNotFound(strsrch); |
michael@0 | 4671 | return FALSE; |
michael@0 | 4672 | } |
michael@0 | 4673 | textOffset = ucol_getOffset(strsrch->textIter); |
michael@0 | 4674 | } |
michael@0 | 4675 | } else { |
michael@0 | 4676 | textOffset = ucol_getOffset(strsrch->textIter); |
michael@0 | 4677 | } |
michael@0 | 4678 | |
michael@0 | 4679 | int32_t start = -1; |
michael@0 | 4680 | int32_t end = -1; |
michael@0 | 4681 | |
// start/end receive the match bounds on success
michael@0 | 4682 | if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) { |
michael@0 | 4683 | strsrch->search->matchedIndex = start; |
michael@0 | 4684 | strsrch->search->matchedLength = end - start; |
michael@0 | 4685 | return TRUE; |
michael@0 | 4686 | } else { |
michael@0 | 4687 | setMatchNotFound(strsrch); |
michael@0 | 4688 | return FALSE; |
michael@0 | 4689 | } |
michael@0 | 4690 | #endif |
michael@0 | 4691 | } |
michael@0 | 4692 | |
// Handles one "previous" step of a canonical search on strsrch.
//
// If *status already holds a failure, the match state is reset with
// setMatchNotFound() and FALSE is returned.
//
// BOYER_MOORE is defined as 0 near the top of this file, so the
// "#if BOYER_MOORE" branch below is compiled out; only the "#else" branch
// is built. That branch picks a starting offset and passes it to
// usearch_searchBackwards():
//   - overlap mode with a previous match:
//       matchedIndex + matchedLength - 1
//   - overlap mode without a previous match: the text iterator is advanced
//       by up to (pattern.PCELength - 1) processed CEs and its resulting
//       offset is used
//   - otherwise: the text iterator's current offset
// On success the reported start is stored in search->matchedIndex and
// (end - start) in search->matchedLength.
//
// @param strsrch search object whose text iterator, pattern data and match
//                fields are read and updated
// @param status  in/out error code
// @return TRUE when a match was recorded; FALSE otherwise (the match state
//         has then been reset via setMatchNotFound())
michael@0 | 4693 | UBool usearch_handlePreviousCanonical(UStringSearch *strsrch, |
michael@0 | 4694 | UErrorCode *status) |
michael@0 | 4695 | { |
michael@0 | 4696 | if (U_FAILURE(*status)) { |
michael@0 | 4697 | setMatchNotFound(strsrch); |
michael@0 | 4698 | return FALSE; |
michael@0 | 4699 | } |
michael@0 | 4700 | |
// ---- disabled Boyer-Moore implementation (BOYER_MOORE == 0) ----
michael@0 | 4701 | #if BOYER_MOORE |
michael@0 | 4702 | UCollationElements *coleiter = strsrch->textIter; |
michael@0 | 4703 | int32_t *patternce = strsrch->pattern.CE; |
michael@0 | 4704 | int32_t patterncelength = strsrch->pattern.CELength; |
michael@0 | 4705 | int32_t textoffset = ucol_getOffset(coleiter); |
michael@0 | 4706 | UBool hasPatternAccents = |
michael@0 | 4707 | strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents; |
michael@0 | 4708 | |
michael@0 | 4709 | // shifting it check for setting offset |
michael@0 | 4710 | // if setOffset is called previously or there was no previous match, we |
michael@0 | 4711 | // leave the offset as it is. |
michael@0 | 4712 | if (strsrch->search->matchedIndex != USEARCH_DONE) { |
michael@0 | 4713 | textoffset = strsrch->search->matchedIndex; |
michael@0 | 4714 | } |
michael@0 | 4715 | |
// first candidate offset: no text CE has been read yet, hence UCOL_NULLORDER
michael@0 | 4716 | textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER, |
michael@0 | 4717 | patterncelength); |
// reset both rearranged-accent buffers to empty strings
michael@0 | 4718 | strsrch->canonicalPrefixAccents[0] = 0; |
michael@0 | 4719 | strsrch->canonicalSuffixAccents[0] = 0; |
michael@0 | 4720 | |
michael@0 | 4721 | while (textoffset >= 0) |
michael@0 | 4722 | { |
michael@0 | 4723 | int32_t patternceindex = 1; |
michael@0 | 4724 | int32_t targetce; |
michael@0 | 4725 | UBool found = FALSE; |
michael@0 | 4726 | int32_t firstce = UCOL_NULLORDER; |
michael@0 | 4727 | |
michael@0 | 4728 | setColEIterOffset(coleiter, textoffset); |
michael@0 | 4729 | for (;;) { |
michael@0 | 4730 | // finding the first pattern ce match, imagine composite |
michael@0 | 4731 | // characters. for example: search for pattern \u0300 in text |
michael@0 | 4732 | // \u00C0, we'll have to skip A first before we get to |
michael@0 | 4733 | // \u0300 the grave accent |
michael@0 | 4734 | targetce = ucol_next(coleiter, status); |
michael@0 | 4735 | if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) { |
michael@0 | 4736 | found = FALSE; |
michael@0 | 4737 | break; |
michael@0 | 4738 | } |
michael@0 | 4739 | targetce = getCE(strsrch, targetce); |
// keep the first CE read here that is neither null nor ignorable
michael@0 | 4740 | if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) { |
michael@0 | 4741 | firstce = targetce; |
michael@0 | 4742 | } |
michael@0 | 4743 | |
michael@0 | 4744 | // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s |
michael@0 | 4745 | if (targetce == patternce[0]) { |
michael@0 | 4746 | // the first ce can be a contraction |
michael@0 | 4747 | found = TRUE; |
michael@0 | 4748 | break; |
michael@0 | 4749 | } |
michael@0 | 4750 | if (!hasExpansion(coleiter)) { |
michael@0 | 4751 | // checking for accents in composite character |
michael@0 | 4752 | found = FALSE; |
michael@0 | 4753 | break; |
michael@0 | 4754 | } |
michael@0 | 4755 | } |
michael@0 | 4756 | |
// NOTE(review): this assignment precedes the matching loop, whose first
// statement overwrites targetce whenever the loop body runs, so the CE
// later passed to reverseShift() may be the last CE read rather than
// firstce. usearch_handlePreviousExact performs the same assignment after
// its loop instead - confirm which is intended before re-enabling.
michael@0 | 4757 | targetce = firstce; |
michael@0 | 4758 | |
// compare pattern CEs 1..patterncelength-1 against the following text
// CEs, skipping ignorable text CEs
michael@0 | 4759 | while (found && patternceindex < patterncelength) { |
michael@0 | 4760 | targetce = ucol_next(coleiter, status); |
michael@0 | 4761 | if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) { |
michael@0 | 4762 | found = FALSE; |
michael@0 | 4763 | break; |
michael@0 | 4764 | } |
michael@0 | 4765 | targetce = getCE(strsrch, targetce); |
michael@0 | 4766 | if (targetce == UCOL_IGNORABLE) { |
michael@0 | 4767 | continue; |
michael@0 | 4768 | } |
michael@0 | 4769 | |
michael@0 | 4770 | // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s |
michael@0 | 4771 | found = found && targetce == patternce[patternceindex]; |
michael@0 | 4772 | patternceindex ++; |
michael@0 | 4773 | } |
michael@0 | 4774 | |
michael@0 | 4775 | // initializing the rearranged accent array |
michael@0 | 4776 | if (hasPatternAccents && !found) { |
michael@0 | 4777 | strsrch->canonicalPrefixAccents[0] = 0; |
michael@0 | 4778 | strsrch->canonicalSuffixAccents[0] = 0; |
michael@0 | 4779 | if (U_FAILURE(*status)) { |
michael@0 | 4780 | break; |
michael@0 | 4781 | } |
// plain CE comparison failed; give the canonical matcher a chance
michael@0 | 4782 | found = doPreviousCanonicalMatch(strsrch, textoffset, status); |
michael@0 | 4783 | } |
michael@0 | 4784 | |
michael@0 | 4785 | if (!found) { |
michael@0 | 4786 | if (U_FAILURE(*status)) { |
michael@0 | 4787 | break; |
michael@0 | 4788 | } |
michael@0 | 4789 | textoffset = reverseShift(strsrch, textoffset, targetce, |
michael@0 | 4790 | patternceindex); |
// NOTE(review): patternceindex is declared afresh at the top of the loop
// body, so the store below appears to have no effect - confirm before
// re-enabling this path.
michael@0 | 4791 | patternceindex = 0; |
michael@0 | 4792 | continue; |
michael@0 | 4793 | } |
michael@0 | 4794 | |
// a FALSE result falls through to the next loop iteration; textoffset is
// passed by pointer, so the check may have changed it
michael@0 | 4795 | if (checkPreviousCanonicalMatch(strsrch, &textoffset, status)) { |
michael@0 | 4796 | setColEIterOffset(coleiter, textoffset); |
michael@0 | 4797 | return TRUE; |
michael@0 | 4798 | } |
michael@0 | 4799 | } |
michael@0 | 4800 | setMatchNotFound(strsrch); |
michael@0 | 4801 | return FALSE; |
// ---- implementation that is actually built ----
michael@0 | 4802 | #else |
michael@0 | 4803 | int32_t textOffset; |
michael@0 | 4804 | |
michael@0 | 4805 | if (strsrch->search->isOverlap) { |
michael@0 | 4806 | if (strsrch->search->matchedIndex != USEARCH_DONE) { |
// start one position before the end of the previous match - presumably
// so that a match overlapping the previous one can be found
michael@0 | 4807 | textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - 1; |
michael@0 | 4808 | } else { |
michael@0 | 4809 | // move the start position at the end of possible match |
michael@0 | 4810 | initializePatternPCETable(strsrch, status); |
// step the text iterator over at most PCELength - 1 processed CEs; the
// CE values themselves are only used to detect the end of the text
michael@0 | 4811 | for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.PCELength - 1; nPCEs++) { |
michael@0 | 4812 | int64_t pce = ucol_nextProcessed(strsrch->textIter, NULL, NULL, status); |
michael@0 | 4813 | if (pce == UCOL_PROCESSED_NULLORDER) { |
michael@0 | 4814 | // at the end of the text |
michael@0 | 4815 | break; |
michael@0 | 4816 | } |
michael@0 | 4817 | } |
// status is checked once here, after the table setup and the stepping loop
michael@0 | 4818 | if (U_FAILURE(*status)) { |
michael@0 | 4819 | setMatchNotFound(strsrch); |
michael@0 | 4820 | return FALSE; |
michael@0 | 4821 | } |
michael@0 | 4822 | textOffset = ucol_getOffset(strsrch->textIter); |
michael@0 | 4823 | } |
michael@0 | 4824 | } else { |
michael@0 | 4825 | textOffset = ucol_getOffset(strsrch->textIter); |
michael@0 | 4826 | } |
michael@0 | 4827 | |
michael@0 | 4828 | int32_t start = -1; |
michael@0 | 4829 | int32_t end = -1; |
michael@0 | 4830 | |
// start/end receive the match bounds on success
michael@0 | 4831 | if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) { |
michael@0 | 4832 | strsrch->search->matchedIndex = start; |
michael@0 | 4833 | strsrch->search->matchedLength = end - start; |
michael@0 | 4834 | return TRUE; |
michael@0 | 4835 | } else { |
michael@0 | 4836 | setMatchNotFound(strsrch); |
michael@0 | 4837 | return FALSE; |
michael@0 | 4838 | } |
michael@0 | 4839 | #endif |
michael@0 | 4840 | } |
michael@0 | 4841 | |
michael@0 | 4842 | #endif /* #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION */ |