The Tor Browser: intl/icu/source/i18n/usearch.cpp@6474c204b198

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

     1 /*

     2 **********************************************************************

     3 *   Copyright (C) 2001-2011 IBM and others. All rights reserved.

     4 **********************************************************************

     5 *   Date        Name        Description

     6 *  07/02/2001   synwee      Creation.

     7 **********************************************************************

     8 */

    10 #include "unicode/utypes.h"

    12 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION

    14 #include "unicode/usearch.h"

    15 #include "unicode/ustring.h"

    16 #include "unicode/uchar.h"

    17 #include "unicode/utf16.h"

    18 #include "normalizer2impl.h"

    19 #include "ucol_imp.h"

    20 #include "usrchimp.h"

    21 #include "cmemory.h"

    22 #include "ucln_in.h"

    23 #include "uassert.h"

    24 #include "ustr_imp.h"

    26 U_NAMESPACE_USE

    28 // don't use Boyer-Moore

    29 // (and if we decide to turn this on again there are several new TODOs that will need to be addressed)

    30 #define BOYER_MOORE 0

    32 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))

    34 // internal definition ---------------------------------------------------

    36 #define LAST_BYTE_MASK_          0xFF

    37 #define SECOND_LAST_BYTE_SHIFT_  8

    38 #define SUPPLEMENTARY_MIN_VALUE_ 0x10000

    40 static const Normalizer2Impl *g_nfcImpl = NULL;

    42 // internal methods -------------------------------------------------

    44 /**

    45 * Fast collation element iterator setOffset.

    46 * This function does not check for bounds.

    47 * @param coleiter collation element iterator

    48 * @param offset to set

    49 */

    50 static

    51 inline void setColEIterOffset(UCollationElements *elems,

    52                       int32_t             offset)

    53 {

    54     collIterate *ci = &(elems->iteratordata_);

    55     ci->pos         = ci->string + offset;

    56     ci->CEpos       = ci->toReturn = ci->extendCEs ? ci->extendCEs : ci->CEs;

    57     if (ci->flags & UCOL_ITER_INNORMBUF) {

    58         ci->flags = ci->origFlags;

    59     }

    60     ci->fcdPosition = NULL;

    62     ci->offsetReturn = NULL;

    63     ci->offsetStore  = ci->offsetBuffer;

    64     ci->offsetRepeatCount = ci->offsetRepeatValue = 0;

    65 }

    67 /**

    68 * Getting the mask for collation strength

    69 * @param strength collation strength

    70 * @return collation element mask

    71 */

    72 static

    73 inline uint32_t getMask(UCollationStrength strength)

    74 {

    75     switch (strength)

    76     {

    77     case UCOL_PRIMARY:

    78         return UCOL_PRIMARYORDERMASK;

    79     case UCOL_SECONDARY:

    80         return UCOL_SECONDARYORDERMASK | UCOL_PRIMARYORDERMASK;

    81     default:

    82         return UCOL_TERTIARYORDERMASK | UCOL_SECONDARYORDERMASK |

    83                UCOL_PRIMARYORDERMASK;

    84     }

    85 }

    87 /**

    88 * This is to squeeze the 21bit ces into a 256 table

    89 * @param ce collation element

    90 * @return collapsed version of the collation element

    91 */

    92 static

    93 inline int hash(uint32_t ce)

    94 {

    95     // the old value UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_ does not work

    96     // well with the new collation where most of the latin 1 characters

    97     // are of the value xx000xxx. their hashes will most of the time be 0

    98     // to be discussed on the hash algo.

    99     return UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_;

   100 }

   102 U_CDECL_BEGIN

   103 static UBool U_CALLCONV

   104 usearch_cleanup(void) {

   105     g_nfcImpl = NULL;

   106     return TRUE;

   107 }

   108 U_CDECL_END

   110 /**

   111 * Initializing the fcd tables.

   112 * Internal method, status assumed to be a success.

   113 * @param status output error if any, caller to check status before calling

   114 *               method, status assumed to be success when passed in.

   115 */

   116 static

   117 inline void initializeFCD(UErrorCode *status)

   118 {

   119     if (g_nfcImpl == NULL) {

   120         g_nfcImpl = Normalizer2Factory::getNFCImpl(*status);

   121         ucln_i18n_registerCleanup(UCLN_I18N_USEARCH, usearch_cleanup);

   122     }

   123 }

   125 /**

   126 * Gets the fcd value for a character at the argument index.

   127 * This method takes into accounts of the supplementary characters.

   128 * @param str UTF16 string where character for fcd retrieval resides

   129 * @param offset position of the character whose fcd is to be retrieved, to be

   130 *               overwritten with the next character position, taking

   131 *               surrogate characters into consideration.

   132 * @param strlength length of the argument string

   133 * @return fcd value

   134 */

   135 static

   136 uint16_t getFCD(const UChar   *str, int32_t *offset,

   137                              int32_t  strlength)

   138 {

   139     const UChar *temp = str + *offset;

   140     uint16_t    result = g_nfcImpl->nextFCD16(temp, str + strlength);

   141     *offset = (int32_t)(temp - str);

   142     return result;

   143 }

   145 /**

   146 * Getting the modified collation elements taking into account the collation

   147 * attributes

   148 * @param strsrch string search data

   149 * @param sourcece

   150 * @return the modified collation element

   151 */

   152 static

   153 inline int32_t getCE(const UStringSearch *strsrch, uint32_t sourcece)

   154 {

   155     // note for tertiary we can't use the collator->tertiaryMask, that

   156     // is a preprocessed mask that takes into account case options. since

   157     // we are only concerned with exact matches, we don't need that.

   158     sourcece &= strsrch->ceMask;

   160     if (strsrch->toShift) {

   161         // alternate handling here, since only the 16 most significant digits

   162         // is only used, we can safely do a compare without masking

   163         // if the ce is a variable, we mask and get only the primary values

   164         // no shifting to quartenary is required since all primary values

   165         // less than variabletop will need to be masked off anyway.

   166         if (strsrch->variableTop > sourcece) {

   167             if (strsrch->strength >= UCOL_QUATERNARY) {

   168                 sourcece &= UCOL_PRIMARYORDERMASK;

   169             }

   170             else {

   171                 sourcece = UCOL_IGNORABLE;

   172             }

   173         }

   174     } else if (strsrch->strength >= UCOL_QUATERNARY && sourcece == UCOL_IGNORABLE) {

   175         sourcece = 0xFFFF;

   176     }

   178     return sourcece;

   179 }

   181 /**

   182 * Allocate a memory and returns NULL if it failed.

   183 * Internal method, status assumed to be a success.

   184 * @param size to allocate

   185 * @param status output error if any, caller to check status before calling

   186 *               method, status assumed to be success when passed in.

   187 * @return newly allocated array, NULL otherwise

   188 */

   189 static

   190 inline void * allocateMemory(uint32_t size, UErrorCode *status)

   191 {

   192     uint32_t *result = (uint32_t *)uprv_malloc(size);

   193     if (result == NULL) {

   194         *status = U_MEMORY_ALLOCATION_ERROR;

   195     }

   196     return result;

   197 }

   199 /**

   200 * Adds a uint32_t value to a destination array.

   201 * Creates a new array if we run out of space. The caller will have to

   202 * manually deallocate the newly allocated array.

   203 * Internal method, status assumed to be success, caller has to check status

   204 * before calling this method. destination not to be NULL and has at least

   205 * size destinationlength.

   206 * @param destination target array

   207 * @param offset destination offset to add value

   208 * @param destinationlength target array size, return value for the new size

   209 * @param value to be added

   210 * @param increments incremental size expected

   211 * @param status output error if any, caller to check status before calling

   212 *               method, status assumed to be success when passed in.

   213 * @return new destination array, destination if there was no new allocation

   214 */

   215 static

   216 inline int32_t * addTouint32_tArray(int32_t    *destination,

   217                                     uint32_t    offset,

   218                                     uint32_t   *destinationlength,

   219                                     uint32_t    value,

   220                                     uint32_t    increments,

   221                                     UErrorCode *status)

   222 {

   223     uint32_t newlength = *destinationlength;

   224     if (offset + 1 == newlength) {

   225         newlength += increments;

   226         int32_t *temp = (int32_t *)allocateMemory(

   227                                          sizeof(int32_t) * newlength, status);

   228         if (U_FAILURE(*status)) {

   229             return NULL;

   230         }

   231         uprv_memcpy(temp, destination, sizeof(int32_t) * offset);

   232         *destinationlength = newlength;

   233         destination        = temp;

   234     }

   235     destination[offset] = value;

   236     return destination;

   237 }

   239 /**

   240 * Adds a uint64_t value to a destination array.

   241 * Creates a new array if we run out of space. The caller will have to

   242 * manually deallocate the newly allocated array.

   243 * Internal method, status assumed to be success, caller has to check status

   244 * before calling this method. destination not to be NULL and has at least

   245 * size destinationlength.

   246 * @param destination target array

   247 * @param offset destination offset to add value

   248 * @param destinationlength target array size, return value for the new size

   249 * @param value to be added

   250 * @param increments incremental size expected

   251 * @param status output error if any, caller to check status before calling

   252 *               method, status assumed to be success when passed in.

   253 * @return new destination array, destination if there was no new allocation

   254 */

   255 static

   256 inline int64_t * addTouint64_tArray(int64_t    *destination,

   257                                     uint32_t    offset,

   258                                     uint32_t   *destinationlength,

   259                                     uint64_t    value,

   260                                     uint32_t    increments,

   261                                     UErrorCode *status)

   262 {

   263     uint32_t newlength = *destinationlength;

   264     if (offset + 1 == newlength) {

   265         newlength += increments;

   266         int64_t *temp = (int64_t *)allocateMemory(

   267                                          sizeof(int64_t) * newlength, status);

   269         if (U_FAILURE(*status)) {

   270             return NULL;

   271         }

   273         uprv_memcpy(temp, destination, sizeof(int64_t) * offset);

   274         *destinationlength = newlength;

   275         destination        = temp;

   276     }

   278     destination[offset] = value;

   280     return destination;

   281 }

   283 /**

   284 * Initializing the ce table for a pattern.

   285 * Stores non-ignorable collation keys.

   286 * Table size will be estimated by the size of the pattern text. Table

   287 * expansion will be perform as we go along. Adding 1 to ensure that the table

   288 * size definitely increases.

   289 * Internal method, status assumed to be a success.

   290 * @param strsrch string search data

   291 * @param status output error if any, caller to check status before calling

   292 *               method, status assumed to be success when passed in.

   293 * @return total number of expansions

   294 */

   295 static

   296 inline uint16_t initializePatternCETable(UStringSearch *strsrch,

   297                                          UErrorCode    *status)

   298 {

   299     UPattern *pattern            = &(strsrch->pattern);

   300     uint32_t  cetablesize        = INITIAL_ARRAY_SIZE_;

   301     int32_t  *cetable            = pattern->CEBuffer;

   302     uint32_t  patternlength      = pattern->textLength;

   303     UCollationElements *coleiter = strsrch->utilIter;

   305     if (coleiter == NULL) {

   306         coleiter = ucol_openElements(strsrch->collator, pattern->text,

   307                                      patternlength, status);

   308         // status will be checked in ucol_next(..) later and if it is an

   309         // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be

   310         // returned.

   311         strsrch->utilIter = coleiter;

   312     }

   313     else {

   314         uprv_init_collIterate(strsrch->collator, pattern->text,

   315                          pattern->textLength,

   316                          &coleiter->iteratordata_,

   317                          status);

   318     }

   319     if(U_FAILURE(*status)) {

   320         return 0;

   321     }

   323     if (pattern->CE != cetable && pattern->CE) {

   324         uprv_free(pattern->CE);

   325     }

   327     uint16_t  offset      = 0;

   328     uint16_t  result      = 0;

   329     int32_t   ce;

   331     while ((ce = ucol_next(coleiter, status)) != UCOL_NULLORDER &&

   332            U_SUCCESS(*status)) {

   333         uint32_t newce = getCE(strsrch, ce);

   334         if (newce) {

   335             int32_t *temp = addTouint32_tArray(cetable, offset, &cetablesize,

   336                                   newce,

   337                                   patternlength - ucol_getOffset(coleiter) + 1,

   338                                   status);

   339             if (U_FAILURE(*status)) {

   340                 return 0;

   341             }

   342             offset ++;

   343             if (cetable != temp && cetable != pattern->CEBuffer) {

   344                 uprv_free(cetable);

   345             }

   346             cetable = temp;

   347         }

   348         result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);

   349     }

   351     cetable[offset]   = 0;

   352     pattern->CE       = cetable;

   353     pattern->CELength = offset;

   355     return result;

   356 }

   358 /**

   359 * Initializing the pce table for a pattern.

   360 * Stores non-ignorable collation keys.

   361 * Table size will be estimated by the size of the pattern text. Table

   362 * expansion will be perform as we go along. Adding 1 to ensure that the table

   363 * size definitely increases.

   364 * Internal method, status assumed to be a success.

   365 * @param strsrch string search data

   366 * @param status output error if any, caller to check status before calling

   367 *               method, status assumed to be success when passed in.

   368 * @return total number of expansions

   369 */

   370 static

   371 inline uint16_t initializePatternPCETable(UStringSearch *strsrch,

   372                                           UErrorCode    *status)

   373 {

   374     UPattern *pattern            = &(strsrch->pattern);

   375     uint32_t  pcetablesize       = INITIAL_ARRAY_SIZE_;

   376     int64_t  *pcetable           = pattern->PCEBuffer;

   377     uint32_t  patternlength      = pattern->textLength;

   378     UCollationElements *coleiter = strsrch->utilIter;

   380     if (coleiter == NULL) {

   381         coleiter = ucol_openElements(strsrch->collator, pattern->text,

   382                                      patternlength, status);

   383         // status will be checked in ucol_next(..) later and if it is an

   384         // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be

   385         // returned.

   386         strsrch->utilIter = coleiter;

   387     } else {

   388         uprv_init_collIterate(strsrch->collator, pattern->text,

   389                               pattern->textLength,

   390                               &coleiter->iteratordata_,

   391                               status);

   392     }

   393     if(U_FAILURE(*status)) {

   394         return 0;

   395     }

   397     if (pattern->PCE != pcetable && pattern->PCE != NULL) {

   398         uprv_free(pattern->PCE);

   399     }

   401     uint16_t  offset = 0;

   402     uint16_t  result = 0;

   403     int64_t   pce;

   405     uprv_init_pce(coleiter);

   407     // ** Should processed CEs be signed or unsigned?

   408     // ** (the rest of the code in this file seems to play fast-and-loose with

   409     // **  whether a CE is signed or unsigned. For example, look at routine above this one.)

   410     while ((pce = ucol_nextProcessed(coleiter, NULL, NULL, status)) != UCOL_PROCESSED_NULLORDER &&

   411            U_SUCCESS(*status)) {

   412         int64_t *temp = addTouint64_tArray(pcetable, offset, &pcetablesize,

   413                               pce,

   414                               patternlength - ucol_getOffset(coleiter) + 1,

   415                               status);

   417         if (U_FAILURE(*status)) {

   418             return 0;

   419         }

   421         offset += 1;

   423         if (pcetable != temp && pcetable != pattern->PCEBuffer) {

   424             uprv_free(pcetable);

   425         }

   427         pcetable = temp;

   428         //result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);

   429     }

   431     pcetable[offset]   = 0;

   432     pattern->PCE       = pcetable;

   433     pattern->PCELength = offset;

   435     return result;

   436 }

   438 /**

   439 * Initializes the pattern struct.

   440 * Internal method, status assumed to be success.

   441 * @param strsrch UStringSearch data storage

   442 * @param status output error if any, caller to check status before calling

   443 *               method, status assumed to be success when passed in.

   444 * @return expansionsize the total expansion size of the pattern

   445 */

   446 static

   447 inline int16_t initializePattern(UStringSearch *strsrch, UErrorCode *status)

   448 {

   449           UPattern   *pattern     = &(strsrch->pattern);

   450     const UChar      *patterntext = pattern->text;

   451           int32_t     length      = pattern->textLength;

   452           int32_t index       = 0;

   454     // Since the strength is primary, accents are ignored in the pattern.

   455     if (strsrch->strength == UCOL_PRIMARY) {

   456         pattern->hasPrefixAccents = 0;

   457         pattern->hasSuffixAccents = 0;

   458     } else {

   459         pattern->hasPrefixAccents = getFCD(patterntext, &index, length) >>

   460                                                          SECOND_LAST_BYTE_SHIFT_;

   461         index = length;

   462         U16_BACK_1(patterntext, 0, index);

   463         pattern->hasSuffixAccents = getFCD(patterntext, &index, length) &

   464                                                                  LAST_BYTE_MASK_;

   465     }

   467     // ** HACK **

   468     if (strsrch->pattern.PCE != NULL) {

   469         if (strsrch->pattern.PCE != strsrch->pattern.PCEBuffer) {

   470             uprv_free(strsrch->pattern.PCE);

   471         }

   473         strsrch->pattern.PCE = NULL;

   474     }

   476     // since intializePattern is an internal method status is a success.

   477     return initializePatternCETable(strsrch, status);

   478 }

   480 /**

   481 * Initializing shift tables, with the default values.

   482 * If a corresponding default value is 0, the shift table is not set.

   483 * @param shift table for forwards shift

   484 * @param backshift table for backwards shift

   485 * @param cetable table containing pattern ce

   486 * @param cesize size of the pattern ces

   487 * @param expansionsize total size of the expansions

   488 * @param defaultforward the default forward value

   489 * @param defaultbackward the default backward value

   490 */

   491 static

   492 inline void setShiftTable(int16_t   shift[], int16_t backshift[],

   493                           int32_t  *cetable, int32_t cesize,

   494                           int16_t   expansionsize,

   495                           int16_t   defaultforward,

   496                           int16_t   defaultbackward)

   497 {

   498     // estimate the value to shift. to do that we estimate the smallest

   499     // number of characters to give the relevant ces, ie approximately

   500     // the number of ces minus their expansion, since expansions can come

   501     // from a character.

   502     int32_t count;

   503     for (count = 0; count < MAX_TABLE_SIZE_; count ++) {

   504         shift[count] = defaultforward;

   505     }

   506     cesize --; // down to the last index

   507     for (count = 0; count < cesize; count ++) {

   508         // number of ces from right of array to the count

   509         int temp = defaultforward - count - 1;

   510         shift[hash(cetable[count])] = temp > 1 ? temp : 1;

   511     }

   512     shift[hash(cetable[cesize])] = 1;

   513     // for ignorables we just shift by one. see test examples.

   514     shift[hash(0)] = 1;

   516     for (count = 0; count < MAX_TABLE_SIZE_; count ++) {

   517         backshift[count] = defaultbackward;

   518     }

   519     for (count = cesize; count > 0; count --) {

   520         // the original value count does not seem to work

   521         backshift[hash(cetable[count])] = count > expansionsize ?

   522                                           (int16_t)(count - expansionsize) : 1;

   523     }

   524     backshift[hash(cetable[0])] = 1;

   525     backshift[hash(0)] = 1;

   526 }

   528 /**

   529 * Building of the pattern collation element list and the boyer moore strsrch

   530 * table.

   531 * The canonical match will only be performed after the default match fails.

   532 * For both cases we need to remember the size of the composed and decomposed

   533 * versions of the string. Since the Boyer-Moore shift calculations shifts by

   534 * a number of characters in the text and tries to match the pattern from that

   535 * offset, the shift value can not be too large in case we miss some

   536 * characters. To choose a right shift size, we estimate the NFC form of the

   537 * and use its size as a shift guide. The NFC form should be the small

   538 * possible representation of the pattern. Anyways, we'll err on the smaller

   539 * shift size. Hence the calculation for minlength.

   540 * Canonical match will be performed slightly differently. We'll split the

   541 * pattern into 3 parts, the prefix accents (PA), the middle string bounded by

   542 * the first and last base character (MS), the ending accents (EA). Matches

   543 * will be done on MS first, and only when we match MS then some processing

   544 * will be required for the prefix and end accents in order to determine if

   545 * they match PA and EA. Hence the default shift values

   546 * for the canonical match will take the size of either end's accent into

   547 * consideration. Forwards search will take the end accents into consideration

   548 * for the default shift values and the backwards search will take the prefix

   549 * accents into consideration.

   550 * If pattern has no non-ignorable ce, we return a illegal argument error.

   551 * Internal method, status assumed to be success.

   552 * @param strsrch UStringSearch data storage

   553 * @param status  for output errors if it occurs, status is assumed to be a

   554 *                success when it is passed in.

   555 */

   556 static

   557 inline void initialize(UStringSearch *strsrch, UErrorCode *status)

   558 {

   559     int16_t expandlength  = initializePattern(strsrch, status);

   560     if (U_SUCCESS(*status) && strsrch->pattern.CELength > 0) {

   561         UPattern *pattern = &strsrch->pattern;

   562         int32_t   cesize  = pattern->CELength;

   564         int16_t minlength = cesize > expandlength

   565                             ? (int16_t)cesize - expandlength : 1;

   566         pattern->defaultShiftSize    = minlength;

   567         setShiftTable(pattern->shift, pattern->backShift, pattern->CE,

   568                       cesize, expandlength, minlength, minlength);

   569         return;

   570     }

   571     strsrch->pattern.defaultShiftSize = 0;

   572 }

   574 #if BOYER_MOORE

   575 /**

   576 * Check to make sure that the match length is at the end of the character by

   577 * using the breakiterator.

   578 * @param strsrch string search data

   579 * @param start target text start offset

   580 * @param end target text end offset

   581 */

   582 static

   583 void checkBreakBoundary(const UStringSearch *strsrch, int32_t * /*start*/,

   584                                int32_t *end)

   585 {

   586 #if !UCONFIG_NO_BREAK_ITERATION

   587     UBreakIterator *breakiterator = strsrch->search->internalBreakIter;

   588     if (breakiterator) {

   589         int32_t matchend = *end;

   590         //int32_t matchstart = *start;

   592         if (!ubrk_isBoundary(breakiterator, matchend)) {

   593             *end = ubrk_following(breakiterator, matchend);

   594         }

   596         /* Check the start of the matched text to make sure it doesn't have any accents

   597          * before it.  This code may not be necessary and so it is commented out */

   598         /*if (!ubrk_isBoundary(breakiterator, matchstart) && !ubrk_isBoundary(breakiterator, matchstart-1)) {

   599             *start = ubrk_preceding(breakiterator, matchstart);

   600         }*/

   601     }

   602 #endif

   603 }

   605 /**

   606 * Determine whether the target text in UStringSearch bounded by the offset

   607 * start and end is one or more whole units of text as

   608 * determined by the breakiterator in UStringSearch.

   609 * @param strsrch string search data

   610 * @param start target text start offset

   611 * @param end target text end offset

   612 */

   613 static

   614 UBool isBreakUnit(const UStringSearch *strsrch, int32_t start,

   615                                int32_t    end)

   616 {

   617 #if !UCONFIG_NO_BREAK_ITERATION

   618     UBreakIterator *breakiterator = strsrch->search->breakIter;

   619     //TODO: Add here.

   620     if (breakiterator) {

   621         int32_t startindex = ubrk_first(breakiterator);

   622         int32_t endindex   = ubrk_last(breakiterator);

   624         // out-of-range indexes are never boundary positions

   625         if (start < startindex || start > endindex ||

   626             end < startindex || end > endindex) {

   627             return FALSE;

   628         }

   629         // otherwise, we can use following() on the position before the

   630         // specified one and return true of the position we get back is the

   631         // one the user specified

   632         UBool result = (start == startindex ||

   633                 ubrk_following(breakiterator, start - 1) == start) &&

   634                (end == endindex ||

   635                 ubrk_following(breakiterator, end - 1) == end);

   636         if (result) {

   637             // iterates the individual ces

   638                   UCollationElements *coleiter  = strsrch->utilIter;

   639             const UChar              *text      = strsrch->search->text +

   640                                                                       start;

   641                   UErrorCode          status    = U_ZERO_ERROR;

   642             ucol_setText(coleiter, text, end - start, &status);

   643             for (int32_t count = 0; count < strsrch->pattern.CELength;

   644                  count ++) {

   645                 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));

   646                 if (ce == UCOL_IGNORABLE) {

   647                     count --;

   648                     continue;

   649                 }

   650                 if (U_FAILURE(status) || ce != strsrch->pattern.CE[count]) {

   651                     return FALSE;

   652                 }

   653             }

   654             int32_t nextce = ucol_next(coleiter, &status);

   655             while (ucol_getOffset(coleiter) == (end - start)

   656                    && getCE(strsrch, nextce) == UCOL_IGNORABLE) {

   657                 nextce = ucol_next(coleiter, &status);

   658             }

   659             if (ucol_getOffset(coleiter) == (end - start)

   660                 && nextce != UCOL_NULLORDER) {

   661                 // extra collation elements at the end of the match

   662                 return FALSE;

   663             }

   664         }

   665         return result;

   666     }

   667 #endif

   668     return TRUE;

   669 }

   671 /**

   672 * Getting the next base character offset if current offset is an accent,

   673 * or the current offset if the current character contains a base character.

   674 * accents the following base character will be returned

   675 * @param text string

   676 * @param textoffset current offset

   677 * @param textlength length of text string

   678 * @return the next base character or the current offset

   679 *         if the current character is contains a base character.

   680 */

   681 static

   682 inline int32_t getNextBaseOffset(const UChar       *text,

   683                                            int32_t  textoffset,

   684                                            int32_t      textlength)

   685 {

   686     if (textoffset < textlength) {

   687         int32_t temp = textoffset;

   688         if (getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) {

   689             while (temp < textlength) {

   690                 int32_t result = temp;

   691                 if ((getFCD(text, &temp, textlength) >>

   692                      SECOND_LAST_BYTE_SHIFT_) == 0) {

   693                     return result;

   694                 }

   695             }

   696             return textlength;

   697         }

   698     }

   699     return textoffset;

   700 }

   702 /**

   703 * Gets the next base character offset depending on the string search pattern

   704 * data

   705 * @param strsrch string search data

   706 * @param textoffset current offset, one offset away from the last character

   707 *                   to search for.

   708 * @return start index of the next base character or the current offset

   709 *         if the current character is contains a base character.

   710 */

   711 static

   712 inline int32_t getNextUStringSearchBaseOffset(UStringSearch *strsrch,

   713                                                   int32_t    textoffset)

   714 {

   715     int32_t textlength = strsrch->search->textLength;

   716     if (strsrch->pattern.hasSuffixAccents &&

   717         textoffset < textlength) {

   718               int32_t  temp       = textoffset;

   719         const UChar       *text       = strsrch->search->text;

   720         U16_BACK_1(text, 0, temp);

   721         if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {

   722             return getNextBaseOffset(text, textoffset, textlength);

   723         }

   724     }

   725     return textoffset;

   726 }

   728 /**

   729 * Shifting the collation element iterator position forward to prepare for

   730 * a following match. If the last character is a unsafe character, we'll only

   731 * shift by 1 to capture contractions, normalization etc.

   732 * Internal method, status assumed to be success.

   733 * @param text strsrch string search data

   734 * @param textoffset start text position to do search

   735 * @param ce the text ce which failed the match.

   736 * @param patternceindex index of the ce within the pattern ce buffer which

   737 *        failed the match

   738 * @return final offset

   739 */

   740 static

   741 inline int32_t shiftForward(UStringSearch *strsrch,

   742                                 int32_t    textoffset,

   743                                 int32_t       ce,

   744                                 int32_t        patternceindex)

   745 {

   746     UPattern *pattern = &(strsrch->pattern);

   747     if (ce != UCOL_NULLORDER) {

   748         int32_t shift = pattern->shift[hash(ce)];

   749         // this is to adjust for characters in the middle of the

   750         // substring for matching that failed.

   751         int32_t adjust = pattern->CELength - patternceindex;

   752         if (adjust > 1 && shift >= adjust) {

   753             shift -= adjust - 1;

   754         }

   755         textoffset += shift;

   756     }

   757     else {

   758         textoffset += pattern->defaultShiftSize;

   759     }

   761     textoffset = getNextUStringSearchBaseOffset(strsrch, textoffset);

   762     // check for unsafe characters

   763     // * if it is the start or middle of a contraction: to be done after

   764     //   a initial match is found

   765     // * thai or lao base consonant character: similar to contraction

   766     // * high surrogate character: similar to contraction

   767     // * next character is a accent: shift to the next base character

   768     return textoffset;

   769 }

   770 #endif // #if BOYER_MOORE

   772 /**

   773 * sets match not found

   774 * @param strsrch string search data

   775 */

   776 static

   777 inline void setMatchNotFound(UStringSearch *strsrch)

   778 {

   779     // this method resets the match result regardless of the error status.

   780     strsrch->search->matchedIndex = USEARCH_DONE;

   781     strsrch->search->matchedLength = 0;

   782     if (strsrch->search->isForwardSearching) {

   783         setColEIterOffset(strsrch->textIter, strsrch->search->textLength);

   784     }

   785     else {

   786         setColEIterOffset(strsrch->textIter, 0);

   787     }

   788 }

   790 #if BOYER_MOORE

   791 /**

   792 * Gets the offset to the next safe point in text.

   793 * ie. not the middle of a contraction, swappable characters or supplementary

   794 * characters.

   795 * @param collator collation sata

   796 * @param text string to work with

   797 * @param textoffset offset in string

   798 * @param textlength length of text string

   799 * @return offset to the next safe character

   800 */

   801 static

   802 inline int32_t getNextSafeOffset(const UCollator   *collator,

   803                                      const UChar       *text,

   804                                            int32_t  textoffset,

   805                                            int32_t      textlength)

   806 {

   807     int32_t result = textoffset; // first contraction character

   808     while (result != textlength && ucol_unsafeCP(text[result], collator)) {

   809         result ++;

   810     }

   811     return result;

   812 }

   814 /**

   815 * This checks for accents in the potential match started with a .

   816 * composite character.

   817 * This is really painful... we have to check that composite character do not

   818 * have any extra accents. We have to normalize the potential match and find

   819 * the immediate decomposed character before the match.

   820 * The first composite character would have been taken care of by the fcd

   821 * checks in checkForwardExactMatch.

   822 * This is the slow path after the fcd of the first character and

   823 * the last character has been checked by checkForwardExactMatch and we

   824 * determine that the potential match has extra non-ignorable preceding

   825 * ces.

   826 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,

   827 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA

   828 * Note here that accents checking are slow and cautioned in the API docs.

   829 * Internal method, status assumed to be a success, caller should check status

   830 * before calling this method

   831 * @param strsrch string search data

   832 * @param start index of the potential unfriendly composite character

   833 * @param end index of the potential unfriendly composite character

   834 * @param status output error status if any.

   835 * @return TRUE if there is non-ignorable accents before at the beginning

   836 *              of the match, FALSE otherwise.

   837 */

   839 static

   840 UBool checkExtraMatchAccents(const UStringSearch *strsrch, int32_t start,

   841                                    int32_t    end,

   842                                    UErrorCode    *status)

   843 {

   844     UBool result = FALSE;

   845     if (strsrch->pattern.hasPrefixAccents) {

   846               int32_t  length = end - start;

   847               int32_t  offset = 0;

   848         const UChar       *text   = strsrch->search->text + start;

   850         U16_FWD_1(text, offset, length);

   851         // we are only concerned with the first composite character

   852         if (unorm_quickCheck(text, offset, UNORM_NFD, status) == UNORM_NO) {

   853             int32_t safeoffset = getNextSafeOffset(strsrch->collator,

   854                                                        text, 0, length);

   855             if (safeoffset != length) {

   856                 safeoffset ++;

   857             }

   858             UChar   *norm = NULL;

   859             UChar    buffer[INITIAL_ARRAY_SIZE_];

   860             int32_t  size = unorm_normalize(text, safeoffset, UNORM_NFD, 0,

   861                                             buffer, INITIAL_ARRAY_SIZE_,

   862                                             status);

   863             if (U_FAILURE(*status)) {

   864                 return FALSE;

   865             }

   866             if (size >= INITIAL_ARRAY_SIZE_) {

   867                 norm = (UChar *)allocateMemory((size + 1) * sizeof(UChar),

   868                                                status);

   869                 // if allocation failed, status will be set to

   870                 // U_MEMORY_ALLOCATION_ERROR and unorm_normalize internally

   871                 // checks for it.

   872                 size = unorm_normalize(text, safeoffset, UNORM_NFD, 0, norm,

   873                                        size, status);

   874                 if (U_FAILURE(*status) && norm != NULL) {

   875                     uprv_free(norm);

   876                     return FALSE;

   877                 }

   878             }

   879             else {

   880                 norm = buffer;

   881             }

   883             UCollationElements *coleiter  = strsrch->utilIter;

   884             ucol_setText(coleiter, norm, size, status);

   885             uint32_t            firstce   = strsrch->pattern.CE[0];

   886             UBool               ignorable = TRUE;

   887             uint32_t            ce        = UCOL_IGNORABLE;

   888             while (U_SUCCESS(*status) && ce != firstce && ce != (uint32_t)UCOL_NULLORDER) {

   889                 offset = ucol_getOffset(coleiter);

   890                 if (ce != firstce && ce != UCOL_IGNORABLE) {

   891                     ignorable = FALSE;

   892                 }

   893                 ce = ucol_next(coleiter, status);

   894             }

   895             UChar32 codepoint;

   896             U16_PREV(norm, 0, offset, codepoint);

   897             result = !ignorable && (u_getCombiningClass(codepoint) != 0);

   899             if (norm != buffer) {

   900                 uprv_free(norm);

   901             }

   902         }

   903     }

   905     return result;

   906 }

   908 /**

   909 * Used by exact matches, checks if there are accents before the match.

   910 * This is really painful... we have to check that composite characters at

   911 * the start of the matches have to not have any extra accents.

   912 * We check the FCD of the character first, if it starts with an accent and

   913 * the first pattern ce does not match the first ce of the character, we bail.

   914 * Otherwise we try normalizing the first composite

   915 * character and find the immediate decomposed character before the match to

   916 * see if it is an non-ignorable accent.

   917 * Now normalizing the first composite character is enough because we ensure

   918 * that when the match is passed in here with extra beginning ces, the

   919 * first or last ce that match has to occur within the first character.

   920 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,

   921 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA

   922 * Note here that accents checking are slow and cautioned in the API docs.

   923 * @param strsrch string search data

   924 * @param start offset

   925 * @param end offset

   926 * @return TRUE if there are accents on either side of the match,

   927 *         FALSE otherwise

   928 */

   929 static

   930 UBool hasAccentsBeforeMatch(const UStringSearch *strsrch, int32_t start,

   931                                   int32_t    end)

   932 {

   933     if (strsrch->pattern.hasPrefixAccents) {

   934         UCollationElements *coleiter  = strsrch->textIter;

   935         UErrorCode          status    = U_ZERO_ERROR;

   936         // we have been iterating forwards previously

   937         uint32_t            ignorable = TRUE;

   938         int32_t             firstce   = strsrch->pattern.CE[0];

   940         setColEIterOffset(coleiter, start);

   941         int32_t ce  = getCE(strsrch, ucol_next(coleiter, &status));

   942         if (U_FAILURE(status)) {

   943             return TRUE;

   944         }

   945         while (ce != firstce) {

   946             if (ce != UCOL_IGNORABLE) {

   947                 ignorable = FALSE;

   948             }

   949             ce = getCE(strsrch, ucol_next(coleiter, &status));

   950             if (U_FAILURE(status) || ce == UCOL_NULLORDER) {

   951                 return TRUE;

   952             }

   953         }

   954         if (!ignorable && inNormBuf(coleiter)) {

   955             // within normalization buffer, discontiguous handled here

   956             return TRUE;

   957         }

   959         // within text

   960         int32_t temp = start;

   961         // original code

   962         // accent = (getFCD(strsrch->search->text, &temp,

   963         //                  strsrch->search->textLength)

   964         //            >> SECOND_LAST_BYTE_SHIFT_);

   965         // however this code does not work well with VC7 .net in release mode.

   966         // maybe the inlines for getFCD combined with shifting has bugs in

   967         // VC7. anyways this is a work around.

   968         UBool accent = getFCD(strsrch->search->text, &temp,

   969                               strsrch->search->textLength) > 0xFF;

   970         if (!accent) {

   971             return checkExtraMatchAccents(strsrch, start, end, &status);

   972         }

   973         if (!ignorable) {

   974             return TRUE;

   975         }

   976         if (start > 0) {

   977             temp = start;

   978             U16_BACK_1(strsrch->search->text, 0, temp);

   979             if (getFCD(strsrch->search->text, &temp,

   980                        strsrch->search->textLength) & LAST_BYTE_MASK_) {

   981                 setColEIterOffset(coleiter, start);

   982                 ce = ucol_previous(coleiter, &status);

   983                 if (U_FAILURE(status) ||

   984                     (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE)) {

   985                     return TRUE;

   986                 }

   987             }

   988         }

   989     }

   991     return FALSE;

   992 }

   994 /**

   995 * Used by exact matches, checks if there are accents bounding the match.

   996 * Note this is the initial boundary check. If the potential match

   997 * starts or ends with composite characters, the accents in those

   998 * characters will be determined later.

   999 * Not doing backwards iteration here, since discontiguos contraction for

  1000 * backwards collation element iterator, use up too many characters.

  1001 * E.g. looking for \u030A ring in \u01FA A ring above and acute,

  1002 * should fail since there is a acute at the end of \u01FA

  1003 * Note here that accents checking are slow and cautioned in the API docs.

  1004 * @param strsrch string search data

  1005 * @param start offset of match

  1006 * @param end end offset of the match

  1007 * @return TRUE if there are accents on either side of the match,

  1008 *         FALSE otherwise

  1009 */

  1010 static

  1011 UBool hasAccentsAfterMatch(const UStringSearch *strsrch, int32_t start,

  1012                                  int32_t    end)

  1013 {

  1014     if (strsrch->pattern.hasSuffixAccents) {

  1015         const UChar       *text       = strsrch->search->text;

  1016               int32_t  temp       = end;

  1017               int32_t      textlength = strsrch->search->textLength;

  1018         U16_BACK_1(text, 0, temp);

  1019         if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {

  1020             int32_t             firstce  = strsrch->pattern.CE[0];

  1021             UCollationElements *coleiter = strsrch->textIter;

  1022             UErrorCode          status   = U_ZERO_ERROR;

  1023             int32_t ce;

  1024             setColEIterOffset(coleiter, start);

  1025             while ((ce = getCE(strsrch, ucol_next(coleiter, &status))) != firstce) {

  1026                 if (U_FAILURE(status) || ce == UCOL_NULLORDER) {

  1027                     return TRUE;

  1028                 }

  1029             }

  1030             int32_t count = 1;

  1031             while (count < strsrch->pattern.CELength) {

  1032                 if (getCE(strsrch, ucol_next(coleiter, &status))

  1033                     == UCOL_IGNORABLE) {

  1034                     // Thai can give an ignorable here.

  1035                     count --;

  1036                 }

  1037                 if (U_FAILURE(status)) {

  1038                     return TRUE;

  1039                 }

  1040                 count ++;

  1041             }

  1043             ce = ucol_next(coleiter, &status);

  1044             if (U_FAILURE(status)) {

  1045                 return TRUE;

  1046             }

  1047             if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {

  1048                 ce = getCE(strsrch, ce);

  1049             }

  1050             if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {

  1051                 if (ucol_getOffset(coleiter) <= end) {

  1052                     return TRUE;

  1053                 }

  1054                 if (getFCD(text, &end, textlength) >> SECOND_LAST_BYTE_SHIFT_) {

  1055                     return TRUE;

  1056                 }

  1057             }

  1058         }

  1059     }

  1060     return FALSE;

  1061 }

  1062 #endif // #if BOYER_MOORE

  1064 /**

  1065 * Checks if the offset runs out of the text string

  1066 * @param offset

  1067 * @param textlength of the text string

  1068 * @return TRUE if offset is out of bounds, FALSE otherwise

  1069 */

  1070 static

  1071 inline UBool isOutOfBounds(int32_t textlength, int32_t offset)

  1072 {

  1073     return offset < 0 || offset > textlength;

  1074 }

  1076 /**

  1077 * Checks for identical match

  1078 * @param strsrch string search data

  1079 * @param start offset of possible match

  1080 * @param end offset of possible match

  1081 * @return TRUE if identical match is found

  1082 */

  1083 static

  1084 inline UBool checkIdentical(const UStringSearch *strsrch, int32_t start,

  1085                                   int32_t    end)

  1086 {

  1087     if (strsrch->strength != UCOL_IDENTICAL) {

  1088         return TRUE;

  1089     }

  1091     // Note: We could use Normalizer::compare() or similar, but for short strings

  1092     // which may not be in FCD it might be faster to just NFD them.

  1093     UErrorCode status = U_ZERO_ERROR;

  1094     UnicodeString t2, p2;

  1095     strsrch->nfd->normalize(

  1096         UnicodeString(FALSE, strsrch->search->text + start, end - start), t2, status);

  1097     strsrch->nfd->normalize(

  1098         UnicodeString(FALSE, strsrch->pattern.text, strsrch->pattern.textLength), p2, status);

  1099     // return FALSE if NFD failed

  1100     return U_SUCCESS(status) && t2 == p2;

  1101 }

  1103 #if BOYER_MOORE

  1104 /**

  1105 * Checks to see if the match is repeated

  1106 * @param strsrch string search data

  1107 * @param start new match start index

  1108 * @param end new match end index

  1109 * @return TRUE if the the match is repeated, FALSE otherwise

  1110 */

  1111 static

  1112 inline UBool checkRepeatedMatch(UStringSearch *strsrch,

  1113                                 int32_t    start,

  1114                                 int32_t    end)

  1115 {

  1116     int32_t lastmatchindex = strsrch->search->matchedIndex;

  1117     UBool       result;

  1118     if (lastmatchindex == USEARCH_DONE) {

  1119         return FALSE;

  1120     }

  1121     if (strsrch->search->isForwardSearching) {

  1122         result = start <= lastmatchindex;

  1123     }

  1124     else {

  1125         result = start >= lastmatchindex;

  1126     }

  1127     if (!result && !strsrch->search->isOverlap) {

  1128         if (strsrch->search->isForwardSearching) {

  1129             result = start < lastmatchindex + strsrch->search->matchedLength;

  1130         }

  1131         else {

  1132             result = end > lastmatchindex;

  1133         }

  1134     }

  1135     return result;

  1136 }

  1138 /**

  1139 * Gets the collation element iterator's current offset.

  1140 * @param coleiter collation element iterator

  1141 * @param forwards flag TRUE if we are moving in th forwards direction

  1142 * @return current offset

  1143 */

  1144 static

  1145 inline int32_t getColElemIterOffset(const UCollationElements *coleiter,

  1146                                               UBool               forwards)

  1147 {

  1148     int32_t result = ucol_getOffset(coleiter);

  1149     // intricacies of the the backwards collation element iterator

  1150     if (FALSE && !forwards && inNormBuf(coleiter) && !isFCDPointerNull(coleiter)) {

  1151         result ++;

  1152     }

  1153     return result;

  1154 }

  1156 /**

  1157 * Checks match for contraction.

  1158 * If the match ends with a partial contraction we fail.

  1159 * If the match starts too far off (because of backwards iteration) we try to

  1160 * chip off the extra characters depending on whether a breakiterator has

  1161 * been used.

  1162 * Internal method, error assumed to be success, caller has to check status

  1163 * before calling this method.

  1164 * @param strsrch string search data

  1165 * @param start offset of potential match, to be modified if necessary

  1166 * @param end offset of potential match, to be modified if necessary

  1167 * @param status output error status if any

  1168 * @return TRUE if match passes the contraction test, FALSE otherwise

  1169 */

  1171 static

  1172 UBool checkNextExactContractionMatch(UStringSearch *strsrch,

  1173                                      int32_t   *start,

  1174                                      int32_t   *end, UErrorCode  *status)

  1175 {

  1176           UCollationElements *coleiter   = strsrch->textIter;

  1177           int32_t             textlength = strsrch->search->textLength;

  1178           int32_t             temp       = *start;

  1179     const UCollator          *collator   = strsrch->collator;

  1180     const UChar              *text       = strsrch->search->text;

  1181     // This part checks if either ends of the match contains potential

  1182     // contraction. If so we'll have to iterate through them

  1183     // The start contraction needs to be checked since ucol_previous dumps

  1184     // all characters till the first safe character into the buffer.

  1185     // *start + 1 is used to test for the unsafe characters instead of *start

  1186     // because ucol_prev takes all unsafe characters till the first safe

  1187     // character ie *start. so by testing *start + 1, we can estimate if

  1188     // excess prefix characters has been included in the potential search

  1189     // results.

  1190     if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) ||

  1191         (*start + 1 < textlength

  1192          && ucol_unsafeCP(text[*start + 1], collator))) {

  1193         int32_t expansion  = getExpansionPrefix(coleiter);

  1194         UBool   expandflag = expansion > 0;

  1195         setColEIterOffset(coleiter, *start);

  1196         while (expansion > 0) {

  1197             // getting rid of the redundant ce, caused by setOffset.

  1198             // since backward contraction/expansion may have extra ces if we

  1199             // are in the normalization buffer, hasAccentsBeforeMatch would

  1200             // have taken care of it.

  1201             // E.g. the character \u01FA will have an expansion of 3, but if

  1202             // we are only looking for acute and ring \u030A and \u0301, we'll

  1203             // have to skip the first ce in the expansion buffer.

  1204             ucol_next(coleiter, status);

  1205             if (U_FAILURE(*status)) {

  1206                 return FALSE;

  1207             }

  1208             if (ucol_getOffset(coleiter) != temp) {

  1209                 *start = temp;

  1210                 temp  = ucol_getOffset(coleiter);

  1211             }

  1212             expansion --;

  1213         }

  1215         int32_t  *patternce       = strsrch->pattern.CE;

  1216         int32_t   patterncelength = strsrch->pattern.CELength;

  1217         int32_t   count           = 0;

  1218         while (count < patterncelength) {

  1219             int32_t ce = getCE(strsrch, ucol_next(coleiter, status));

  1220             if (ce == UCOL_IGNORABLE) {

  1221                 continue;

  1222             }

  1223             if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {

  1224                 *start = temp;

  1225                 temp   = ucol_getOffset(coleiter);

  1226             }

  1227             if (U_FAILURE(*status) || ce != patternce[count]) {

  1228                 (*end) ++;

  1229                 *end = getNextUStringSearchBaseOffset(strsrch, *end);

  1230                 return FALSE;

  1231             }

  1232             count ++;

  1233         }

  1234     }

  1235     return TRUE;

  1236 }

  1238 /**

  1239 * Checks and sets the match information if found.

  1240 * Checks

  1241 * <ul>

  1242 * <li> the potential match does not repeat the previous match

  1243 * <li> boundaries are correct

  1244 * <li> exact matches has no extra accents

  1245 * <li> identical matchesb

  1246 * <li> potential match does not end in the middle of a contraction

  1247 * <\ul>

  1248 * Otherwise the offset will be shifted to the next character.

  1249 * Internal method, status assumed to be success, caller has to check status

  1250 * before calling this method.

  1251 * @param strsrch string search data

  1252 * @param textoffset offset in the collation element text. the returned value

  1253 *        will be the truncated end offset of the match or the new start

  1254 *        search offset.

  1255 * @param status output error status if any

  1256 * @return TRUE if the match is valid, FALSE otherwise

  1257 */

  1258 static

  1259 inline UBool checkNextExactMatch(UStringSearch *strsrch,

  1260                                  int32_t   *textoffset, UErrorCode *status)

  1261 {

  1262     UCollationElements *coleiter = strsrch->textIter;

  1263     int32_t         start    = getColElemIterOffset(coleiter, FALSE);

  1265     if (!checkNextExactContractionMatch(strsrch, &start, textoffset, status)) {

  1266         return FALSE;

  1267     }

  1269     // this totally matches, however we need to check if it is repeating

  1270     if (!isBreakUnit(strsrch, start, *textoffset) ||

  1271         checkRepeatedMatch(strsrch, start, *textoffset) ||

  1272         hasAccentsBeforeMatch(strsrch, start, *textoffset) ||

  1273         !checkIdentical(strsrch, start, *textoffset) ||

  1274         hasAccentsAfterMatch(strsrch, start, *textoffset)) {

  1276         (*textoffset) ++;

  1277         *textoffset = getNextUStringSearchBaseOffset(strsrch, *textoffset);

  1278         return FALSE;

  1279     }

  1281     //Add breakiterator boundary check for primary strength search.

  1282     if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) {

  1283         checkBreakBoundary(strsrch, &start, textoffset);

  1284     }

  1286     // totally match, we will get rid of the ending ignorables.

  1287     strsrch->search->matchedIndex  = start;

  1288     strsrch->search->matchedLength = *textoffset - start;

  1289     return TRUE;

  1290 }

  1292 /**

  1293 * Getting the previous base character offset, or the current offset if the

  1294 * current character is a base character

  1295 * @param text string

  1296 * @param textoffset one offset after the current character

  1297 * @return the offset of the next character after the base character or the first

  1298 *         composed character with accents

  1299 */

  1300 static

  1301 inline int32_t getPreviousBaseOffset(const UChar       *text,

  1302                                                int32_t  textoffset)

  1303 {

  1304     if (textoffset > 0) {

  1305         for (;;) {

  1306             int32_t result = textoffset;

  1307             U16_BACK_1(text, 0, textoffset);

  1308             int32_t temp = textoffset;

  1309             uint16_t fcd = getFCD(text, &temp, result);

  1310             if ((fcd >> SECOND_LAST_BYTE_SHIFT_) == 0) {

  1311                 if (fcd & LAST_BYTE_MASK_) {

  1312                     return textoffset;

  1313                 }

  1314                 return result;

  1315             }

  1316             if (textoffset == 0) {

  1317                 return 0;

  1318             }

  1319         }

  1320     }

  1321     return textoffset;

  1322 }

  1324 /**

  1325 * Getting the indexes of the accents that are not blocked in the argument

  1326 * accent array

  1327 * @param accents array of accents in nfd terminated by a 0.

  1328 * @param accentsindex array of indexes of the accents that are not blocked

  1329 */

  1330 static

  1331 inline int getUnblockedAccentIndex(UChar *accents, int32_t *accentsindex)

  1332 {

  1333     int32_t index     = 0;

  1334     int32_t     length    = u_strlen(accents);

  1335     UChar32     codepoint = 0;

  1336     int         cclass    = 0;

  1337     int         result    = 0;

  1338     int32_t temp;

  1339     while (index < length) {

  1340         temp = index;

  1341         U16_NEXT(accents, index, length, codepoint);

  1342         if (u_getCombiningClass(codepoint) != cclass) {

  1343             cclass        = u_getCombiningClass(codepoint);

  1344             accentsindex[result] = temp;

  1345             result ++;

  1346         }

  1347     }

  1348     accentsindex[result] = length;

  1349     return result;

  1350 }

  1352 /**

  1353 * Appends 3 UChar arrays to a destination array.

  1354 * Creates a new array if we run out of space. The caller will have to

  1355 * manually deallocate the newly allocated array.

  1356 * Internal method, status assumed to be success, caller has to check status

  1357 * before calling this method. destination not to be NULL and has at least

  1358 * size destinationlength.

  1359 * @param destination target array

  1360 * @param destinationlength target array size, returning the appended length

  1361 * @param source1 null-terminated first array

  1362 * @param source2 second array

  1363 * @param source2length length of seond array

  1364 * @param source3 null-terminated third array

  1365 * @param status error status if any

  1366 * @return new destination array, destination if there was no new allocation

  1367 */

  1368 static

  1369 inline UChar * addToUCharArray(      UChar      *destination,

  1370                                      int32_t    *destinationlength,

  1371                                const UChar      *source1,

  1372                                const UChar      *source2,

  1373                                      int32_t     source2length,

  1374                                const UChar      *source3,

  1375                                      UErrorCode *status)

  1376 {

  1377     int32_t source1length = source1 ? u_strlen(source1) : 0;

  1378     int32_t source3length = source3 ? u_strlen(source3) : 0;

  1379     if (*destinationlength < source1length + source2length + source3length +

  1380                                                                            1)

  1381     {

  1382         destination = (UChar *)allocateMemory(

  1383           (source1length + source2length + source3length + 1) * sizeof(UChar),

  1384           status);

  1385         // if error allocating memory, status will be

  1386         // U_MEMORY_ALLOCATION_ERROR

  1387         if (U_FAILURE(*status)) {

  1388             *destinationlength = 0;

  1389             return NULL;

  1390         }

  1391     }

  1392     if (source1length != 0) {

  1393         uprv_memcpy(destination, source1, sizeof(UChar) * source1length);

  1394     }

  1395     if (source2length != 0) {

  1396         uprv_memcpy(destination + source1length, source2,

  1397                     sizeof(UChar) * source2length);

  1398     }

  1399     if (source3length != 0) {

  1400         uprv_memcpy(destination + source1length + source2length, source3,

  1401                     sizeof(UChar) * source3length);

  1402     }

  1403     *destinationlength = source1length + source2length + source3length;

  1404     return destination;

  1405 }

  1407 /**

  1408 * Running through a collation element iterator to see if the contents matches

  1409 * pattern in string search data

  1410 * @param strsrch string search data

  1411 * @param coleiter collation element iterator

  1412 * @return TRUE if a match if found, FALSE otherwise

  1413 */

  1414 static

  1415 inline UBool checkCollationMatch(const UStringSearch      *strsrch,

  1416                                        UCollationElements *coleiter)

  1417 {

  1418     int         patternceindex = strsrch->pattern.CELength;

  1419     int32_t    *patternce      = strsrch->pattern.CE;

  1420     UErrorCode  status = U_ZERO_ERROR;

  1421     while (patternceindex > 0) {

  1422         int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));

  1423         if (ce == UCOL_IGNORABLE) {

  1424             continue;

  1425         }

  1426         if (U_FAILURE(status) || ce != *patternce) {

  1427             return FALSE;

  1428         }

  1429         patternce ++;

  1430         patternceindex --;

  1431     }

  1432     return TRUE;

  1433 }

  1435 /**

  1436 * Rearranges the front accents to try matching.

  1437 * Prefix accents in the text will be grouped according to their combining

  1438 * class and the groups will be mixed and matched to try find the perfect

  1439 * match with the pattern.

  1440 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"

  1441 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings

  1442 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",

  1443 *         "\u0301\u0325".

  1444 * step 2: check if any of the generated substrings matches the pattern.

  1445 * Internal method, status is assumed to be success, caller has to check status

  1446 * before calling this method.

  1447 * @param strsrch string search match

  1448 * @param start first offset of the accents to start searching

  1449 * @param end start of the last accent set

  1450 * @param status output error status if any

  1451 * @return USEARCH_DONE if a match is not found, otherwise return the starting

  1452 *         offset of the match. Note this start includes all preceding accents.

  1453 */

  1454 static

  1455 int32_t doNextCanonicalPrefixMatch(UStringSearch *strsrch,

  1456                                        int32_t    start,

  1457                                        int32_t    end,

  1458                                        UErrorCode    *status)

  1459 {

  1460     const UChar       *text       = strsrch->search->text;

  1461           int32_t      textlength = strsrch->search->textLength;

  1462           int32_t  tempstart  = start;

  1464     if ((getFCD(text, &tempstart, textlength) & LAST_BYTE_MASK_) == 0) {

  1465         // die... failed at a base character

  1466         return USEARCH_DONE;

  1467     }

  1469     int32_t offset = getNextBaseOffset(text, tempstart, textlength);

  1470     start = getPreviousBaseOffset(text, tempstart);

  1472     UChar       accents[INITIAL_ARRAY_SIZE_];

  1473     // normalizing the offensive string

  1474     unorm_normalize(text + start, offset - start, UNORM_NFD, 0, accents,

  1475                     INITIAL_ARRAY_SIZE_, status);

  1476     if (U_FAILURE(*status)) {

  1477         return USEARCH_DONE;

  1478     }

  1480     int32_t         accentsindex[INITIAL_ARRAY_SIZE_];

  1481     int32_t         accentsize = getUnblockedAccentIndex(accents,

  1482                                                                  accentsindex);

  1483     int32_t         count      = (2 << (accentsize - 1)) - 1;

  1484     UChar               buffer[INITIAL_ARRAY_SIZE_];

  1485     UCollationElements *coleiter   = strsrch->utilIter;

  1486     while (U_SUCCESS(*status) && count > 0) {

  1487         UChar *rearrange = strsrch->canonicalPrefixAccents;

  1488         // copy the base characters

  1489         for (int k = 0; k < accentsindex[0]; k ++) {

  1490             *rearrange ++ = accents[k];

  1491         }

  1492         // forming all possible canonical rearrangement by dropping

  1493         // sets of accents

  1494         for (int i = 0; i <= accentsize - 1; i ++) {

  1495             int32_t mask = 1 << (accentsize - i - 1);

  1496             if (count & mask) {

  1497                 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {

  1498                     *rearrange ++ = accents[j];

  1499                 }

  1500             }

  1501         }

  1502         *rearrange = 0;

  1503         int32_t  matchsize = INITIAL_ARRAY_SIZE_;

  1504         UChar   *match     = addToUCharArray(buffer, &matchsize,

  1505                                            strsrch->canonicalPrefixAccents,

  1506                                            strsrch->search->text + offset,

  1507                                            end - offset,

  1508                                            strsrch->canonicalSuffixAccents,

  1509                                            status);

  1511         // if status is a failure, ucol_setText does nothing.

  1512         // run the collator iterator through this match

  1513         ucol_setText(coleiter, match, matchsize, status);

  1514         if (U_SUCCESS(*status)) {

  1515             if (checkCollationMatch(strsrch, coleiter)) {

  1516                 if (match != buffer) {

  1517                     uprv_free(match);

  1518                 }

  1519                 return start;

  1520             }

  1521         }

  1522         count --;

  1523     }

  1524     return USEARCH_DONE;

  1525 }

  1527 /**

  1528 * Gets the offset to the safe point in text before textoffset.

  1529 * ie. not the middle of a contraction, swappable characters or supplementary

  1530 * characters.

  1531 * @param collator collation sata

  1532 * @param text string to work with

  1533 * @param textoffset offset in string

  1534 * @param textlength length of text string

  1535 * @return offset to the previous safe character

  1536 */

  1537 static

  1538 inline uint32_t getPreviousSafeOffset(const UCollator   *collator,

  1539                                       const UChar       *text,

  1540                                             int32_t  textoffset)

  1541 {

  1542     int32_t result = textoffset; // first contraction character

  1543     while (result != 0 && ucol_unsafeCP(text[result - 1], collator)) {

  1544         result --;

  1545     }

  1546     if (result != 0) {

  1547         // the first contraction character is consider unsafe here

  1548         result --;

  1549     }

  1550     return result;

  1551 }

  1553 /**

  1554 * Cleaning up after we passed the safe zone

  1555 * @param strsrch string search data

  1556 * @param safetext safe text array

  1557 * @param safebuffer safe text buffer

  1558 * @param coleiter collation element iterator for safe text

  1559 */

  1560 static

  1561 inline void cleanUpSafeText(const UStringSearch *strsrch, UChar *safetext,

  1562                                   UChar         *safebuffer)

  1563 {

  1564     if (safetext != safebuffer && safetext != strsrch->canonicalSuffixAccents)

  1565     {

  1566        uprv_free(safetext);

  1567     }

  1568 }

  1570 /**

  1571 * Take the rearranged end accents and tries matching. If match failed at

  1572 * a seperate preceding set of accents (seperated from the rearranged on by

  1573 * at least a base character) then we rearrange the preceding accents and

  1574 * tries matching again.

  1575 * We allow skipping of the ends of the accent set if the ces do not match.

  1576 * However if the failure is found before the accent set, it fails.

  1577 * Internal method, status assumed to be success, caller has to check status

  1578 * before calling this method.

  1579 * @param strsrch string search data

  1580 * @param textoffset of the start of the rearranged accent

  1581 * @param status output error status if any

  1582 * @return USEARCH_DONE if a match is not found, otherwise return the starting

  1583 *         offset of the match. Note this start includes all preceding accents.

  1584 */

  1585 static

  1586 int32_t doNextCanonicalSuffixMatch(UStringSearch *strsrch,

  1587                                        int32_t    textoffset,

  1588                                        UErrorCode    *status)

  1589 {

  1590     const UChar              *text           = strsrch->search->text;

  1591     const UCollator          *collator       = strsrch->collator;

  1592           int32_t             safelength     = 0;

  1593           UChar              *safetext;

  1594           int32_t             safetextlength;

  1595           UChar               safebuffer[INITIAL_ARRAY_SIZE_];

  1596           UCollationElements *coleiter       = strsrch->utilIter;

  1597           int32_t         safeoffset     = textoffset;

  1599     if (textoffset != 0 && ucol_unsafeCP(strsrch->canonicalSuffixAccents[0],

  1600                                          collator)) {

  1601         safeoffset     = getPreviousSafeOffset(collator, text, textoffset);

  1602         safelength     = textoffset - safeoffset;

  1603         safetextlength = INITIAL_ARRAY_SIZE_;

  1604         safetext       = addToUCharArray(safebuffer, &safetextlength, NULL,

  1605                                          text + safeoffset, safelength,

  1606                                          strsrch->canonicalSuffixAccents,

  1607                                          status);

  1608     }

  1609     else {

  1610         safetextlength = u_strlen(strsrch->canonicalSuffixAccents);

  1611         safetext       = strsrch->canonicalSuffixAccents;

  1612     }

  1614     // if status is a failure, ucol_setText does nothing

  1615     ucol_setText(coleiter, safetext, safetextlength, status);

  1616     // status checked in loop below

  1618     int32_t  *ce        = strsrch->pattern.CE;

  1619     int32_t   celength  = strsrch->pattern.CELength;

  1620     int       ceindex   = celength - 1;

  1621     UBool     isSafe    = TRUE; // indication flag for position in safe zone

  1623     while (ceindex >= 0) {

  1624         int32_t textce = ucol_previous(coleiter, status);

  1625         if (U_FAILURE(*status)) {

  1626             if (isSafe) {

  1627                 cleanUpSafeText(strsrch, safetext, safebuffer);

  1628             }

  1629             return USEARCH_DONE;

  1630         }

  1631         if (textce == UCOL_NULLORDER) {

  1632             // check if we have passed the safe buffer

  1633             if (coleiter == strsrch->textIter) {

  1634                 cleanUpSafeText(strsrch, safetext, safebuffer);

  1635                 return USEARCH_DONE;

  1636             }

  1637             cleanUpSafeText(strsrch, safetext, safebuffer);

  1638             safetext = safebuffer;

  1639             coleiter = strsrch->textIter;

  1640             setColEIterOffset(coleiter, safeoffset);

  1641             // status checked at the start of the loop

  1642             isSafe = FALSE;

  1643             continue;

  1644         }

  1645         textce = getCE(strsrch, textce);

  1646         if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {

  1647             // do the beginning stuff

  1648             int32_t failedoffset = getColElemIterOffset(coleiter, FALSE);

  1649             if (isSafe && failedoffset >= safelength) {

  1650                 // alas... no hope. failed at rearranged accent set

  1651                 cleanUpSafeText(strsrch, safetext, safebuffer);

  1652                 return USEARCH_DONE;

  1653             }

  1654             else {

  1655                 if (isSafe) {

  1656                     failedoffset += safeoffset;

  1657                     cleanUpSafeText(strsrch, safetext, safebuffer);

  1658                 }

  1660                 // try rearranging the front accents

  1661                 int32_t result = doNextCanonicalPrefixMatch(strsrch,

  1662                                         failedoffset, textoffset, status);

  1663                 if (result != USEARCH_DONE) {

  1664                     // if status is a failure, ucol_setOffset does nothing

  1665                     setColEIterOffset(strsrch->textIter, result);

  1666                 }

  1667                 if (U_FAILURE(*status)) {

  1668                     return USEARCH_DONE;

  1669                 }

  1670                 return result;

  1671             }

  1672         }

  1673         if (textce == ce[ceindex]) {

  1674             ceindex --;

  1675         }

  1676     }

  1677     // set offset here

  1678     if (isSafe) {

  1679         int32_t result     = getColElemIterOffset(coleiter, FALSE);

  1680         // sets the text iterator here with the correct expansion and offset

  1681         int32_t    leftoverces = getExpansionPrefix(coleiter);

  1682         cleanUpSafeText(strsrch, safetext, safebuffer);

  1683         if (result >= safelength) {

  1684             result = textoffset;

  1685         }

  1686         else {

  1687             result += safeoffset;

  1688         }

  1689         setColEIterOffset(strsrch->textIter, result);

  1690         strsrch->textIter->iteratordata_.toReturn =

  1691                        setExpansionPrefix(strsrch->textIter, leftoverces);

  1692         return result;

  1693     }

  1695     return ucol_getOffset(coleiter);

  1696 }

  1698 /**

  1699 * Trying out the substring and sees if it can be a canonical match.

  1700 * This will try normalizing the end accents and arranging them into canonical

  1701 * equivalents and check their corresponding ces with the pattern ce.

  1702 * Suffix accents in the text will be grouped according to their combining

  1703 * class and the groups will be mixed and matched to try find the perfect

  1704 * match with the pattern.

  1705 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"

  1706 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings

  1707 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",

  1708 *         "\u0301\u0325".

  1709 * step 2: check if any of the generated substrings matches the pattern.

  1710 * Internal method, status assumed to be success, caller has to check status

  1711 * before calling this method.

  1712 * @param strsrch string search data

  1713 * @param textoffset end offset in the collation element text that ends with

  1714 *                   the accents to be rearranged

  1715 * @param status error status if any

  1716 * @return TRUE if the match is valid, FALSE otherwise

  1717 */

  1718 static

  1719 UBool doNextCanonicalMatch(UStringSearch *strsrch,

  1720                            int32_t    textoffset,

  1721                            UErrorCode    *status)

  1722 {

  1723     const UChar       *text = strsrch->search->text;

  1724           int32_t  temp = textoffset;

  1725     U16_BACK_1(text, 0, temp);

  1726     if ((getFCD(text, &temp, textoffset) & LAST_BYTE_MASK_) == 0) {

  1727         UCollationElements *coleiter = strsrch->textIter;

  1728         int32_t         offset   = getColElemIterOffset(coleiter, FALSE);

  1729         if (strsrch->pattern.hasPrefixAccents) {

  1730             offset = doNextCanonicalPrefixMatch(strsrch, offset, textoffset,

  1731                                                 status);

  1732             if (U_SUCCESS(*status) && offset != USEARCH_DONE) {

  1733                 setColEIterOffset(coleiter, offset);

  1734                 return TRUE;

  1735             }

  1736         }

  1737         return FALSE;

  1738     }

  1740     if (!strsrch->pattern.hasSuffixAccents) {

  1741         return FALSE;

  1742     }

  1744     UChar       accents[INITIAL_ARRAY_SIZE_];

  1745     // offset to the last base character in substring to search

  1746     int32_t baseoffset = getPreviousBaseOffset(text, textoffset);

  1747     // normalizing the offensive string

  1748     unorm_normalize(text + baseoffset, textoffset - baseoffset, UNORM_NFD,

  1749                                0, accents, INITIAL_ARRAY_SIZE_, status);

  1750     // status checked in loop below

  1752     int32_t accentsindex[INITIAL_ARRAY_SIZE_];

  1753     int32_t size = getUnblockedAccentIndex(accents, accentsindex);

  1755     // 2 power n - 1 plus the full set of accents

  1756     int32_t  count = (2 << (size - 1)) - 1;

  1757     while (U_SUCCESS(*status) && count > 0) {

  1758         UChar *rearrange = strsrch->canonicalSuffixAccents;

  1759         // copy the base characters

  1760         for (int k = 0; k < accentsindex[0]; k ++) {

  1761             *rearrange ++ = accents[k];

  1762         }

  1763         // forming all possible canonical rearrangement by dropping

  1764         // sets of accents

  1765         for (int i = 0; i <= size - 1; i ++) {

  1766             int32_t mask = 1 << (size - i - 1);

  1767             if (count & mask) {

  1768                 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {

  1769                     *rearrange ++ = accents[j];

  1770                 }

  1771             }

  1772         }

  1773         *rearrange = 0;

  1774         int32_t offset = doNextCanonicalSuffixMatch(strsrch, baseoffset,

  1775                                                         status);

  1776         if (offset != USEARCH_DONE) {

  1777             return TRUE; // match found

  1778         }

  1779         count --;

  1780     }

  1781     return FALSE;

  1782 }

  1784 /**

  1785 * Gets the previous base character offset depending on the string search

  1786 * pattern data

  1787 * @param strsrch string search data

  1788 * @param textoffset current offset, current character

  1789 * @return the offset of the next character after this base character or itself

  1790 *         if it is a composed character with accents

  1791 */

  1792 static

  1793 inline int32_t getPreviousUStringSearchBaseOffset(UStringSearch *strsrch,

  1794                                                       int32_t textoffset)

  1795 {

  1796     if (strsrch->pattern.hasPrefixAccents && textoffset > 0) {

  1797         const UChar       *text = strsrch->search->text;

  1798               int32_t  offset = textoffset;

  1799         if (getFCD(text, &offset, strsrch->search->textLength) >>

  1800                                                    SECOND_LAST_BYTE_SHIFT_) {

  1801             return getPreviousBaseOffset(text, textoffset);

  1802         }

  1803     }

  1804     return textoffset;

  1805 }

  1807 /**

  1808 * Checks match for contraction.

  1809 * If the match ends with a partial contraction we fail.

  1810 * If the match starts too far off (because of backwards iteration) we try to

  1811 * chip off the extra characters

  1812 * Internal method, status assumed to be success, caller has to check status

  1813 * before calling this method.

  1814 * @param strsrch string search data

  1815 * @param start offset of potential match, to be modified if necessary

  1816 * @param end offset of potential match, to be modified if necessary

  1817 * @param status output error status if any

  1818 * @return TRUE if match passes the contraction test, FALSE otherwise

  1819 */

  1820 static

  1821 UBool checkNextCanonicalContractionMatch(UStringSearch *strsrch,

  1822                                          int32_t   *start,

  1823                                          int32_t   *end,

  1824                                          UErrorCode    *status)

  1825 {

  1826           UCollationElements *coleiter   = strsrch->textIter;

  1827           int32_t             textlength = strsrch->search->textLength;

  1828           int32_t         temp       = *start;

  1829     const UCollator          *collator   = strsrch->collator;

  1830     const UChar              *text       = strsrch->search->text;

  1831     // This part checks if either ends of the match contains potential

  1832     // contraction. If so we'll have to iterate through them

  1833     if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) ||

  1834         (*start + 1 < textlength

  1835          && ucol_unsafeCP(text[*start + 1], collator))) {

  1836         int32_t expansion  = getExpansionPrefix(coleiter);

  1837         UBool   expandflag = expansion > 0;

  1838         setColEIterOffset(coleiter, *start);

  1839         while (expansion > 0) {

  1840             // getting rid of the redundant ce, caused by setOffset.

  1841             // since backward contraction/expansion may have extra ces if we

  1842             // are in the normalization buffer, hasAccentsBeforeMatch would

  1843             // have taken care of it.

  1844             // E.g. the character \u01FA will have an expansion of 3, but if

  1845             // we are only looking for acute and ring \u030A and \u0301, we'll

  1846             // have to skip the first ce in the expansion buffer.

  1847             ucol_next(coleiter, status);

  1848             if (U_FAILURE(*status)) {

  1849                 return FALSE;

  1850             }

  1851             if (ucol_getOffset(coleiter) != temp) {

  1852                 *start = temp;

  1853                 temp  = ucol_getOffset(coleiter);

  1854             }

  1855             expansion --;

  1856         }

  1858         int32_t  *patternce       = strsrch->pattern.CE;

  1859         int32_t   patterncelength = strsrch->pattern.CELength;

  1860         int32_t   count           = 0;

  1861         int32_t   textlength      = strsrch->search->textLength;

  1862         while (count < patterncelength) {

  1863             int32_t ce = getCE(strsrch, ucol_next(coleiter, status));

  1864             // status checked below, note that if status is a failure

  1865             // ucol_next returns UCOL_NULLORDER

  1866             if (ce == UCOL_IGNORABLE) {

  1867                 continue;

  1868             }

  1869             if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {

  1870                 *start = temp;

  1871                 temp   = ucol_getOffset(coleiter);

  1872             }

  1874             if (count == 0 && ce != patternce[0]) {

  1875                 // accents may have extra starting ces, this occurs when a

  1876                 // pure accent pattern is matched without rearrangement

  1877                 // text \u0325\u0300 and looking for \u0300

  1878                 int32_t expected = patternce[0];

  1879                 if (getFCD(text, start, textlength) & LAST_BYTE_MASK_) {

  1880                     ce = getCE(strsrch, ucol_next(coleiter, status));

  1881                     while (U_SUCCESS(*status) && ce != expected &&

  1882                            ce != UCOL_NULLORDER &&

  1883                            ucol_getOffset(coleiter) <= *end) {

  1884                         ce = getCE(strsrch, ucol_next(coleiter, status));

  1885                     }

  1886                 }

  1887             }

  1888             if (U_FAILURE(*status) || ce != patternce[count]) {

  1889                 (*end) ++;

  1890                 *end = getNextUStringSearchBaseOffset(strsrch, *end);

  1891                 return FALSE;

  1892             }

  1893             count ++;

  1894         }

  1895     }

  1896     return TRUE;

  1897 }

  1899 /**

  1900 * Checks and sets the match information if found.

  1901 * Checks

  1902 * <ul>

  1903 * <li> the potential match does not repeat the previous match

  1904 * <li> boundaries are correct

  1905 * <li> potential match does not end in the middle of a contraction

  1906 * <li> identical matches

  1907 * <\ul>

  1908 * Otherwise the offset will be shifted to the next character.

  1909 * Internal method, status assumed to be success, caller has to check the

  1910 * status before calling this method.

  1911 * @param strsrch string search data

  1912 * @param textoffset offset in the collation element text. the returned value

  1913 *        will be the truncated end offset of the match or the new start

  1914 *        search offset.

  1915 * @param status output error status if any

  1916 * @return TRUE if the match is valid, FALSE otherwise

  1917 */

  1918 static

  1919 inline UBool checkNextCanonicalMatch(UStringSearch *strsrch,

  1920                                      int32_t   *textoffset,

  1921                                      UErrorCode    *status)

  1922 {

  1923     // to ensure that the start and ends are not composite characters

  1924     UCollationElements *coleiter = strsrch->textIter;

  1925     // if we have a canonical accent match

  1926     if ((strsrch->pattern.hasSuffixAccents &&

  1927         strsrch->canonicalSuffixAccents[0]) ||

  1928         (strsrch->pattern.hasPrefixAccents &&

  1929         strsrch->canonicalPrefixAccents[0])) {

  1930         strsrch->search->matchedIndex  = getPreviousUStringSearchBaseOffset(

  1931                                                     strsrch,

  1932                                                     ucol_getOffset(coleiter));

  1933         strsrch->search->matchedLength = *textoffset -

  1934                                                 strsrch->search->matchedIndex;

  1935         return TRUE;

  1936     }

  1938     int32_t start = getColElemIterOffset(coleiter, FALSE);

  1939     if (!checkNextCanonicalContractionMatch(strsrch, &start, textoffset,

  1940                                             status) || U_FAILURE(*status)) {

  1941         return FALSE;

  1942     }

  1944     start = getPreviousUStringSearchBaseOffset(strsrch, start);

  1945     // this totally matches, however we need to check if it is repeating

  1946     if (checkRepeatedMatch(strsrch, start, *textoffset) ||

  1947         !isBreakUnit(strsrch, start, *textoffset) ||

  1948         !checkIdentical(strsrch, start, *textoffset)) {

  1949         (*textoffset) ++;

  1950         *textoffset = getNextBaseOffset(strsrch->search->text, *textoffset,

  1951                                         strsrch->search->textLength);

  1952         return FALSE;

  1953     }

  1955     strsrch->search->matchedIndex  = start;

  1956     strsrch->search->matchedLength = *textoffset - start;

  1957     return TRUE;

  1958 }

  1960 /**

  1961 * Shifting the collation element iterator position forward to prepare for

  1962 * a preceding match. If the first character is a unsafe character, we'll only

  1963 * shift by 1 to capture contractions, normalization etc.

  1964 * Internal method, status assumed to be success, caller has to check status

  1965 * before calling this method.

  1966 * @param text strsrch string search data

  1967 * @param textoffset start text position to do search

  1968 * @param ce the text ce which failed the match.

  1969 * @param patternceindex index of the ce within the pattern ce buffer which

  1970 *        failed the match

  1971 * @return final offset

  1972 */

  1973 static

  1974 inline int32_t reverseShift(UStringSearch *strsrch,

  1975                                 int32_t    textoffset,

  1976                                 int32_t       ce,

  1977                                 int32_t        patternceindex)

  1978 {

  1979     if (strsrch->search->isOverlap) {

  1980         if (textoffset != strsrch->search->textLength) {

  1981             textoffset --;

  1982         }

  1983         else {

  1984             textoffset -= strsrch->pattern.defaultShiftSize;

  1985         }

  1986     }

  1987     else {

  1988         if (ce != UCOL_NULLORDER) {

  1989             int32_t shift = strsrch->pattern.backShift[hash(ce)];

  1991             // this is to adjust for characters in the middle of the substring

  1992             // for matching that failed.

  1993             int32_t adjust = patternceindex;

  1994             if (adjust > 1 && shift > adjust) {

  1995                 shift -= adjust - 1;

  1996             }

  1997             textoffset -= shift;

  1998         }

  1999         else {

  2000             textoffset -= strsrch->pattern.defaultShiftSize;

  2001         }

  2002     }

  2003     textoffset = getPreviousUStringSearchBaseOffset(strsrch, textoffset);

  2004     return textoffset;

  2005 }

  2007 /**

  2008 * Checks match for contraction.

  2009 * If the match starts with a partial contraction we fail.

  2010 * Internal method, status assumed to be success, caller has to check status

  2011 * before calling this method.

  2012 * @param strsrch string search data

  2013 * @param start offset of potential match, to be modified if necessary

  2014 * @param end offset of potential match, to be modified if necessary

  2015 * @param status output error status if any

  2016 * @return TRUE if match passes the contraction test, FALSE otherwise

  2017 */

  2018 static

  2019 UBool checkPreviousExactContractionMatch(UStringSearch *strsrch,

  2020                                      int32_t   *start,

  2021                                      int32_t   *end, UErrorCode  *status)

  2022 {

  2023           UCollationElements *coleiter   = strsrch->textIter;

  2024           int32_t             textlength = strsrch->search->textLength;

  2025           int32_t             temp       = *end;

  2026     const UCollator          *collator   = strsrch->collator;

  2027     const UChar              *text       = strsrch->search->text;

  2028     // This part checks if either if the start of the match contains potential

  2029     // contraction. If so we'll have to iterate through them

  2030     // Since we used ucol_next while previously looking for the potential

  2031     // match, this guarantees that our end will not be a partial contraction,

  2032     // or a partial supplementary character.

  2033     if (*start < textlength && ucol_unsafeCP(text[*start], collator)) {

  2034         int32_t expansion  = getExpansionSuffix(coleiter);

  2035         UBool   expandflag = expansion > 0;

  2036         setColEIterOffset(coleiter, *end);

  2037         while (U_SUCCESS(*status) && expansion > 0) {

  2038             // getting rid of the redundant ce

  2039             // since forward contraction/expansion may have extra ces

  2040             // if we are in the normalization buffer, hasAccentsBeforeMatch

  2041             // would have taken care of it.

  2042             // E.g. the character \u01FA will have an expansion of 3, but if

  2043             // we are only looking for A ring A\u030A, we'll have to skip the

  2044             // last ce in the expansion buffer

  2045             ucol_previous(coleiter, status);

  2046             if (U_FAILURE(*status)) {

  2047                 return FALSE;

  2048             }

  2049             if (ucol_getOffset(coleiter) != temp) {

  2050                 *end = temp;

  2051                 temp  = ucol_getOffset(coleiter);

  2052             }

  2053             expansion --;

  2054         }

  2056         int32_t  *patternce       = strsrch->pattern.CE;

  2057         int32_t   patterncelength = strsrch->pattern.CELength;

  2058         int32_t   count           = patterncelength;

  2059         while (count > 0) {

  2060             int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));

  2061             // status checked below, note that if status is a failure

  2062             // ucol_previous returns UCOL_NULLORDER

  2063             if (ce == UCOL_IGNORABLE) {

  2064                 continue;

  2065             }

  2066             if (expandflag && count == 0 &&

  2067                 getColElemIterOffset(coleiter, FALSE) != temp) {

  2068                 *end = temp;

  2069                 temp  = ucol_getOffset(coleiter);

  2070             }

  2071             if (U_FAILURE(*status) || ce != patternce[count - 1]) {

  2072                 (*start) --;

  2073                 *start = getPreviousBaseOffset(text, *start);

  2074                 return FALSE;

  2075             }

  2076             count --;

  2077         }

  2078     }

  2079     return TRUE;

  2080 }

  2082 /**

  2083 * Checks and sets the match information if found.

  2084 * Checks

  2085 * <ul>

  2086 * <li> the current match does not repeat the last match

  2087 * <li> boundaries are correct

  2088 * <li> exact matches has no extra accents

  2089 * <li> identical matches

  2090 * <\ul>

  2091 * Otherwise the offset will be shifted to the preceding character.

  2092 * Internal method, status assumed to be success, caller has to check status

  2093 * before calling this method.

  2094 * @param strsrch string search data

  2095 * @param collator

  2096 * @param coleiter collation element iterator

  2097 * @param text string

  2098 * @param textoffset offset in the collation element text. the returned value

  2099 *        will be the truncated start offset of the match or the new start

  2100 *        search offset.

  2101 * @param status output error status if any

  2102 * @return TRUE if the match is valid, FALSE otherwise

  2103 */

  2104 static

  2105 inline UBool checkPreviousExactMatch(UStringSearch *strsrch,

  2106                                      int32_t   *textoffset,

  2107                                      UErrorCode    *status)

  2108 {

  2109     // to ensure that the start and ends are not composite characters

  2110     int32_t end = ucol_getOffset(strsrch->textIter);

  2111     if (!checkPreviousExactContractionMatch(strsrch, textoffset, &end, status)

  2112         || U_FAILURE(*status)) {

  2113             return FALSE;

  2114     }

  2116     // this totally matches, however we need to check if it is repeating

  2117     // the old match

  2118     if (checkRepeatedMatch(strsrch, *textoffset, end) ||

  2119         !isBreakUnit(strsrch, *textoffset, end) ||

  2120         hasAccentsBeforeMatch(strsrch, *textoffset, end) ||

  2121         !checkIdentical(strsrch, *textoffset, end) ||

  2122         hasAccentsAfterMatch(strsrch, *textoffset, end)) {

  2123         (*textoffset) --;

  2124         *textoffset = getPreviousBaseOffset(strsrch->search->text,

  2125                                             *textoffset);

  2126         return FALSE;

  2127     }

  2129     //Add breakiterator boundary check for primary strength search.

  2130     if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) {

  2131         checkBreakBoundary(strsrch, textoffset, &end);

  2132     }

  2134     strsrch->search->matchedIndex = *textoffset;

  2135     strsrch->search->matchedLength = end - *textoffset;

  2136     return TRUE;

  2137 }

  2139 /**

  2140 * Rearranges the end accents to try matching.

  2141 * Suffix accents in the text will be grouped according to their combining

  2142 * class and the groups will be mixed and matched to try find the perfect

  2143 * match with the pattern.

  2144 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"

  2145 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings

  2146 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",

  2147 *         "\u0301\u0325".

  2148 * step 2: check if any of the generated substrings matches the pattern.

  2149 * Internal method, status assumed to be success, user has to check status

  2150 * before calling this method.

  2151 * @param strsrch string search match

  2152 * @param start offset of the first base character

  2153 * @param end start of the last accent set

  2154 * @param status only error status if any

  2155 * @return USEARCH_DONE if a match is not found, otherwise return the ending

  2156 *         offset of the match. Note this start includes all following accents.

  2157 */

  2158 static

  2159 int32_t doPreviousCanonicalSuffixMatch(UStringSearch *strsrch,

  2160                                            int32_t    start,

  2161                                            int32_t    end,

  2162                                            UErrorCode    *status)

  2163 {

  2164     const UChar       *text       = strsrch->search->text;

  2165           int32_t  tempend    = end;

  2167     U16_BACK_1(text, 0, tempend);

  2168     if (!(getFCD(text, &tempend, strsrch->search->textLength) &

  2169                                                            LAST_BYTE_MASK_)) {

  2170         // die... failed at a base character

  2171         return USEARCH_DONE;

  2172     }

  2173     end = getNextBaseOffset(text, end, strsrch->search->textLength);

  2175     if (U_SUCCESS(*status)) {

  2176         UChar       accents[INITIAL_ARRAY_SIZE_];

  2177         int32_t offset = getPreviousBaseOffset(text, end);

  2178         // normalizing the offensive string

  2179         unorm_normalize(text + offset, end - offset, UNORM_NFD, 0, accents,

  2180                         INITIAL_ARRAY_SIZE_, status);

  2182         int32_t         accentsindex[INITIAL_ARRAY_SIZE_];

  2183         int32_t         accentsize = getUnblockedAccentIndex(accents,

  2184                                                          accentsindex);

  2185         int32_t         count      = (2 << (accentsize - 1)) - 1;

  2186         UChar               buffer[INITIAL_ARRAY_SIZE_];

  2187         UCollationElements *coleiter = strsrch->utilIter;

  2188         while (U_SUCCESS(*status) && count > 0) {

  2189             UChar *rearrange = strsrch->canonicalSuffixAccents;

  2190             // copy the base characters

  2191             for (int k = 0; k < accentsindex[0]; k ++) {

  2192                 *rearrange ++ = accents[k];

  2193             }

  2194             // forming all possible canonical rearrangement by dropping

  2195             // sets of accents

  2196             for (int i = 0; i <= accentsize - 1; i ++) {

  2197                 int32_t mask = 1 << (accentsize - i - 1);

  2198                 if (count & mask) {

  2199                     for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {

  2200                         *rearrange ++ = accents[j];

  2201                     }

  2202                 }

  2203             }

  2204             *rearrange = 0;

  2205             int32_t  matchsize = INITIAL_ARRAY_SIZE_;

  2206             UChar   *match     = addToUCharArray(buffer, &matchsize,

  2207                                            strsrch->canonicalPrefixAccents,

  2208                                            strsrch->search->text + start,

  2209                                            offset - start,

  2210                                            strsrch->canonicalSuffixAccents,

  2211                                            status);

  2213             // run the collator iterator through this match

  2214             // if status is a failure ucol_setText does nothing

  2215             ucol_setText(coleiter, match, matchsize, status);

  2216             if (U_SUCCESS(*status)) {

  2217                 if (checkCollationMatch(strsrch, coleiter)) {

  2218                     if (match != buffer) {

  2219                         uprv_free(match);

  2220                     }

  2221                     return end;

  2222                 }

  2223             }

  2224             count --;

  2225         }

  2226     }

  2227     return USEARCH_DONE;

  2228 }

  2230 /**

  2231 * Take the rearranged start accents and tries matching. If match failed at

  2232 * a seperate following set of accents (seperated from the rearranged on by

  2233 * at least a base character) then we rearrange the preceding accents and

  2234 * tries matching again.

  2235 * We allow skipping of the ends of the accent set if the ces do not match.

  2236 * However if the failure is found before the accent set, it fails.

  2237 * Internal method, status assumed to be success, caller has to check status

  2238 * before calling this method.

  2239 * @param strsrch string search data

  2240 * @param textoffset of the ends of the rearranged accent

  2241 * @param status output error status if any

  2242 * @return USEARCH_DONE if a match is not found, otherwise return the ending

  2243 *         offset of the match. Note this start includes all following accents.

  2244 */

  2245 static

  2246 int32_t doPreviousCanonicalPrefixMatch(UStringSearch *strsrch,

  2247                                            int32_t    textoffset,

  2248                                            UErrorCode    *status)

  2249 {

  2250     const UChar       *text       = strsrch->search->text;

  2251     const UCollator   *collator   = strsrch->collator;

  2252           int32_t      safelength = 0;

  2253           UChar       *safetext;

  2254           int32_t      safetextlength;

  2255           UChar        safebuffer[INITIAL_ARRAY_SIZE_];

  2256           int32_t  safeoffset = textoffset;

  2258     if (textoffset &&

  2259         ucol_unsafeCP(strsrch->canonicalPrefixAccents[

  2260                                  u_strlen(strsrch->canonicalPrefixAccents) - 1

  2261                                          ], collator)) {

  2262         safeoffset     = getNextSafeOffset(collator, text, textoffset,

  2263                                            strsrch->search->textLength);

  2264         safelength     = safeoffset - textoffset;

  2265         safetextlength = INITIAL_ARRAY_SIZE_;

  2266         safetext       = addToUCharArray(safebuffer, &safetextlength,

  2267                                          strsrch->canonicalPrefixAccents,

  2268                                          text + textoffset, safelength,

  2269                                          NULL, status);

  2270     }

  2271     else {

  2272         safetextlength = u_strlen(strsrch->canonicalPrefixAccents);

  2273         safetext       = strsrch->canonicalPrefixAccents;

  2274     }

  2276     UCollationElements *coleiter = strsrch->utilIter;

  2277      // if status is a failure, ucol_setText does nothing

  2278     ucol_setText(coleiter, safetext, safetextlength, status);

  2279     // status checked in loop below

  2281     int32_t  *ce           = strsrch->pattern.CE;

  2282     int32_t   celength     = strsrch->pattern.CELength;

  2283     int       ceindex      = 0;

  2284     UBool     isSafe       = TRUE; // safe zone indication flag for position

  2285     int32_t   prefixlength = u_strlen(strsrch->canonicalPrefixAccents);

  2287     while (ceindex < celength) {

  2288         int32_t textce = ucol_next(coleiter, status);

  2289         if (U_FAILURE(*status)) {

  2290             if (isSafe) {

  2291                 cleanUpSafeText(strsrch, safetext, safebuffer);

  2292             }

  2293             return USEARCH_DONE;

  2294         }

  2295         if (textce == UCOL_NULLORDER) {

  2296             // check if we have passed the safe buffer

  2297             if (coleiter == strsrch->textIter) {

  2298                 cleanUpSafeText(strsrch, safetext, safebuffer);

  2299                 return USEARCH_DONE;

  2300             }

  2301             cleanUpSafeText(strsrch, safetext, safebuffer);

  2302             safetext = safebuffer;

  2303             coleiter = strsrch->textIter;

  2304             setColEIterOffset(coleiter, safeoffset);

  2305             // status checked at the start of the loop

  2306             isSafe = FALSE;

  2307             continue;

  2308         }

  2309         textce = getCE(strsrch, textce);

  2310         if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {

  2311             // do the beginning stuff

  2312             int32_t failedoffset = ucol_getOffset(coleiter);

  2313             if (isSafe && failedoffset <= prefixlength) {

  2314                 // alas... no hope. failed at rearranged accent set

  2315                 cleanUpSafeText(strsrch, safetext, safebuffer);

  2316                 return USEARCH_DONE;

  2317             }

  2318             else {

  2319                 if (isSafe) {

  2320                     failedoffset = safeoffset - failedoffset;

  2321                     cleanUpSafeText(strsrch, safetext, safebuffer);

  2322                 }

  2324                 // try rearranging the end accents

  2325                 int32_t result = doPreviousCanonicalSuffixMatch(strsrch,

  2326                                         textoffset, failedoffset, status);

  2327                 if (result != USEARCH_DONE) {

  2328                     // if status is a failure, ucol_setOffset does nothing

  2329                     setColEIterOffset(strsrch->textIter, result);

  2330                 }

  2331                 if (U_FAILURE(*status)) {

  2332                     return USEARCH_DONE;

  2333                 }

  2334                 return result;

  2335             }

  2336         }

  2337         if (textce == ce[ceindex]) {

  2338             ceindex ++;

  2339         }

  2340     }

  2341     // set offset here

  2342     if (isSafe) {

  2343         int32_t result      = ucol_getOffset(coleiter);

  2344         // sets the text iterator here with the correct expansion and offset

  2345         int32_t     leftoverces = getExpansionSuffix(coleiter);

  2346         cleanUpSafeText(strsrch, safetext, safebuffer);

  2347         if (result <= prefixlength) {

  2348             result = textoffset;

  2349         }

  2350         else {

  2351             result = textoffset + (safeoffset - result);

  2352         }

  2353         setColEIterOffset(strsrch->textIter, result);

  2354         setExpansionSuffix(strsrch->textIter, leftoverces);

  2355         return result;

  2356     }

  2358     return ucol_getOffset(coleiter);

  2359 }

  2361 /**

  2362 * Trying out the substring and sees if it can be a canonical match.

  2363 * This will try normalizing the starting accents and arranging them into

  2364 * canonical equivalents and check their corresponding ces with the pattern ce.

  2365 * Prefix accents in the text will be grouped according to their combining

  2366 * class and the groups will be mixed and matched to try find the perfect

  2367 * match with the pattern.

  2368 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"

  2369 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings

  2370 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",

  2371 *         "\u0301\u0325".

  2372 * step 2: check if any of the generated substrings matches the pattern.

  2373 * Internal method, status assumed to be success, caller has to check status

  2374 * before calling this method.

  2375 * @param strsrch string search data

  2376 * @param textoffset start offset in the collation element text that starts

  2377 *                   with the accents to be rearranged

  2378 * @param status output error status if any

  2379 * @return TRUE if the match is valid, FALSE otherwise

  2380 */

  2381 static

  2382 UBool doPreviousCanonicalMatch(UStringSearch *strsrch,

  2383                                int32_t    textoffset,

  2384                                UErrorCode    *status)

  2385 {

  2386     const UChar       *text       = strsrch->search->text;

  2387           int32_t  temp       = textoffset;

  2388           int32_t      textlength = strsrch->search->textLength;

  2389     if ((getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) == 0) {

  2390         UCollationElements *coleiter = strsrch->textIter;

  2391         int32_t         offset   = ucol_getOffset(coleiter);

  2392         if (strsrch->pattern.hasSuffixAccents) {

  2393             offset = doPreviousCanonicalSuffixMatch(strsrch, textoffset,

  2394                                                     offset, status);

  2395             if (U_SUCCESS(*status) && offset != USEARCH_DONE) {

  2396                 setColEIterOffset(coleiter, offset);

  2397                 return TRUE;

  2398             }

  2399         }

  2400         return FALSE;

  2401     }

  2403     if (!strsrch->pattern.hasPrefixAccents) {

  2404         return FALSE;

  2405     }

  2407     UChar       accents[INITIAL_ARRAY_SIZE_];

  2408     // offset to the last base character in substring to search

  2409     int32_t baseoffset = getNextBaseOffset(text, textoffset, textlength);

  2410     // normalizing the offensive string

  2411     unorm_normalize(text + textoffset, baseoffset - textoffset, UNORM_NFD,

  2412                                0, accents, INITIAL_ARRAY_SIZE_, status);

  2413     // status checked in loop

  2415     int32_t accentsindex[INITIAL_ARRAY_SIZE_];

  2416     int32_t size = getUnblockedAccentIndex(accents, accentsindex);

  2418     // 2 power n - 1 plus the full set of accents

  2419     int32_t  count = (2 << (size - 1)) - 1;

  2420     while (U_SUCCESS(*status) && count > 0) {

  2421         UChar *rearrange = strsrch->canonicalPrefixAccents;

  2422         // copy the base characters

  2423         for (int k = 0; k < accentsindex[0]; k ++) {

  2424             *rearrange ++ = accents[k];

  2425         }

  2426         // forming all possible canonical rearrangement by dropping

  2427         // sets of accents

  2428         for (int i = 0; i <= size - 1; i ++) {

  2429             int32_t mask = 1 << (size - i - 1);

  2430             if (count & mask) {

  2431                 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {

  2432                     *rearrange ++ = accents[j];

  2433                 }

  2434             }

  2435         }

  2436         *rearrange = 0;

  2437         int32_t offset = doPreviousCanonicalPrefixMatch(strsrch,

  2438                                                           baseoffset, status);

  2439         if (offset != USEARCH_DONE) {

  2440             return TRUE; // match found

  2441         }

  2442         count --;

  2443     }

  2444     return FALSE;

  2445 }

  2447 /**

  2448 * Checks match for contraction.

  2449 * If the match starts with a partial contraction we fail.

  2450 * Internal method, status assumed to be success, caller has to check status

  2451 * before calling this method.

  2452 * @param strsrch string search data

  2453 * @param start offset of potential match, to be modified if necessary

  2454 * @param end offset of potential match, to be modified if necessary

  2455 * @param status only error status if any

  2456 * @return TRUE if match passes the contraction test, FALSE otherwise

  2457 */

  2458 static

  2459 UBool checkPreviousCanonicalContractionMatch(UStringSearch *strsrch,

  2460                                      int32_t   *start,

  2461                                      int32_t   *end, UErrorCode  *status)

  2462 {

  2463           UCollationElements *coleiter   = strsrch->textIter;

  2464           int32_t             textlength = strsrch->search->textLength;

  2465           int32_t         temp       = *end;

  2466     const UCollator          *collator   = strsrch->collator;

  2467     const UChar              *text       = strsrch->search->text;

  2468     // This part checks if either if the start of the match contains potential

  2469     // contraction. If so we'll have to iterate through them

  2470     // Since we used ucol_next while previously looking for the potential

  2471     // match, this guarantees that our end will not be a partial contraction,

  2472     // or a partial supplementary character.

  2473     if (*start < textlength && ucol_unsafeCP(text[*start], collator)) {

  2474         int32_t expansion  = getExpansionSuffix(coleiter);

  2475         UBool   expandflag = expansion > 0;

  2476         setColEIterOffset(coleiter, *end);

  2477         while (expansion > 0) {

  2478             // getting rid of the redundant ce

  2479             // since forward contraction/expansion may have extra ces

  2480             // if we are in the normalization buffer, hasAccentsBeforeMatch

  2481             // would have taken care of it.

  2482             // E.g. the character \u01FA will have an expansion of 3, but if

  2483             // we are only looking for A ring A\u030A, we'll have to skip the

  2484             // last ce in the expansion buffer

  2485             ucol_previous(coleiter, status);

  2486             if (U_FAILURE(*status)) {

  2487                 return FALSE;

  2488             }

  2489             if (ucol_getOffset(coleiter) != temp) {

  2490                 *end = temp;

  2491                 temp  = ucol_getOffset(coleiter);

  2492             }

  2493             expansion --;

  2494         }

  2496         int32_t  *patternce       = strsrch->pattern.CE;

  2497         int32_t   patterncelength = strsrch->pattern.CELength;

  2498         int32_t   count           = patterncelength;

  2499         while (count > 0) {

  2500             int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));

  2501             // status checked below, note that if status is a failure

  2502             // ucol_previous returns UCOL_NULLORDER

  2503             if (ce == UCOL_IGNORABLE) {

  2504                 continue;

  2505             }

  2506             if (expandflag && count == 0 &&

  2507                 getColElemIterOffset(coleiter, FALSE) != temp) {

  2508                 *end = temp;

  2509                 temp  = ucol_getOffset(coleiter);

  2510             }

  2511             if (count == patterncelength &&

  2512                 ce != patternce[patterncelength - 1]) {

  2513                 // accents may have extra starting ces, this occurs when a

  2514                 // pure accent pattern is matched without rearrangement

  2515                 int32_t    expected = patternce[patterncelength - 1];

  2516                 U16_BACK_1(text, 0, *end);

  2517                 if (getFCD(text, end, textlength) & LAST_BYTE_MASK_) {

  2518                     ce = getCE(strsrch, ucol_previous(coleiter, status));

  2519                     while (U_SUCCESS(*status) && ce != expected &&

  2520                            ce != UCOL_NULLORDER &&

  2521                            ucol_getOffset(coleiter) <= *start) {

  2522                         ce = getCE(strsrch, ucol_previous(coleiter, status));

  2523                     }

  2524                 }

  2525             }

  2526             if (U_FAILURE(*status) || ce != patternce[count - 1]) {

  2527                 (*start) --;

  2528                 *start = getPreviousBaseOffset(text, *start);

  2529                 return FALSE;

  2530             }

  2531             count --;

  2532         }

  2533     }

  2534     return TRUE;

  2535 }

  2537 /**

  2538 * Checks and sets the match information if found.

  2539 * Checks

  2540 * <ul>

  2541 * <li> the potential match does not repeat the previous match

  2542 * <li> boundaries are correct

  2543 * <li> potential match does not end in the middle of a contraction

  2544 * <li> identical matches

  2545 * <\ul>

  2546 * Otherwise the offset will be shifted to the next character.

  2547 * Internal method, status assumed to be success, caller has to check status

  2548 * before calling this method.

  2549 * @param strsrch string search data

  2550 * @param textoffset offset in the collation element text. the returned value

  2551 *        will be the truncated start offset of the match or the new start

  2552 *        search offset.

  2553 * @param status only error status if any

  2554 * @return TRUE if the match is valid, FALSE otherwise

  2555 */

  2556 static

  2557 inline UBool checkPreviousCanonicalMatch(UStringSearch *strsrch,

  2558                                          int32_t   *textoffset,

  2559                                          UErrorCode    *status)

  2560 {

  2561     // to ensure that the start and ends are not composite characters

  2562     UCollationElements *coleiter = strsrch->textIter;

  2563     // if we have a canonical accent match

  2564     if ((strsrch->pattern.hasSuffixAccents &&

  2565         strsrch->canonicalSuffixAccents[0]) ||

  2566         (strsrch->pattern.hasPrefixAccents &&

  2567         strsrch->canonicalPrefixAccents[0])) {

  2568         strsrch->search->matchedIndex  = *textoffset;

  2569         strsrch->search->matchedLength =

  2570             getNextUStringSearchBaseOffset(strsrch,

  2571                                       getColElemIterOffset(coleiter, FALSE))

  2572             - *textoffset;

  2573         return TRUE;

  2574     }

  2576     int32_t end = ucol_getOffset(coleiter);

  2577     if (!checkPreviousCanonicalContractionMatch(strsrch, textoffset, &end,

  2578                                                 status) ||

  2579          U_FAILURE(*status)) {

  2580         return FALSE;

  2581     }

  2583     end = getNextUStringSearchBaseOffset(strsrch, end);

  2584     // this totally matches, however we need to check if it is repeating

  2585     if (checkRepeatedMatch(strsrch, *textoffset, end) ||

  2586         !isBreakUnit(strsrch, *textoffset, end) ||

  2587         !checkIdentical(strsrch, *textoffset, end)) {

  2588         (*textoffset) --;

  2589         *textoffset = getPreviousBaseOffset(strsrch->search->text,

  2590                                             *textoffset);

  2591         return FALSE;

  2592     }

  2594     strsrch->search->matchedIndex  = *textoffset;

  2595     strsrch->search->matchedLength = end - *textoffset;

  2596     return TRUE;

  2597 }

  2598 #endif // #if BOYER_MOORE

  2600 // constructors and destructor -------------------------------------------

  2602 U_CAPI UStringSearch * U_EXPORT2 usearch_open(const UChar *pattern,

  2603                                           int32_t         patternlength,

  2604                                     const UChar          *text,

  2605                                           int32_t         textlength,

  2606                                     const char           *locale,

  2607                                           UBreakIterator *breakiter,

  2608                                           UErrorCode     *status)

  2609 {

  2610     if (U_FAILURE(*status)) {

  2611         return NULL;

  2612     }

  2613 #if UCONFIG_NO_BREAK_ITERATION

  2614     if (breakiter != NULL) {

  2615         *status = U_UNSUPPORTED_ERROR;

  2616         return NULL;

  2617     }

  2618 #endif

  2619     if (locale) {

  2620         // ucol_open internally checks for status

  2621         UCollator     *collator = ucol_open(locale, status);

  2622         // pattern, text checks are done in usearch_openFromCollator

  2623         UStringSearch *result   = usearch_openFromCollator(pattern,

  2624                                               patternlength, text, textlength,

  2625                                               collator, breakiter, status);

  2627         if (result == NULL || U_FAILURE(*status)) {

  2628             if (collator) {

  2629                 ucol_close(collator);

  2630             }

  2631             return NULL;

  2632         }

  2633         else {

  2634             result->ownCollator = TRUE;

  2635         }

  2636         return result;

  2637     }

  2638     *status = U_ILLEGAL_ARGUMENT_ERROR;

  2639     return NULL;

  2640 }

  2642 U_CAPI UStringSearch * U_EXPORT2 usearch_openFromCollator(

  2643                                   const UChar          *pattern,

  2644                                         int32_t         patternlength,

  2645                                   const UChar          *text,

  2646                                         int32_t         textlength,

  2647                                   const UCollator      *collator,

  2648                                         UBreakIterator *breakiter,

  2649                                         UErrorCode     *status)

  2650 {

  2651     if (U_FAILURE(*status)) {

  2652         return NULL;

  2653     }

  2654 #if UCONFIG_NO_BREAK_ITERATION

  2655     if (breakiter != NULL) {

  2656         *status = U_UNSUPPORTED_ERROR;

  2657         return NULL;

  2658     }

  2659 #endif

  2660     if (pattern == NULL || text == NULL || collator == NULL) {

  2661         *status = U_ILLEGAL_ARGUMENT_ERROR;

  2662         return NULL;

  2663     }

  2665     // string search does not really work when numeric collation is turned on

  2666     if(ucol_getAttribute(collator, UCOL_NUMERIC_COLLATION, status) == UCOL_ON) {

  2667         *status = U_UNSUPPORTED_ERROR;

  2668         return NULL;

  2669     }

  2671     if (U_SUCCESS(*status)) {

  2672         initializeFCD(status);

  2673         if (U_FAILURE(*status)) {

  2674             return NULL;

  2675         }

  2677         UStringSearch *result;

  2678         if (textlength == -1) {

  2679             textlength = u_strlen(text);

  2680         }

  2681         if (patternlength == -1) {

  2682             patternlength = u_strlen(pattern);

  2683         }

  2684         if (textlength <= 0 || patternlength <= 0) {

  2685             *status = U_ILLEGAL_ARGUMENT_ERROR;

  2686             return NULL;

  2687         }

  2689         result = (UStringSearch *)uprv_malloc(sizeof(UStringSearch));

  2690         if (result == NULL) {

  2691             *status = U_MEMORY_ALLOCATION_ERROR;

  2692             return NULL;

  2693         }

  2695         result->collator    = collator;

  2696         result->strength    = ucol_getStrength(collator);

  2697         result->ceMask      = getMask(result->strength);

  2698         result->toShift     =

  2699              ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==

  2700                                                             UCOL_SHIFTED;

  2701         result->variableTop = ucol_getVariableTop(collator, status);

  2703         result->nfd         = Normalizer2Factory::getNFDInstance(*status);

  2705         if (U_FAILURE(*status)) {

  2706             uprv_free(result);

  2707             return NULL;

  2708         }

  2710         result->search             = (USearch *)uprv_malloc(sizeof(USearch));

  2711         if (result->search == NULL) {

  2712             *status = U_MEMORY_ALLOCATION_ERROR;

  2713             uprv_free(result);

  2714             return NULL;

  2715         }

  2717         result->search->text       = text;

  2718         result->search->textLength = textlength;

  2720         result->pattern.text       = pattern;

  2721         result->pattern.textLength = patternlength;

  2722         result->pattern.CE         = NULL;

  2723         result->pattern.PCE        = NULL;

  2725         result->search->breakIter  = breakiter;

  2726 #if !UCONFIG_NO_BREAK_ITERATION

  2727         result->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(result->collator, ULOC_VALID_LOCALE, status), text, textlength, status);

  2728         if (breakiter) {

  2729             ubrk_setText(breakiter, text, textlength, status);

  2730         }

  2731 #endif

  2733         result->ownCollator           = FALSE;

  2734         result->search->matchedLength = 0;

  2735         result->search->matchedIndex  = USEARCH_DONE;

  2736         result->utilIter              = NULL;

  2737         result->textIter              = ucol_openElements(collator, text,

  2738                                                           textlength, status);

  2739         if (U_FAILURE(*status)) {

  2740             usearch_close(result);

  2741             return NULL;

  2742         }

  2744         result->search->isOverlap          = FALSE;

  2745         result->search->isCanonicalMatch   = FALSE;

  2746         result->search->elementComparisonType = 0;

  2747         result->search->isForwardSearching = TRUE;

  2748         result->search->reset              = TRUE;

  2750         initialize(result, status);

  2752         if (U_FAILURE(*status)) {

  2753             usearch_close(result);

  2754             return NULL;

  2755         }

  2757         return result;

  2758     }

  2759     return NULL;

  2760 }

  2762 U_CAPI void U_EXPORT2 usearch_close(UStringSearch *strsrch)

  2763 {

  2764     if (strsrch) {

  2765         if (strsrch->pattern.CE != strsrch->pattern.CEBuffer &&

  2766             strsrch->pattern.CE) {

  2767             uprv_free(strsrch->pattern.CE);

  2768         }

  2770         if (strsrch->pattern.PCE != NULL &&

  2771             strsrch->pattern.PCE != strsrch->pattern.PCEBuffer) {

  2772             uprv_free(strsrch->pattern.PCE);

  2773         }

  2775         ucol_closeElements(strsrch->textIter);

  2776         ucol_closeElements(strsrch->utilIter);

  2778         if (strsrch->ownCollator && strsrch->collator) {

  2779             ucol_close((UCollator *)strsrch->collator);

  2780         }

  2782 #if !UCONFIG_NO_BREAK_ITERATION

  2783         if (strsrch->search->internalBreakIter) {

  2784             ubrk_close(strsrch->search->internalBreakIter);

  2785         }

  2786 #endif

  2788         uprv_free(strsrch->search);

  2789         uprv_free(strsrch);

  2790     }

  2791 }

  2793 // set and get methods --------------------------------------------------

  2795 U_CAPI void U_EXPORT2 usearch_setOffset(UStringSearch *strsrch,

  2796                                         int32_t    position,

  2797                                         UErrorCode    *status)

  2798 {

  2799     if (U_SUCCESS(*status) && strsrch) {

  2800         if (isOutOfBounds(strsrch->search->textLength, position)) {

  2801             *status = U_INDEX_OUTOFBOUNDS_ERROR;

  2802         }

  2803         else {

  2804             setColEIterOffset(strsrch->textIter, position);

  2805         }

  2806         strsrch->search->matchedIndex  = USEARCH_DONE;

  2807         strsrch->search->matchedLength = 0;

  2808         strsrch->search->reset         = FALSE;

  2809     }

  2810 }

  2812 U_CAPI int32_t U_EXPORT2 usearch_getOffset(const UStringSearch *strsrch)

  2813 {

  2814     if (strsrch) {

  2815         int32_t result = ucol_getOffset(strsrch->textIter);

  2816         if (isOutOfBounds(strsrch->search->textLength, result)) {

  2817             return USEARCH_DONE;

  2818         }

  2819         return result;

  2820     }

  2821     return USEARCH_DONE;

  2822 }

  2824 U_CAPI void U_EXPORT2 usearch_setAttribute(UStringSearch *strsrch,

  2825                                  USearchAttribute attribute,

  2826                                  USearchAttributeValue value,

  2827                                  UErrorCode *status)

  2828 {

  2829     if (U_SUCCESS(*status) && strsrch) {

  2830         switch (attribute)

  2831         {

  2832         case USEARCH_OVERLAP :

  2833             strsrch->search->isOverlap = (value == USEARCH_ON ? TRUE : FALSE);

  2834             break;

  2835         case USEARCH_CANONICAL_MATCH :

  2836             strsrch->search->isCanonicalMatch = (value == USEARCH_ON ? TRUE :

  2837                                                                       FALSE);

  2838             break;

  2839         case USEARCH_ELEMENT_COMPARISON :

  2840             if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) {

  2841                 strsrch->search->elementComparisonType = (int16_t)value;

  2842             } else {

  2843                 strsrch->search->elementComparisonType = 0;

  2844             }

  2845             break;

  2846         case USEARCH_ATTRIBUTE_COUNT :

  2847         default:

  2848             *status = U_ILLEGAL_ARGUMENT_ERROR;

  2849         }

  2850     }

  2851     if (value == USEARCH_ATTRIBUTE_VALUE_COUNT) {

  2852         *status = U_ILLEGAL_ARGUMENT_ERROR;

  2853     }

  2854 }

  2856 U_CAPI USearchAttributeValue U_EXPORT2 usearch_getAttribute(

  2857                                                 const UStringSearch *strsrch,

  2858                                                 USearchAttribute attribute)

  2859 {

  2860     if (strsrch) {

  2861         switch (attribute) {

  2862         case USEARCH_OVERLAP :

  2863             return (strsrch->search->isOverlap == TRUE ? USEARCH_ON :

  2864                                                         USEARCH_OFF);

  2865         case USEARCH_CANONICAL_MATCH :

  2866             return (strsrch->search->isCanonicalMatch == TRUE ? USEARCH_ON :

  2867                                                                USEARCH_OFF);

  2868         case USEARCH_ELEMENT_COMPARISON :

  2869             {

  2870                 int16_t value = strsrch->search->elementComparisonType;

  2871                 if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) {

  2872                     return (USearchAttributeValue)value;

  2873                 } else {

  2874                     return USEARCH_STANDARD_ELEMENT_COMPARISON;

  2875                 }

  2876             }

  2877         case USEARCH_ATTRIBUTE_COUNT :

  2878             return USEARCH_DEFAULT;

  2879         }

  2880     }

  2881     return USEARCH_DEFAULT;

  2882 }

  2884 U_CAPI int32_t U_EXPORT2 usearch_getMatchedStart(

  2885                                                 const UStringSearch *strsrch)

  2886 {

  2887     if (strsrch == NULL) {

  2888         return USEARCH_DONE;

  2889     }

  2890     return strsrch->search->matchedIndex;

  2891 }

  2894 U_CAPI int32_t U_EXPORT2 usearch_getMatchedText(const UStringSearch *strsrch,

  2895                                             UChar         *result,

  2896                                             int32_t        resultCapacity,

  2897                                             UErrorCode    *status)

  2898 {

  2899     if (U_FAILURE(*status)) {

  2900         return USEARCH_DONE;

  2901     }

  2902     if (strsrch == NULL || resultCapacity < 0 || (resultCapacity > 0 &&

  2903         result == NULL)) {

  2904         *status = U_ILLEGAL_ARGUMENT_ERROR;

  2905         return USEARCH_DONE;

  2906     }

  2908     int32_t     copylength = strsrch->search->matchedLength;

  2909     int32_t copyindex  = strsrch->search->matchedIndex;

  2910     if (copyindex == USEARCH_DONE) {

  2911         u_terminateUChars(result, resultCapacity, 0, status);

  2912         return USEARCH_DONE;

  2913     }

  2915     if (resultCapacity < copylength) {

  2916         copylength = resultCapacity;

  2917     }

  2918     if (copylength > 0) {

  2919         uprv_memcpy(result, strsrch->search->text + copyindex,

  2920                     copylength * sizeof(UChar));

  2921     }

  2922     return u_terminateUChars(result, resultCapacity,

  2923                              strsrch->search->matchedLength, status);

  2924 }

  2926 U_CAPI int32_t U_EXPORT2 usearch_getMatchedLength(

  2927                                               const UStringSearch *strsrch)

  2928 {

  2929     if (strsrch) {

  2930         return strsrch->search->matchedLength;

  2931     }

  2932     return USEARCH_DONE;

  2933 }

  2935 #if !UCONFIG_NO_BREAK_ITERATION

  2937 U_CAPI void U_EXPORT2 usearch_setBreakIterator(UStringSearch  *strsrch,

  2938                                                UBreakIterator *breakiter,

  2939                                                UErrorCode     *status)

  2940 {

  2941     if (U_SUCCESS(*status) && strsrch) {

  2942         strsrch->search->breakIter = breakiter;

  2943         if (breakiter) {

  2944             ubrk_setText(breakiter, strsrch->search->text,

  2945                          strsrch->search->textLength, status);

  2946         }

  2947     }

  2948 }

  2950 U_CAPI const UBreakIterator* U_EXPORT2

  2951 usearch_getBreakIterator(const UStringSearch *strsrch)

  2952 {

  2953     if (strsrch) {

  2954         return strsrch->search->breakIter;

  2955     }

  2956     return NULL;

  2957 }

  2959 #endif

  2961 U_CAPI void U_EXPORT2 usearch_setText(      UStringSearch *strsrch,

  2962                                       const UChar         *text,

  2963                                             int32_t        textlength,

  2964                                             UErrorCode    *status)

  2965 {

  2966     if (U_SUCCESS(*status)) {

  2967         if (strsrch == NULL || text == NULL || textlength < -1 ||

  2968             textlength == 0) {

  2969             *status = U_ILLEGAL_ARGUMENT_ERROR;

  2970         }

  2971         else {

  2972             if (textlength == -1) {

  2973                 textlength = u_strlen(text);

  2974             }

  2975             strsrch->search->text       = text;

  2976             strsrch->search->textLength = textlength;

  2977             ucol_setText(strsrch->textIter, text, textlength, status);

  2978             strsrch->search->matchedIndex  = USEARCH_DONE;

  2979             strsrch->search->matchedLength = 0;

  2980             strsrch->search->reset         = TRUE;

  2981 #if !UCONFIG_NO_BREAK_ITERATION

  2982             if (strsrch->search->breakIter != NULL) {

  2983                 ubrk_setText(strsrch->search->breakIter, text,

  2984                              textlength, status);

  2985             }

  2986             ubrk_setText(strsrch->search->internalBreakIter, text, textlength, status);

  2987 #endif

  2988         }

  2989     }

  2990 }

  2992 U_CAPI const UChar * U_EXPORT2 usearch_getText(const UStringSearch *strsrch,

  2993                                                      int32_t       *length)

  2994 {

  2995     if (strsrch) {

  2996         *length = strsrch->search->textLength;

  2997         return strsrch->search->text;

  2998     }

  2999     return NULL;

  3000 }

  3002 U_CAPI void U_EXPORT2 usearch_setCollator(      UStringSearch *strsrch,

  3003                                           const UCollator     *collator,

  3004                                                 UErrorCode    *status)

  3005 {

  3006     if (U_SUCCESS(*status)) {

  3007         if (collator == NULL) {

  3008             *status = U_ILLEGAL_ARGUMENT_ERROR;

  3009             return;

  3010         }

  3012         if (strsrch) {

  3013             if (strsrch->ownCollator && (strsrch->collator != collator)) {

  3014                 ucol_close((UCollator *)strsrch->collator);

  3015                 strsrch->ownCollator = FALSE;

  3016             }

  3017             strsrch->collator    = collator;

  3018             strsrch->strength    = ucol_getStrength(collator);

  3019             strsrch->ceMask      = getMask(strsrch->strength);

  3020 #if !UCONFIG_NO_BREAK_ITERATION

  3021             ubrk_close(strsrch->search->internalBreakIter);

  3022             strsrch->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(collator, ULOC_VALID_LOCALE, status),

  3023                                                      strsrch->search->text, strsrch->search->textLength, status);

  3024 #endif

  3025             // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT

  3026             strsrch->toShift     =

  3027                ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==

  3028                                                                 UCOL_SHIFTED;

  3029             // if status is a failure, ucol_getVariableTop returns 0

  3030             strsrch->variableTop = ucol_getVariableTop(collator, status);

  3031             if (U_SUCCESS(*status)) {

  3032                 initialize(strsrch, status);

  3033                 if (U_SUCCESS(*status)) {

  3034                     /* free offset buffer to avoid memory leak before initializing. */

  3035                     ucol_freeOffsetBuffer(&(strsrch->textIter->iteratordata_));

  3036                     uprv_init_collIterate(collator, strsrch->search->text,

  3037                                           strsrch->search->textLength,

  3038                                           &(strsrch->textIter->iteratordata_),

  3039                                           status);

  3040                     strsrch->utilIter->iteratordata_.coll = collator;

  3041                 }

  3042             }

  3043         }

  3045         // **** are these calls needed?

  3046         // **** we call uprv_init_pce in initializePatternPCETable

  3047         // **** and the CEBuffer constructor...

  3048 #if 0

  3049         uprv_init_pce(strsrch->textIter);

  3050         uprv_init_pce(strsrch->utilIter);

  3051 #endif

  3052     }

  3053 }

  3055 U_CAPI UCollator * U_EXPORT2 usearch_getCollator(const UStringSearch *strsrch)

  3056 {

  3057     if (strsrch) {

  3058         return (UCollator *)strsrch->collator;

  3059     }

  3060     return NULL;

  3061 }

  3063 U_CAPI void U_EXPORT2 usearch_setPattern(      UStringSearch *strsrch,

  3064                                          const UChar         *pattern,

  3065                                                int32_t        patternlength,

  3066                                                UErrorCode    *status)

  3067 {

  3068     if (U_SUCCESS(*status)) {

  3069         if (strsrch == NULL || pattern == NULL) {

  3070             *status = U_ILLEGAL_ARGUMENT_ERROR;

  3071         }

  3072         else {

  3073             if (patternlength == -1) {

  3074                 patternlength = u_strlen(pattern);

  3075             }

  3076             if (patternlength == 0) {

  3077                 *status = U_ILLEGAL_ARGUMENT_ERROR;

  3078                 return;

  3079             }

  3080             strsrch->pattern.text       = pattern;

  3081             strsrch->pattern.textLength = patternlength;

  3082             initialize(strsrch, status);

  3083         }

  3084     }

  3085 }

  3087 U_CAPI const UChar* U_EXPORT2

  3088 usearch_getPattern(const UStringSearch *strsrch,

  3089                    int32_t       *length)

  3090 {

  3091     if (strsrch) {

  3092         *length = strsrch->pattern.textLength;

  3093         return strsrch->pattern.text;

  3094     }

  3095     return NULL;

  3096 }

  3098 // miscellanous methods --------------------------------------------------

  3100 U_CAPI int32_t U_EXPORT2 usearch_first(UStringSearch *strsrch,

  3101                                            UErrorCode    *status)

  3102 {

  3103     if (strsrch && U_SUCCESS(*status)) {

  3104         strsrch->search->isForwardSearching = TRUE;

  3105         usearch_setOffset(strsrch, 0, status);

  3106         if (U_SUCCESS(*status)) {

  3107             return usearch_next(strsrch, status);

  3108         }

  3109     }

  3110     return USEARCH_DONE;

  3111 }

  3113 U_CAPI int32_t U_EXPORT2 usearch_following(UStringSearch *strsrch,

  3114                                                int32_t    position,

  3115                                                UErrorCode    *status)

  3116 {

  3117     if (strsrch && U_SUCCESS(*status)) {

  3118         strsrch->search->isForwardSearching = TRUE;

  3119         // position checked in usearch_setOffset

  3120         usearch_setOffset(strsrch, position, status);

  3121         if (U_SUCCESS(*status)) {

  3122             return usearch_next(strsrch, status);

  3123         }

  3124     }

  3125     return USEARCH_DONE;

  3126 }

  3128 U_CAPI int32_t U_EXPORT2 usearch_last(UStringSearch *strsrch,

  3129                                           UErrorCode    *status)

  3130 {

  3131     if (strsrch && U_SUCCESS(*status)) {

  3132         strsrch->search->isForwardSearching = FALSE;

  3133         usearch_setOffset(strsrch, strsrch->search->textLength, status);

  3134         if (U_SUCCESS(*status)) {

  3135             return usearch_previous(strsrch, status);

  3136         }

  3137     }

  3138     return USEARCH_DONE;

  3139 }

  3141 U_CAPI int32_t U_EXPORT2 usearch_preceding(UStringSearch *strsrch,

  3142                                                int32_t    position,

  3143                                                UErrorCode    *status)

  3144 {

  3145     if (strsrch && U_SUCCESS(*status)) {

  3146         strsrch->search->isForwardSearching = FALSE;

  3147         // position checked in usearch_setOffset

  3148         usearch_setOffset(strsrch, position, status);

  3149         if (U_SUCCESS(*status)) {

  3150             return usearch_previous(strsrch, status);

  3151         }

  3152     }

  3153     return USEARCH_DONE;

  3154 }

  3156 /**

  3157 * If a direction switch is required, we'll count the number of ces till the

  3158 * beginning of the collation element iterator and iterate forwards that

  3159 * number of times. This is so that we get to the correct point within the

  3160 * string to continue the search in. Imagine when we are in the middle of the

  3161 * normalization buffer when the change in direction is request. arrrgghh....

  3162 * After searching the offset within the collation element iterator will be

  3163 * shifted to the start of the match. If a match is not found, the offset would

  3164 * have been set to the end of the text string in the collation element

  3165 * iterator.

  3166 * Okay, here's my take on normalization buffer. The only time when there can

  3167 * be 2 matches within the same normalization is when the pattern is consists

  3168 * of all accents. But since the offset returned is from the text string, we

  3169 * should not confuse the caller by returning the second match within the

  3170 * same normalization buffer. If we do, the 2 results will have the same match

  3171 * offsets, and that'll be confusing. I'll return the next match that doesn't

  3172 * fall within the same normalization buffer. Note this does not affect the

  3173 * results of matches spanning the text and the normalization buffer.

  3174 * The position to start searching is taken from the collation element

  3175 * iterator. Callers of this API would have to set the offset in the collation

  3176 * element iterator before using this method.

  3177 */

  3178 U_CAPI int32_t U_EXPORT2 usearch_next(UStringSearch *strsrch,

  3179                                           UErrorCode    *status)

  3180 {

  3181     if (U_SUCCESS(*status) && strsrch) {

  3182         // note offset is either equivalent to the start of the previous match

  3183         // or is set by the user

  3184         int32_t      offset       = usearch_getOffset(strsrch);

  3185         USearch     *search       = strsrch->search;

  3186         search->reset             = FALSE;

  3187         int32_t      textlength   = search->textLength;

  3188         if (search->isForwardSearching) {

  3189 #if BOYER_MOORE

  3190             if (offset == textlength

  3191                 || (!search->isOverlap &&

  3192                     (offset + strsrch->pattern.defaultShiftSize > textlength ||

  3193                     (search->matchedIndex != USEARCH_DONE &&

  3194                      offset + search->matchedLength >= textlength)))) {

  3195                 // not enough characters to match

  3196                 setMatchNotFound(strsrch);

  3197                 return USEARCH_DONE;

  3198             }

  3199 #else

  3200             if (offset == textlength ||

  3201                 (! search->isOverlap &&

  3202                 (search->matchedIndex != USEARCH_DONE &&

  3203                 offset + search->matchedLength > textlength))) {

  3204                     // not enough characters to match

  3205                     setMatchNotFound(strsrch);

  3206                     return USEARCH_DONE;

  3207             }

  3208 #endif

  3209         }

  3210         else {

  3211             // switching direction.

  3212             // if matchedIndex == USEARCH_DONE, it means that either a

  3213             // setOffset has been called or that previous ran off the text

  3214             // string. the iterator would have been set to offset 0 if a

  3215             // match is not found.

  3216             search->isForwardSearching = TRUE;

  3217             if (search->matchedIndex != USEARCH_DONE) {

  3218                 // there's no need to set the collation element iterator

  3219                 // the next call to next will set the offset.

  3220                 return search->matchedIndex;

  3221             }

  3222         }

  3224         if (U_SUCCESS(*status)) {

  3225             if (strsrch->pattern.CELength == 0) {

  3226                 if (search->matchedIndex == USEARCH_DONE) {

  3227                     search->matchedIndex = offset;

  3228                 }

  3229                 else { // moves by codepoints

  3230                     U16_FWD_1(search->text, search->matchedIndex, textlength);

  3231                 }

  3233                 search->matchedLength = 0;

  3234                 setColEIterOffset(strsrch->textIter, search->matchedIndex);

  3235                 // status checked below

  3236                 if (search->matchedIndex == textlength) {

  3237                     search->matchedIndex = USEARCH_DONE;

  3238                 }

  3239             }

  3240             else {

  3241                 if (search->matchedLength > 0) {

  3242                     // if matchlength is 0 we are at the start of the iteration

  3243                     if (search->isOverlap) {

  3244                         ucol_setOffset(strsrch->textIter, offset + 1, status);

  3245                     }

  3246                     else {

  3247                         ucol_setOffset(strsrch->textIter,

  3248                                        offset + search->matchedLength, status);

  3249                     }

  3250                 }

  3251                 else {

  3252                     // for boundary check purposes. this will ensure that the

  3253                     // next match will not preceed the current offset

  3254                     // note search->matchedIndex will always be set to something

  3255                     // in the code

  3256                     search->matchedIndex = offset - 1;

  3257                 }

  3259                 if (search->isCanonicalMatch) {

  3260                     // can't use exact here since extra accents are allowed.

  3261                     usearch_handleNextCanonical(strsrch, status);

  3262                 }

  3263                 else {

  3264                     usearch_handleNextExact(strsrch, status);

  3265                 }

  3266             }

  3268             if (U_FAILURE(*status)) {

  3269                 return USEARCH_DONE;

  3270             }

  3272 #if !BOYER_MOORE

  3273             if (search->matchedIndex == USEARCH_DONE) {

  3274                 ucol_setOffset(strsrch->textIter, search->textLength, status);

  3275             } else {

  3276                 ucol_setOffset(strsrch->textIter, search->matchedIndex, status);

  3277             }

  3278 #endif

  3280             return search->matchedIndex;

  3281         }

  3282     }

  3283     return USEARCH_DONE;

  3284 }

  3286 U_CAPI int32_t U_EXPORT2 usearch_previous(UStringSearch *strsrch,

  3287                                               UErrorCode *status)

  3288 {

  3289     if (U_SUCCESS(*status) && strsrch) {

  3290         int32_t offset;

  3291         USearch *search = strsrch->search;

  3292         if (search->reset) {

  3293             offset                     = search->textLength;

  3294             search->isForwardSearching = FALSE;

  3295             search->reset              = FALSE;

  3296             setColEIterOffset(strsrch->textIter, offset);

  3297         }

  3298         else {

  3299             offset = usearch_getOffset(strsrch);

  3300         }

  3302         int32_t matchedindex = search->matchedIndex;

  3303         if (search->isForwardSearching == TRUE) {

  3304             // switching direction.

  3305             // if matchedIndex == USEARCH_DONE, it means that either a

  3306             // setOffset has been called or that next ran off the text

  3307             // string. the iterator would have been set to offset textLength if

  3308             // a match is not found.

  3309             search->isForwardSearching = FALSE;

  3310             if (matchedindex != USEARCH_DONE) {

  3311                 return matchedindex;

  3312             }

  3313         }

  3314         else {

  3315 #if BOYER_MOORE

  3316             if (offset == 0 || matchedindex == 0 ||

  3317                 (!search->isOverlap &&

  3318                     (offset < strsrch->pattern.defaultShiftSize ||

  3319                     (matchedindex != USEARCH_DONE &&

  3320                     matchedindex < strsrch->pattern.defaultShiftSize)))) {

  3321                 // not enough characters to match

  3322                 setMatchNotFound(strsrch);

  3323                 return USEARCH_DONE;

  3324             }

  3325 #else

  3326             // Could check pattern length, but the

  3327             // linear search will do the right thing

  3328             if (offset == 0 || matchedindex == 0) {

  3329                 setMatchNotFound(strsrch);

  3330                 return USEARCH_DONE;

  3331             }

  3332 #endif

  3333         }

  3335         if (U_SUCCESS(*status)) {

  3336             if (strsrch->pattern.CELength == 0) {

  3337                 search->matchedIndex =

  3338                       (matchedindex == USEARCH_DONE ? offset : matchedindex);

  3339                 if (search->matchedIndex == 0) {

  3340                     setMatchNotFound(strsrch);

  3341                     // status checked below

  3342                 }

  3343                 else { // move by codepoints

  3344                     U16_BACK_1(search->text, 0, search->matchedIndex);

  3345                     setColEIterOffset(strsrch->textIter, search->matchedIndex);

  3346                     // status checked below

  3347                     search->matchedLength = 0;

  3348                 }

  3349             }

  3350             else {

  3351                 if (strsrch->search->isCanonicalMatch) {

  3352                     // can't use exact here since extra accents are allowed.

  3353                     usearch_handlePreviousCanonical(strsrch, status);

  3354                     // status checked below

  3355                 }

  3356                 else {

  3357                     usearch_handlePreviousExact(strsrch, status);

  3358                     // status checked below

  3359                 }

  3360             }

  3362             if (U_FAILURE(*status)) {

  3363                 return USEARCH_DONE;

  3364             }

  3366             return search->matchedIndex;

  3367         }

  3368     }

  3369     return USEARCH_DONE;

  3370 }

  3374 U_CAPI void U_EXPORT2 usearch_reset(UStringSearch *strsrch)

  3375 {

  3376     /*

  3377     reset is setting the attributes that are already in

  3378     string search, hence all attributes in the collator should

  3379     be retrieved without any problems

  3380     */

  3381     if (strsrch) {

  3382         UErrorCode status            = U_ZERO_ERROR;

  3383         UBool      sameCollAttribute = TRUE;

  3384         uint32_t   ceMask;

  3385         UBool      shift;

  3386         uint32_t   varTop;

  3388         // **** hack to deal w/ how processed CEs encode quaternary ****

  3389         UCollationStrength newStrength = ucol_getStrength(strsrch->collator);

  3390         if ((strsrch->strength < UCOL_QUATERNARY && newStrength >= UCOL_QUATERNARY) ||

  3391             (strsrch->strength >= UCOL_QUATERNARY && newStrength < UCOL_QUATERNARY)) {

  3392                 sameCollAttribute = FALSE;

  3393         }

  3395         strsrch->strength    = ucol_getStrength(strsrch->collator);

  3396         ceMask = getMask(strsrch->strength);

  3397         if (strsrch->ceMask != ceMask) {

  3398             strsrch->ceMask = ceMask;

  3399             sameCollAttribute = FALSE;

  3400         }

  3402         // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT

  3403         shift = ucol_getAttribute(strsrch->collator, UCOL_ALTERNATE_HANDLING,

  3404                                   &status) == UCOL_SHIFTED;

  3405         if (strsrch->toShift != shift) {

  3406             strsrch->toShift  = shift;

  3407             sameCollAttribute = FALSE;

  3408         }

  3410         // if status is a failure, ucol_getVariableTop returns 0

  3411         varTop = ucol_getVariableTop(strsrch->collator, &status);

  3412         if (strsrch->variableTop != varTop) {

  3413             strsrch->variableTop = varTop;

  3414             sameCollAttribute    = FALSE;

  3415         }

  3416         if (!sameCollAttribute) {

  3417             initialize(strsrch, &status);

  3418         }

  3419         /* free offset buffer to avoid memory leak before initializing. */

  3420         ucol_freeOffsetBuffer(&(strsrch->textIter->iteratordata_));

  3421         uprv_init_collIterate(strsrch->collator, strsrch->search->text,

  3422                               strsrch->search->textLength,

  3423                               &(strsrch->textIter->iteratordata_),

  3424                               &status);

  3425         strsrch->search->matchedLength      = 0;

  3426         strsrch->search->matchedIndex       = USEARCH_DONE;

  3427         strsrch->search->isOverlap          = FALSE;

  3428         strsrch->search->isCanonicalMatch   = FALSE;

  3429         strsrch->search->elementComparisonType = 0;

  3430         strsrch->search->isForwardSearching = TRUE;

  3431         strsrch->search->reset              = TRUE;

  3432     }

  3433 }

  3435 //

  3436 //  CEI  Collation Element + source text index.

  3437 //       These structs are kept in the circular buffer.

  3438 //

  3439 struct  CEI {

  3440     int64_t ce;

  3441     int32_t lowIndex;

  3442     int32_t highIndex;

  3443 };

  3445 U_NAMESPACE_BEGIN

  3448 //

  3449 //  CEBuffer   A circular buffer of CEs from the text being searched.

  3450 //

  3451 #define   DEFAULT_CEBUFFER_SIZE 96

  3452 #define   CEBUFFER_EXTRA 32

  3453 // Some typical max values to make buffer size more reasonable for asymmetric search.

  3454 // #8694 is for a better long-term solution to allocation of this buffer.

  3455 #define   MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L 8

  3456 #define   MAX_TARGET_IGNORABLES_PER_PAT_OTHER 3

  3457 #define   MIGHT_BE_JAMO_L(c) ((c >= 0x1100 && c <= 0x115E) || (c >= 0x3131 && c <= 0x314E) || (c >= 0x3165 && c <= 0x3186))

  3458 struct CEBuffer {

  3459     CEI                  defBuf[DEFAULT_CEBUFFER_SIZE];

  3460     CEI                 *buf;

  3461     int32_t              bufSize;

  3462     int32_t              firstIx;

  3463     int32_t              limitIx;

  3464     UCollationElements  *ceIter;

  3465     UStringSearch       *strSearch;

  3469                CEBuffer(UStringSearch *ss, UErrorCode *status);

  3470                ~CEBuffer();

  3471    const CEI   *get(int32_t index);

  3472    const CEI   *getPrevious(int32_t index);

  3473 };

  3476 CEBuffer::CEBuffer(UStringSearch *ss, UErrorCode *status) {

  3477     buf = defBuf;

  3478     strSearch = ss;

  3479     bufSize = ss->pattern.PCELength + CEBUFFER_EXTRA;

  3480     if (ss->search->elementComparisonType != 0) {

  3481         const UChar * patText = ss->pattern.text;

  3482         if (patText) {

  3483             const UChar * patTextLimit = patText + ss->pattern.textLength;

  3484             while ( patText < patTextLimit ) {

  3485                 UChar c = *patText++;

  3486                 if (MIGHT_BE_JAMO_L(c)) {

  3487                     bufSize += MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L;

  3488                 } else {

  3489                     // No check for surrogates, we might allocate slightly more buffer than necessary.

  3490                     bufSize += MAX_TARGET_IGNORABLES_PER_PAT_OTHER;

  3491                 }

  3492             }

  3493         }

  3494     }

  3495     ceIter    = ss->textIter;

  3496     firstIx = 0;

  3497     limitIx = 0;

  3499     uprv_init_pce(ceIter);

  3501     if (bufSize>DEFAULT_CEBUFFER_SIZE) {

  3502         buf = (CEI *)uprv_malloc(bufSize * sizeof(CEI));

  3503         if (buf == NULL) {

  3504             *status = U_MEMORY_ALLOCATION_ERROR;

  3505         }

  3506     }

  3507 }

  3509 // TODO: add a reset or init function so that allocated

  3510 //       buffers can be retained & reused.

  3512 CEBuffer::~CEBuffer() {

  3513     if (buf != defBuf) {

  3514         uprv_free(buf);

  3515     }

  3516 }

  3519 // Get the CE with the specified index.

  3520 //   Index must be in the range

  3521 //          n-history_size < index < n+1

  3522 //   where n is the largest index to have been fetched by some previous call to this function.

  3523 //   The CE value will be UCOL__PROCESSED_NULLORDER at end of input.

  3524 //

  3525 const CEI *CEBuffer::get(int32_t index) {

  3526     int i = index % bufSize;

  3528     if (index>=firstIx && index<limitIx) {

  3529         // The request was for an entry already in our buffer.

  3530         //  Just return it.

  3531         return &buf[i];

  3532     }

  3534     // Caller is requesting a new, never accessed before, CE.

  3535     //   Verify that it is the next one in sequence, which is all

  3536     //   that is allowed.

  3537     if (index != limitIx) {

  3538         U_ASSERT(FALSE);

  3540         return NULL;

  3541     }

  3543     // Manage the circular CE buffer indexing

  3544     limitIx++;

  3546     if (limitIx - firstIx >= bufSize) {

  3547         // The buffer is full, knock out the lowest-indexed entry.

  3548         firstIx++;

  3549     }

  3551     UErrorCode status = U_ZERO_ERROR;

  3553     buf[i].ce = ucol_nextProcessed(ceIter, &buf[i].lowIndex, &buf[i].highIndex, &status);

  3555     return &buf[i];

  3556 }

  3558 // Get the CE with the specified index.

  3559 //   Index must be in the range

  3560 //          n-history_size < index < n+1

  3561 //   where n is the largest index to have been fetched by some previous call to this function.

  3562 //   The CE value will be UCOL__PROCESSED_NULLORDER at end of input.

  3563 //

  3564 const CEI *CEBuffer::getPrevious(int32_t index) {

  3565     int i = index % bufSize;

  3567     if (index>=firstIx && index<limitIx) {

  3568         // The request was for an entry already in our buffer.

  3569         //  Just return it.

  3570         return &buf[i];

  3571     }

  3573     // Caller is requesting a new, never accessed before, CE.

  3574     //   Verify that it is the next one in sequence, which is all

  3575     //   that is allowed.

  3576     if (index != limitIx) {

  3577         U_ASSERT(FALSE);

  3579         return NULL;

  3580     }

  3582     // Manage the circular CE buffer indexing

  3583     limitIx++;

  3585     if (limitIx - firstIx >= bufSize) {

  3586         // The buffer is full, knock out the lowest-indexed entry.

  3587         firstIx++;

  3588     }

  3590     UErrorCode status = U_ZERO_ERROR;

  3592     buf[i].ce = ucol_previousProcessed(ceIter, &buf[i].lowIndex, &buf[i].highIndex, &status);

  3594     return &buf[i];

  3595 }

  3597 U_NAMESPACE_END

  3600 // #define USEARCH_DEBUG

  3602 #ifdef USEARCH_DEBUG

  3603 #include <stdio.h>

  3604 #include <stdlib.h>

  3605 #endif

  3607 /*

  3608  * Find the next break boundary after startIndex. If the UStringSearch object

  3609  * has an external break iterator, use that. Otherwise use the internal character

  3610  * break iterator.

  3611  */

  3612 static int32_t nextBoundaryAfter(UStringSearch *strsrch, int32_t startIndex) {

  3613 #if 0

  3614     const UChar *text = strsrch->search->text;

  3615     int32_t textLen   = strsrch->search->textLength;

  3617     U_ASSERT(startIndex>=0);

  3618     U_ASSERT(startIndex<=textLen);

  3620     if (startIndex >= textLen) {

  3621         return startIndex;

  3622     }

  3624     UChar32  c;

  3625     int32_t  i = startIndex;

  3626     U16_NEXT(text, i, textLen, c);

  3628     // If we are on a control character, stop without looking for combining marks.

  3629     //    Control characters do not combine.

  3630     int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);

  3631     if (gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR) {

  3632         return i;

  3633     }

  3635     // The initial character was not a control, and can thus accept trailing

  3636     //   combining characters.  Advance over however many of them there are.

  3637     int32_t  indexOfLastCharChecked;

  3638     for (;;) {

  3639         indexOfLastCharChecked = i;

  3640         if (i>=textLen) {

  3641             break;

  3642         }

  3643         U16_NEXT(text, i, textLen, c);

  3644         gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);

  3645         if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {

  3646             break;

  3647         }

  3648     }

  3649     return indexOfLastCharChecked;

  3650 #elif !UCONFIG_NO_BREAK_ITERATION

  3651     UBreakIterator *breakiterator = strsrch->search->breakIter;

  3653     if (breakiterator == NULL) {

  3654         breakiterator = strsrch->search->internalBreakIter;

  3655     }

  3657     if (breakiterator != NULL) {

  3658         return ubrk_following(breakiterator, startIndex);

  3659     }

  3661     return startIndex;

  3662 #else

  3663     // **** or should we use the original code? ****

  3664     return startIndex;

  3665 #endif

  3667 }

  3669 /*

  3670  * Returns TRUE if index is on a break boundary. If the UStringSearch

  3671  * has an external break iterator, test using that, otherwise test

  3672  * using the internal character break iterator.

  3673  */

  3674 static UBool isBreakBoundary(UStringSearch *strsrch, int32_t index) {

  3675 #if 0

  3676     const UChar *text = strsrch->search->text;

  3677     int32_t textLen   = strsrch->search->textLength;

  3679     U_ASSERT(index>=0);

  3680     U_ASSERT(index<=textLen);

  3682     if (index>=textLen || index<=0) {

  3683         return TRUE;

  3684     }

  3686     // If the character at the current index is not a GRAPHEME_EXTEND

  3687     //    then we can not be within a combining sequence.

  3688     UChar32  c;

  3689     U16_GET(text, 0, index, textLen, c);

  3690     int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);

  3691     if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {

  3692         return TRUE;

  3693     }

  3695     // We are at a combining mark.  If the preceding character is anything

  3696     //   except a CONTROL, CR or LF, we are in a combining sequence.

  3697     U16_PREV(text, 0, index, c);

  3698     gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);

  3699     UBool combining =  !(gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR);

  3700     return !combining;

  3701 #elif !UCONFIG_NO_BREAK_ITERATION

  3702     UBreakIterator *breakiterator = strsrch->search->breakIter;

  3704     if (breakiterator == NULL) {

  3705         breakiterator = strsrch->search->internalBreakIter;

  3706     }

  3708     return (breakiterator != NULL && ubrk_isBoundary(breakiterator, index));

  3709 #else

  3710     // **** or use the original code? ****

  3711     return TRUE;

  3712 #endif

  3713 }

  3715 #if 0

  3716 static UBool onBreakBoundaries(const UStringSearch *strsrch, int32_t start, int32_t end)

  3717 {

  3718 #if !UCONFIG_NO_BREAK_ITERATION

  3719     UBreakIterator *breakiterator = strsrch->search->breakIter;

  3721     if (breakiterator != NULL) {

  3722         int32_t startindex = ubrk_first(breakiterator);

  3723         int32_t endindex   = ubrk_last(breakiterator);

  3725         // out-of-range indexes are never boundary positions

  3726         if (start < startindex || start > endindex ||

  3727             end < startindex || end > endindex) {

  3728             return FALSE;

  3729         }

  3731         return ubrk_isBoundary(breakiterator, start) &&

  3732                ubrk_isBoundary(breakiterator, end);

  3733     }

  3734 #endif

  3736     return TRUE;

  3737 }

  3738 #endif

  3740 typedef enum {

  3741     U_CE_MATCH = -1,

  3742     U_CE_NO_MATCH = 0,

  3743     U_CE_SKIP_TARG,

  3744     U_CE_SKIP_PATN

  3745 } UCompareCEsResult;

  3746 #define U_CE_LEVEL2_BASE 0x00000005

  3747 #define U_CE_LEVEL3_BASE 0x00050000

  3749 static UCompareCEsResult compareCE64s(int64_t targCE, int64_t patCE, int16_t compareType) {

  3750     if (targCE == patCE) {

  3751         return U_CE_MATCH;

  3752     }

  3753     if (compareType == 0) {

  3754         return U_CE_NO_MATCH;

  3755     }

  3757     int64_t targCEshifted = targCE >> 32;

  3758     int64_t patCEshifted = patCE >> 32;

  3759     int64_t mask;

  3761     mask = 0xFFFF0000;

  3762     int32_t targLev1 = (int32_t)(targCEshifted & mask);

  3763     int32_t patLev1 = (int32_t)(patCEshifted & mask);

  3764     if ( targLev1 != patLev1 ) {

  3765         if ( targLev1 == 0 ) {

  3766             return U_CE_SKIP_TARG;

  3767         }

  3768         if ( patLev1 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) {

  3769             return U_CE_SKIP_PATN;

  3770         }

  3771         return U_CE_NO_MATCH;

  3772     }

  3774     mask = 0x0000FFFF;

  3775     int32_t targLev2 = (int32_t)(targCEshifted & mask);

  3776     int32_t patLev2 = (int32_t)(patCEshifted & mask);

  3777     if ( targLev2 != patLev2 ) {

  3778         if ( targLev2 == 0 ) {

  3779             return U_CE_SKIP_TARG;

  3780         }

  3781         if ( patLev2 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) {

  3782             return U_CE_SKIP_PATN;

  3783         }

  3784         return (patLev2 == U_CE_LEVEL2_BASE || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev2 == U_CE_LEVEL2_BASE) )?

  3785             U_CE_MATCH: U_CE_NO_MATCH;

  3786     }

  3788     mask = 0xFFFF0000;

  3789     int32_t targLev3 = (int32_t)(targCE & mask);

  3790     int32_t patLev3 = (int32_t)(patCE & mask);

  3791     if ( targLev3 != patLev3 ) {

  3792         return (patLev3 == U_CE_LEVEL3_BASE || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev3 == U_CE_LEVEL3_BASE) )?

  3793             U_CE_MATCH: U_CE_NO_MATCH;

  3794    }

  3796     return U_CE_MATCH;

  3797 }

  3799 #if BOYER_MOORE

  3800 // TODO: #if BOYER_MOORE, need 32-bit version of compareCE64s

  3801 #endif

  3803 U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch  *strsrch,

  3804                                        int32_t        startIdx,

  3805                                        int32_t        *matchStart,

  3806                                        int32_t        *matchLimit,

  3807                                        UErrorCode     *status)

  3808 {

  3809     if (U_FAILURE(*status)) {

  3810         return FALSE;

  3811     }

  3813     // TODO:  reject search patterns beginning with a combining char.

  3815 #ifdef USEARCH_DEBUG

  3816     if (getenv("USEARCH_DEBUG") != NULL) {

  3817         printf("Pattern CEs\n");

  3818         for (int ii=0; ii<strsrch->pattern.CELength; ii++) {

  3819             printf(" %8x", strsrch->pattern.CE[ii]);

  3820         }

  3821         printf("\n");

  3822     }

  3824 #endif

  3825     // Input parameter sanity check.

  3826     //  TODO:  should input indicies clip to the text length

  3827     //         in the same way that UText does.

  3828     if(strsrch->pattern.CELength == 0         ||

  3829        startIdx < 0                           ||

  3830        startIdx > strsrch->search->textLength ||

  3831        strsrch->pattern.CE == NULL) {

  3832            *status = U_ILLEGAL_ARGUMENT_ERROR;

  3833            return FALSE;

  3834     }

  3836     if (strsrch->pattern.PCE == NULL) {

  3837         initializePatternPCETable(strsrch, status);

  3838     }

  3840     ucol_setOffset(strsrch->textIter, startIdx, status);

  3841     CEBuffer ceb(strsrch, status);

  3844     int32_t    targetIx = 0;

  3845     const CEI *targetCEI = NULL;

  3846     int32_t    patIx;

  3847     UBool      found;

  3849     int32_t  mStart = -1;

  3850     int32_t  mLimit = -1;

  3851     int32_t  minLimit;

  3852     int32_t  maxLimit;

  3856     // Outer loop moves over match starting positions in the

  3857     //      target CE space.

  3858     // Here we see the target as a sequence of collation elements, resulting from the following:

  3859     // 1. Target characters were decomposed, and (if appropriate) other compressions and expansions are applied

  3860     //    (for example, digraphs such as IJ may be broken into two characters).

  3861     // 2. An int64_t CE weight is determined for each resulting unit (high 16 bits are primary strength, next

  3862     //    16 bits are secondary, next 16 (the high 16 bits of the low 32-bit half) are tertiary. Any of these

  3863     //    fields that are for strengths below that of the collator are set to 0. If this makes the int64_t

  3864     //    CE weight 0 (as for a combining diacritic with secondary weight when the collator strentgh is primary),

  3865     //    then the CE is deleted, so the following code sees only CEs that are relevant.

  3866     // For each CE, the lowIndex and highIndex correspond to where this CE begins and ends in the original text.

  3867     // If lowIndex==highIndex, either the CE resulted from an expansion/decomposition of one of the original text

  3868     // characters, or the CE marks the limit of the target text (in which case the CE weight is UCOL_PROCESSED_NULLORDER).

  3869     //

  3870     for(targetIx=0; ; targetIx++)

  3871     {

  3872         found = TRUE;

  3873         //  Inner loop checks for a match beginning at each

  3874         //  position from the outer loop.

  3875         int32_t targetIxOffset = 0;

  3876         int64_t patCE = 0;

  3877         // For targetIx > 0, this ceb.get gets a CE that is as far back in the ring buffer

  3878         // (compared to the last CE fetched for the previous targetIx value) as we need to go

  3879         // for this targetIx value, so if it is non-NULL then other ceb.get calls should be OK.

  3880         const CEI *firstCEI = ceb.get(targetIx);

  3881         if (firstCEI == NULL) {

  3882             *status = U_INTERNAL_PROGRAM_ERROR;

  3883             found = FALSE;

  3884             break;

  3885         }

  3887         for (patIx=0; patIx<strsrch->pattern.PCELength; patIx++) {

  3888             patCE = strsrch->pattern.PCE[patIx];

  3889             targetCEI = ceb.get(targetIx+patIx+targetIxOffset);

  3890             //  Compare CE from target string with CE from the pattern.

  3891             //    Note that the target CE will be UCOL_PROCESSED_NULLORDER if we reach the end of input,

  3892             //    which will fail the compare, below.

  3893             UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType);

  3894             if ( ceMatch == U_CE_NO_MATCH ) {

  3895                 found = FALSE;

  3896                 break;

  3897             } else if ( ceMatch > U_CE_NO_MATCH ) {

  3898                 if ( ceMatch == U_CE_SKIP_TARG ) {

  3899                     // redo with same patCE, next targCE

  3900                     patIx--;

  3901                     targetIxOffset++;

  3902                 } else { // ceMatch == U_CE_SKIP_PATN

  3903                     // redo with same targCE, next patCE

  3904                     targetIxOffset--;

  3905                 }

  3906             }

  3907         }

  3908         targetIxOffset += strsrch->pattern.PCELength; // this is now the offset in target CE space to end of the match so far

  3910         if (!found && ((targetCEI == NULL) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) {

  3911             // No match at this targetIx.  Try again at the next.

  3912             continue;

  3913         }

  3915         if (!found) {

  3916             // No match at all, we have run off the end of the target text.

  3917             break;

  3918         }

  3921         // We have found a match in CE space.

  3922         // Now determine the bounds in string index space.

  3923         //  There still is a chance of match failure if the CE range not correspond to

  3924         //     an acceptable character range.

  3925         //

  3926         const CEI *lastCEI  = ceb.get(targetIx + targetIxOffset - 1);

  3928         mStart   = firstCEI->lowIndex;

  3929         minLimit = lastCEI->lowIndex;

  3931         // Look at the CE following the match.  If it is UCOL_NULLORDER the match

  3932         //   extended to the end of input, and the match is good.

  3934         // Look at the high and low indices of the CE following the match. If

  3935         // they are the same it means one of two things:

  3936         //    1. The match extended to the last CE from the target text, which is OK, or

  3937         //    2. The last CE that was part of the match is in an expansion that extends

  3938         //       to the first CE after the match. In this case, we reject the match.

  3939         const CEI *nextCEI = 0;

  3940         if (strsrch->search->elementComparisonType == 0) {

  3941             nextCEI  = ceb.get(targetIx + targetIxOffset);

  3942             maxLimit = nextCEI->lowIndex;

  3943             if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) {

  3944                 found = FALSE;

  3945             }

  3946         } else {

  3947             for ( ; ; ++targetIxOffset ) {

  3948                 nextCEI = ceb.get(targetIx + targetIxOffset);

  3949                 maxLimit = nextCEI->lowIndex;

  3950                 // If we are at the end of the target too, match succeeds

  3951                 if (  nextCEI->ce == UCOL_PROCESSED_NULLORDER ) {

  3952                     break;

  3953                 }

  3954                 // As long as the next CE has primary weight of 0,

  3955                 // it is part of the last target element matched by the pattern;

  3956                 // make sure it can be part of a match with the last patCE

  3957                 if ( (((nextCEI->ce) >> 32) & 0xFFFF0000UL) == 0 ) {

  3958                     UCompareCEsResult ceMatch = compareCE64s(nextCEI->ce, patCE, strsrch->search->elementComparisonType);

  3959                     if ( ceMatch == U_CE_NO_MATCH || ceMatch == U_CE_SKIP_PATN ) {

  3960                         found = FALSE;

  3961                         break;

  3962                     }

  3963                 // If lowIndex == highIndex, this target CE is part of an expansion of the last matched

  3964                 // target element, but it has non-zero primary weight => match fails

  3965                 } else if ( nextCEI->lowIndex == nextCEI->highIndex ) {

  3966                     found = false;

  3967                     break;

  3968                 // Else the target CE is not part of an expansion of the last matched element, match succeeds

  3969                 } else {

  3970                     break;

  3971                 }

  3972             }

  3973         }

  3976         // Check for the start of the match being within a combining sequence.

  3977         //   This can happen if the pattern itself begins with a combining char, and

  3978         //   the match found combining marks in the target text that were attached

  3979         //    to something else.

  3980         //   This type of match should be rejected for not completely consuming a

  3981         //   combining sequence.

  3982         if (!isBreakBoundary(strsrch, mStart)) {

  3983             found = FALSE;

  3984         }

  3986         // Check for the start of the match being within an Collation Element Expansion,

  3987         //   meaning that the first char of the match is only partially matched.

  3988         //   With exapnsions, the first CE will report the index of the source

  3989         //   character, and all subsequent (expansions) CEs will report the source index of the

  3990         //    _following_ character.

  3991         int32_t secondIx = firstCEI->highIndex;

  3992         if (mStart == secondIx) {

  3993             found = FALSE;

  3994         }

  3996         //  Advance the match end position to the first acceptable match boundary.

  3997         //    This advances the index over any combining charcters.

  3998         mLimit = maxLimit;

  3999         if (minLimit < maxLimit) {

  4000             // When the last CE's low index is same with its high index, the CE is likely

  4001             // a part of expansion. In this case, the index is located just after the

  4002             // character corresponding to the CEs compared above. If the index is right

  4003             // at the break boundary, move the position to the next boundary will result

  4004             // incorrect match length when there are ignorable characters exist between

  4005             // the position and the next character produces CE(s). See ticket#8482.

  4006             if (minLimit == lastCEI->highIndex && isBreakBoundary(strsrch, minLimit)) {

  4007                 mLimit = minLimit;

  4008             } else {

  4009                 int32_t nba = nextBoundaryAfter(strsrch, minLimit);

  4010                 if (nba >= lastCEI->highIndex) {

  4011                     mLimit = nba;

  4012                 }

  4013             }

  4014         }

  4016     #ifdef USEARCH_DEBUG

  4017         if (getenv("USEARCH_DEBUG") != NULL) {

  4018             printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit);

  4019         }

  4020     #endif

  4022         // If advancing to the end of a combining sequence in character indexing space

  4023         //   advanced us beyond the end of the match in CE space, reject this match.

  4024         if (mLimit > maxLimit) {

  4025             found = FALSE;

  4026         }

  4028         if (!isBreakBoundary(strsrch, mLimit)) {

  4029             found = FALSE;

  4030         }

  4032         if (! checkIdentical(strsrch, mStart, mLimit)) {

  4033             found = FALSE;

  4034         }

  4036         if (found) {

  4037             break;

  4038         }

  4039     }

  4041     #ifdef USEARCH_DEBUG

  4042     if (getenv("USEARCH_DEBUG") != NULL) {

  4043         printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx);

  4044         int32_t  lastToPrint = ceb.limitIx+2;

  4045         for (int ii=ceb.firstIx; ii<lastToPrint; ii++) {

  4046             printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex);

  4047         }

  4048         printf("\n%s\n", found? "match found" : "no match");

  4049     }

  4050     #endif

  4052     // All Done.  Store back the match bounds to the caller.

  4053     //

  4054     if (found==FALSE) {

  4055         mLimit = -1;

  4056         mStart = -1;

  4057     }

  4059     if (matchStart != NULL) {

  4060         *matchStart= mStart;

  4061     }

  4063     if (matchLimit != NULL) {

  4064         *matchLimit = mLimit;

  4065     }

  4067     return found;

  4068 }

  4070 U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch  *strsrch,

  4071                                                 int32_t        startIdx,

  4072                                                 int32_t        *matchStart,

  4073                                                 int32_t        *matchLimit,

  4074                                                 UErrorCode     *status)

  4075 {

  4076     if (U_FAILURE(*status)) {

  4077         return FALSE;

  4078     }

  4080     // TODO:  reject search patterns beginning with a combining char.

  4082 #ifdef USEARCH_DEBUG

  4083     if (getenv("USEARCH_DEBUG") != NULL) {

  4084         printf("Pattern CEs\n");

  4085         for (int ii=0; ii<strsrch->pattern.CELength; ii++) {

  4086             printf(" %8x", strsrch->pattern.CE[ii]);

  4087         }

  4088         printf("\n");

  4089     }

  4091 #endif

  4092     // Input parameter sanity check.

  4093     //  TODO:  should input indicies clip to the text length

  4094     //         in the same way that UText does.

  4095     if(strsrch->pattern.CELength == 0         ||

  4096        startIdx < 0                           ||

  4097        startIdx > strsrch->search->textLength ||

  4098        strsrch->pattern.CE == NULL) {

  4099            *status = U_ILLEGAL_ARGUMENT_ERROR;

  4100            return FALSE;

  4101     }

  4103     if (strsrch->pattern.PCE == NULL) {

  4104         initializePatternPCETable(strsrch, status);

  4105     }

  4107     CEBuffer ceb(strsrch, status);

  4108     int32_t    targetIx = 0;

  4110     /*

  4111      * Pre-load the buffer with the CE's for the grapheme

  4112      * after our starting position so that we're sure that

  4113      * we can look at the CE following the match when we

  4114      * check the match boundaries.

  4115      *

  4116      * This will also pre-fetch the first CE that we'll

  4117      * consider for the match.

  4118      */

  4119     if (startIdx < strsrch->search->textLength) {

  4120         UBreakIterator *bi = strsrch->search->internalBreakIter;

  4121         int32_t next = ubrk_following(bi, startIdx);

  4123         ucol_setOffset(strsrch->textIter, next, status);

  4125         for (targetIx = 0; ; targetIx += 1) {

  4126             if (ceb.getPrevious(targetIx)->lowIndex < startIdx) {

  4127                 break;

  4128             }

  4129         }

  4130     } else {

  4131         ucol_setOffset(strsrch->textIter, startIdx, status);

  4132     }

  4135     const CEI *targetCEI = NULL;

  4136     int32_t    patIx;

  4137     UBool      found;

  4139     int32_t  limitIx = targetIx;

  4140     int32_t  mStart = -1;

  4141     int32_t  mLimit = -1;

  4142     int32_t  minLimit;

  4143     int32_t  maxLimit;

  4147     // Outer loop moves over match starting positions in the

  4148     //      target CE space.

  4149     // Here, targetIx values increase toward the beginning of the base text (i.e. we get the text CEs in reverse order).

  4150     // But  patIx is 0 at the beginning of the pattern and increases toward the end.

  4151     // So this loop performs a comparison starting with the end of pattern, and prcessd toward the beginning of the pattern

  4152     // and the beginning of the base text.

  4153     for(targetIx = limitIx; ; targetIx += 1)

  4154     {

  4155         found = TRUE;

  4156         // For targetIx > limitIx, this ceb.getPrevious gets a CE that is as far back in the ring buffer

  4157         // (compared to the last CE fetched for the previous targetIx value) as we need to go

  4158         // for this targetIx value, so if it is non-NULL then other ceb.getPrevious calls should be OK.

  4159         const CEI *lastCEI  = ceb.getPrevious(targetIx);

  4160         if (lastCEI == NULL) {

  4161             *status = U_INTERNAL_PROGRAM_ERROR;

  4162             found = FALSE;

  4163              break;

  4164         }

  4165         //  Inner loop checks for a match beginning at each

  4166         //  position from the outer loop.

  4167         int32_t targetIxOffset = 0;

  4168         for (patIx = strsrch->pattern.PCELength - 1; patIx >= 0; patIx -= 1) {

  4169             int64_t patCE = strsrch->pattern.PCE[patIx];

  4171             targetCEI = ceb.getPrevious(targetIx + strsrch->pattern.PCELength - 1 - patIx + targetIxOffset);

  4172             //  Compare CE from target string with CE from the pattern.

  4173             //    Note that the target CE will be UCOL_NULLORDER if we reach the end of input,

  4174             //    which will fail the compare, below.

  4175             UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType);

  4176             if ( ceMatch == U_CE_NO_MATCH ) {

  4177                 found = FALSE;

  4178                 break;

  4179             } else if ( ceMatch > U_CE_NO_MATCH ) {

  4180                 if ( ceMatch == U_CE_SKIP_TARG ) {

  4181                     // redo with same patCE, next targCE

  4182                     patIx++;

  4183                     targetIxOffset++;

  4184                 } else { // ceMatch == U_CE_SKIP_PATN

  4185                     // redo with same targCE, next patCE

  4186                     targetIxOffset--;

  4187                 }

  4188             }

  4189         }

  4191         if (!found && ((targetCEI == NULL) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) {

  4192             // No match at this targetIx.  Try again at the next.

  4193             continue;

  4194         }

  4196         if (!found) {

  4197             // No match at all, we have run off the end of the target text.

  4198             break;

  4199         }

  4202         // We have found a match in CE space.

  4203         // Now determine the bounds in string index space.

  4204         //  There still is a chance of match failure if the CE range not correspond to

  4205         //     an acceptable character range.

  4206         //

  4207         const CEI *firstCEI = ceb.getPrevious(targetIx + strsrch->pattern.PCELength - 1 + targetIxOffset);

  4208         mStart   = firstCEI->lowIndex;

  4210         // Check for the start of the match being within a combining sequence.

  4211         //   This can happen if the pattern itself begins with a combining char, and

  4212         //   the match found combining marks in the target text that were attached

  4213         //    to something else.

  4214         //   This type of match should be rejected for not completely consuming a

  4215         //   combining sequence.

  4216         if (!isBreakBoundary(strsrch, mStart)) {

  4217             found = FALSE;

  4218         }

  4220         // Look at the high index of the first CE in the match. If it's the same as the

  4221         // low index, the first CE in the match is in the middle of an expansion.

  4222         if (mStart == firstCEI->highIndex) {

  4223             found = FALSE;

  4224         }

  4227         minLimit = lastCEI->lowIndex;

  4229         if (targetIx > 0) {

  4230             // Look at the CE following the match.  If it is UCOL_NULLORDER the match

  4231             //   extended to the end of input, and the match is good.

  4233             // Look at the high and low indices of the CE following the match. If

  4234             // they are the same it means one of two things:

  4235             //    1. The match extended to the last CE from the target text, which is OK, or

  4236             //    2. The last CE that was part of the match is in an expansion that extends

  4237             //       to the first CE after the match. In this case, we reject the match.

  4238             const CEI *nextCEI  = ceb.getPrevious(targetIx - 1);

  4240             if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) {

  4241                 found = FALSE;

  4242             }

  4244             mLimit = maxLimit = nextCEI->lowIndex;

  4246             //  Advance the match end position to the first acceptable match boundary.

  4247             //    This advances the index over any combining charcters.

  4248             if (minLimit < maxLimit) {

  4249                 int32_t nba = nextBoundaryAfter(strsrch, minLimit);

  4251                 if (nba >= lastCEI->highIndex) {

  4252                     mLimit = nba;

  4253                 }

  4254             }

  4256             // If advancing to the end of a combining sequence in character indexing space

  4257             //   advanced us beyond the end of the match in CE space, reject this match.

  4258             if (mLimit > maxLimit) {

  4259                 found = FALSE;

  4260             }

  4262             // Make sure the end of the match is on a break boundary

  4263             if (!isBreakBoundary(strsrch, mLimit)) {

  4264                 found = FALSE;

  4265             }

  4267         } else {

  4268             // No non-ignorable CEs after this point.

  4269             // The maximum position is detected by boundary after

  4270             // the last non-ignorable CE. Combining sequence

  4271             // across the start index will be truncated.

  4272             int32_t nba = nextBoundaryAfter(strsrch, minLimit);

  4273             mLimit = maxLimit = (nba > 0) && (startIdx > nba) ? nba : startIdx;

  4274         }

  4276     #ifdef USEARCH_DEBUG

  4277         if (getenv("USEARCH_DEBUG") != NULL) {

  4278             printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit);

  4279         }

  4280     #endif

  4283         if (! checkIdentical(strsrch, mStart, mLimit)) {

  4284             found = FALSE;

  4285         }

  4287         if (found) {

  4288             break;

  4289         }

  4290     }

  4292     #ifdef USEARCH_DEBUG

  4293     if (getenv("USEARCH_DEBUG") != NULL) {

  4294         printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx);

  4295         int32_t  lastToPrint = ceb.limitIx+2;

  4296         for (int ii=ceb.firstIx; ii<lastToPrint; ii++) {

  4297             printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex);

  4298         }

  4299         printf("\n%s\n", found? "match found" : "no match");

  4300     }

  4301     #endif

  4303     // All Done.  Store back the match bounds to the caller.

  4304     //

  4305     if (found==FALSE) {

  4306         mLimit = -1;

  4307         mStart = -1;

  4308     }

  4310     if (matchStart != NULL) {

  4311         *matchStart= mStart;

  4312     }

  4314     if (matchLimit != NULL) {

  4315         *matchLimit = mLimit;

  4316     }

  4318     return found;

  4319 }

  4321 // internal use methods declared in usrchimp.h -----------------------------

  4323 UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status)

  4324 {

  4325     if (U_FAILURE(*status)) {

  4326         setMatchNotFound(strsrch);

  4327         return FALSE;

  4328     }

  4330 #if BOYER_MOORE

  4331     UCollationElements *coleiter        = strsrch->textIter;

  4332     int32_t             textlength      = strsrch->search->textLength;

  4333     int32_t            *patternce       = strsrch->pattern.CE;

  4334     int32_t             patterncelength = strsrch->pattern.CELength;

  4335     int32_t             textoffset      = ucol_getOffset(coleiter);

  4337     // status used in setting coleiter offset, since offset is checked in

  4338     // shiftForward before setting the coleiter offset, status never

  4339     // a failure

  4340     textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,

  4341                               patterncelength);

  4342     while (textoffset <= textlength)

  4343     {

  4344         uint32_t    patternceindex = patterncelength - 1;

  4345         int32_t     targetce;

  4346         UBool       found          = FALSE;

  4347         int32_t    lastce          = UCOL_NULLORDER;

  4349         setColEIterOffset(coleiter, textoffset);

  4351         for (;;) {

  4352             // finding the last pattern ce match, imagine composite characters

  4353             // for example: search for pattern A in text \u00C0

  4354             // we'll have to skip \u0300 the grave first before we get to A

  4355             targetce = ucol_previous(coleiter, status);

  4356             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {

  4357                 found = FALSE;

  4358                 break;

  4359             }

  4360             targetce = getCE(strsrch, targetce);

  4361             if (targetce == UCOL_IGNORABLE && inNormBuf(coleiter)) {

  4362                 // this is for the text \u0315\u0300 that requires

  4363                 // normalization and pattern \u0300, where \u0315 is ignorable

  4364                 continue;

  4365             }

  4366             if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) {

  4367                 lastce = targetce;

  4368             }

  4369             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s

  4370             if (targetce == patternce[patternceindex]) {

  4371                 // the first ce can be a contraction

  4372                 found = TRUE;

  4373                 break;

  4374             }

  4375             if (!hasExpansion(coleiter)) {

  4376                 found = FALSE;

  4377                 break;

  4378             }

  4379         }

  4381         //targetce = lastce;

  4383         while (found && patternceindex > 0) {

  4384             lastce = targetce;

  4385             targetce    = ucol_previous(coleiter, status);

  4386             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {

  4387                 found = FALSE;

  4388                 break;

  4389             }

  4390             targetce    = getCE(strsrch, targetce);

  4391             if (targetce == UCOL_IGNORABLE) {

  4392                 continue;

  4393             }

  4395             patternceindex --;

  4396             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s

  4397             found = found && targetce == patternce[patternceindex];

  4398         }

  4400         targetce = lastce;

  4402         if (!found) {

  4403             if (U_FAILURE(*status)) {

  4404                 break;

  4405             }

  4406             textoffset = shiftForward(strsrch, textoffset, lastce,

  4407                                       patternceindex);

  4408             // status checked at loop.

  4409             patternceindex = patterncelength;

  4410             continue;

  4411         }

  4413         if (checkNextExactMatch(strsrch, &textoffset, status)) {

  4414             // status checked in ucol_setOffset

  4415             setColEIterOffset(coleiter, strsrch->search->matchedIndex);

  4416             return TRUE;

  4417         }

  4418     }

  4419     setMatchNotFound(strsrch);

  4420     return FALSE;

  4421 #else

  4422     int32_t textOffset = ucol_getOffset(strsrch->textIter);

  4423     int32_t start = -1;

  4424     int32_t end = -1;

  4426     if (usearch_search(strsrch, textOffset, &start, &end, status)) {

  4427         strsrch->search->matchedIndex  = start;

  4428         strsrch->search->matchedLength = end - start;

  4429         return TRUE;

  4430     } else {

  4431         setMatchNotFound(strsrch);

  4432         return FALSE;

  4433     }

  4434 #endif

  4435 }

  4437 UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status)

  4438 {

  4439     if (U_FAILURE(*status)) {

  4440         setMatchNotFound(strsrch);

  4441         return FALSE;

  4442     }

  4444 #if BOYER_MOORE

  4445     UCollationElements *coleiter        = strsrch->textIter;

  4446     int32_t             textlength      = strsrch->search->textLength;

  4447     int32_t            *patternce       = strsrch->pattern.CE;

  4448     int32_t             patterncelength = strsrch->pattern.CELength;

  4449     int32_t             textoffset      = ucol_getOffset(coleiter);

  4450     UBool               hasPatternAccents =

  4451        strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents;

  4453     textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,

  4454                               patterncelength);

  4455     strsrch->canonicalPrefixAccents[0] = 0;

  4456     strsrch->canonicalSuffixAccents[0] = 0;

  4458     while (textoffset <= textlength)

  4459     {

  4460         int32_t     patternceindex = patterncelength - 1;

  4461         int32_t     targetce;

  4462         UBool       found          = FALSE;

  4463         int32_t     lastce         = UCOL_NULLORDER;

  4465         setColEIterOffset(coleiter, textoffset);

  4467         for (;;) {

  4468             // finding the last pattern ce match, imagine composite characters

  4469             // for example: search for pattern A in text \u00C0

  4470             // we'll have to skip \u0300 the grave first before we get to A

  4471             targetce = ucol_previous(coleiter, status);

  4472             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {

  4473                 found = FALSE;

  4474                 break;

  4475             }

  4476             targetce = getCE(strsrch, targetce);

  4477             if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) {

  4478                 lastce = targetce;

  4479             }

  4480             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s

  4481             if (targetce == patternce[patternceindex]) {

  4482                 // the first ce can be a contraction

  4483                 found = TRUE;

  4484                 break;

  4485             }

  4486             if (!hasExpansion(coleiter)) {

  4487                 found = FALSE;

  4488                 break;

  4489             }

  4490         }

  4492         while (found && patternceindex > 0) {

  4493             targetce    = ucol_previous(coleiter, status);

  4494             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {

  4495                 found = FALSE;

  4496                 break;

  4497             }

  4498             targetce    = getCE(strsrch, targetce);

  4499             if (targetce == UCOL_IGNORABLE) {

  4500                 continue;

  4501             }

  4503             patternceindex --;

  4504             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s

  4505             found = found && targetce == patternce[patternceindex];

  4506         }

  4508         // initializing the rearranged accent array

  4509         if (hasPatternAccents && !found) {

  4510             strsrch->canonicalPrefixAccents[0] = 0;

  4511             strsrch->canonicalSuffixAccents[0] = 0;

  4512             if (U_FAILURE(*status)) {

  4513                 break;

  4514             }

  4515             found = doNextCanonicalMatch(strsrch, textoffset, status);

  4516         }

  4518         if (!found) {

  4519             if (U_FAILURE(*status)) {

  4520                 break;

  4521             }

  4522             textoffset = shiftForward(strsrch, textoffset, lastce,

  4523                                       patternceindex);

  4524             // status checked at loop

  4525             patternceindex = patterncelength;

  4526             continue;

  4527         }

  4529         if (checkNextCanonicalMatch(strsrch, &textoffset, status)) {

  4530             setColEIterOffset(coleiter, strsrch->search->matchedIndex);

  4531             return TRUE;

  4532         }

  4533     }

  4534     setMatchNotFound(strsrch);

  4535     return FALSE;

  4536 #else

  4537     int32_t textOffset = ucol_getOffset(strsrch->textIter);

  4538     int32_t start = -1;

  4539     int32_t end = -1;

  4541     if (usearch_search(strsrch, textOffset, &start, &end, status)) {

  4542         strsrch->search->matchedIndex  = start;

  4543         strsrch->search->matchedLength = end - start;

  4544         return TRUE;

  4545     } else {

  4546         setMatchNotFound(strsrch);

  4547         return FALSE;

  4548     }

  4549 #endif

  4550 }

  4552 UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status)

  4553 {

  4554     if (U_FAILURE(*status)) {

  4555         setMatchNotFound(strsrch);

  4556         return FALSE;

  4557     }

  4559 #if BOYER_MOORE

  4560     UCollationElements *coleiter        = strsrch->textIter;

  4561     int32_t            *patternce       = strsrch->pattern.CE;

  4562     int32_t             patterncelength = strsrch->pattern.CELength;

  4563     int32_t             textoffset      = ucol_getOffset(coleiter);

  4565     // shifting it check for setting offset

  4566     // if setOffset is called previously or there was no previous match, we

  4567     // leave the offset as it is.

  4568     if (strsrch->search->matchedIndex != USEARCH_DONE) {

  4569         textoffset = strsrch->search->matchedIndex;

  4570     }

  4572     textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,

  4573                               patterncelength);

  4575     while (textoffset >= 0)

  4576     {

  4577         int32_t     patternceindex = 1;

  4578         int32_t     targetce;

  4579         UBool       found          = FALSE;

  4580         int32_t     firstce        = UCOL_NULLORDER;

  4582         // if status is a failure, ucol_setOffset does nothing

  4583         setColEIterOffset(coleiter, textoffset);

  4585         for (;;) {

  4586             // finding the first pattern ce match, imagine composite

  4587             // characters. for example: search for pattern \u0300 in text

  4588             // \u00C0, we'll have to skip A first before we get to

  4589             // \u0300 the grave accent

  4590             targetce = ucol_next(coleiter, status);

  4591             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {

  4592                 found = FALSE;

  4593                 break;

  4594             }

  4595             targetce = getCE(strsrch, targetce);

  4596             if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) {

  4597                 firstce = targetce;

  4598             }

  4599             if (targetce == UCOL_IGNORABLE && strsrch->strength != UCOL_PRIMARY) {

  4600                 continue;

  4601             }

  4602             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s

  4603             if (targetce == patternce[0]) {

  4604                 found = TRUE;

  4605                 break;

  4606             }

  4607             if (!hasExpansion(coleiter)) {

  4608                 // checking for accents in composite character

  4609                 found = FALSE;

  4610                 break;

  4611             }

  4612         }

  4614         //targetce = firstce;

  4616         while (found && (patternceindex < patterncelength)) {

  4617             firstce = targetce;

  4618             targetce    = ucol_next(coleiter, status);

  4619             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {

  4620                 found = FALSE;

  4621                 break;

  4622             }

  4623             targetce    = getCE(strsrch, targetce);

  4624             if (targetce == UCOL_IGNORABLE) {

  4625                 continue;

  4626             }

  4628             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s

  4629             found = found && targetce == patternce[patternceindex];

  4630             patternceindex ++;

  4631         }

  4633         targetce = firstce;

  4635         if (!found) {

  4636             if (U_FAILURE(*status)) {

  4637                 break;

  4638             }

  4640             textoffset = reverseShift(strsrch, textoffset, targetce,

  4641                                       patternceindex);

  4642             patternceindex = 0;

  4643             continue;

  4644         }

  4646         if (checkPreviousExactMatch(strsrch, &textoffset, status)) {

  4647             setColEIterOffset(coleiter, textoffset);

  4648             return TRUE;

  4649         }

  4650     }

  4651     setMatchNotFound(strsrch);

  4652     return FALSE;

  4653 #else

  4654     int32_t textOffset;

  4656     if (strsrch->search->isOverlap) {

  4657         if (strsrch->search->matchedIndex != USEARCH_DONE) {

  4658             textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - 1;

  4659         } else {

  4660             // move the start position at the end of possible match

  4661             initializePatternPCETable(strsrch, status);

  4662             for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.PCELength - 1; nPCEs++) {

  4663                 int64_t pce = ucol_nextProcessed(strsrch->textIter, NULL, NULL, status);

  4664                 if (pce == UCOL_PROCESSED_NULLORDER) {

  4665                     // at the end of the text

  4666                     break;

  4667                 }

  4668             }

  4669             if (U_FAILURE(*status)) {

  4670                 setMatchNotFound(strsrch);

  4671                 return FALSE;

  4672             }

  4673             textOffset = ucol_getOffset(strsrch->textIter);

  4674         }

  4675     } else {

  4676         textOffset = ucol_getOffset(strsrch->textIter);

  4677     }

  4679     int32_t start = -1;

  4680     int32_t end = -1;

  4682     if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) {

  4683         strsrch->search->matchedIndex = start;

  4684         strsrch->search->matchedLength = end - start;

  4685         return TRUE;

  4686     } else {

  4687         setMatchNotFound(strsrch);

  4688         return FALSE;

  4689     }

  4690 #endif

  4691 }

  4693 UBool usearch_handlePreviousCanonical(UStringSearch *strsrch,

  4694                                       UErrorCode    *status)

  4695 {

  4696     if (U_FAILURE(*status)) {

  4697         setMatchNotFound(strsrch);

  4698         return FALSE;

  4699     }

  4701 #if BOYER_MOORE

  4702     UCollationElements *coleiter        = strsrch->textIter;

  4703     int32_t            *patternce       = strsrch->pattern.CE;

  4704     int32_t             patterncelength = strsrch->pattern.CELength;

  4705     int32_t             textoffset      = ucol_getOffset(coleiter);

  4706     UBool               hasPatternAccents =

  4707        strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents;

  4709     // shifting it check for setting offset

  4710     // if setOffset is called previously or there was no previous match, we

  4711     // leave the offset as it is.

  4712     if (strsrch->search->matchedIndex != USEARCH_DONE) {

  4713         textoffset = strsrch->search->matchedIndex;

  4714     }

  4716     textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,

  4717                               patterncelength);

  4718     strsrch->canonicalPrefixAccents[0] = 0;

  4719     strsrch->canonicalSuffixAccents[0] = 0;

  4721     while (textoffset >= 0)

  4722     {

  4723         int32_t     patternceindex = 1;

  4724         int32_t     targetce;

  4725         UBool       found          = FALSE;

  4726         int32_t     firstce        = UCOL_NULLORDER;

  4728         setColEIterOffset(coleiter, textoffset);

  4729         for (;;) {

  4730             // finding the first pattern ce match, imagine composite

  4731             // characters. for example: search for pattern \u0300 in text

  4732             // \u00C0, we'll have to skip A first before we get to

  4733             // \u0300 the grave accent

  4734             targetce = ucol_next(coleiter, status);

  4735             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {

  4736                 found = FALSE;

  4737                 break;

  4738             }

  4739             targetce = getCE(strsrch, targetce);

  4740             if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) {

  4741                 firstce = targetce;

  4742             }

  4744             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s

  4745             if (targetce == patternce[0]) {

  4746                 // the first ce can be a contraction

  4747                 found = TRUE;

  4748                 break;

  4749             }

  4750             if (!hasExpansion(coleiter)) {

  4751                 // checking for accents in composite character

  4752                 found = FALSE;

  4753                 break;

  4754             }

  4755         }

  4757         targetce = firstce;

  4759         while (found && patternceindex < patterncelength) {

  4760             targetce    = ucol_next(coleiter, status);

  4761             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {

  4762                 found = FALSE;

  4763                 break;

  4764             }

  4765             targetce = getCE(strsrch, targetce);

  4766             if (targetce == UCOL_IGNORABLE) {

  4767                 continue;

  4768             }

  4770             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s

  4771             found = found && targetce == patternce[patternceindex];

  4772             patternceindex ++;

  4773         }

  4775         // initializing the rearranged accent array

  4776         if (hasPatternAccents && !found) {

  4777             strsrch->canonicalPrefixAccents[0] = 0;

  4778             strsrch->canonicalSuffixAccents[0] = 0;

  4779             if (U_FAILURE(*status)) {

  4780                 break;

  4781             }

  4782             found = doPreviousCanonicalMatch(strsrch, textoffset, status);

  4783         }

  4785         if (!found) {

  4786             if (U_FAILURE(*status)) {

  4787                 break;

  4788             }

  4789             textoffset = reverseShift(strsrch, textoffset, targetce,

  4790                                       patternceindex);

  4791             patternceindex = 0;

  4792             continue;

  4793         }

  4795         if (checkPreviousCanonicalMatch(strsrch, &textoffset, status)) {

  4796             setColEIterOffset(coleiter, textoffset);

  4797             return TRUE;

  4798         }

  4799     }

  4800     setMatchNotFound(strsrch);

  4801     return FALSE;

  4802 #else

  4803     int32_t textOffset;

  4805     if (strsrch->search->isOverlap) {

  4806         if (strsrch->search->matchedIndex != USEARCH_DONE) {

  4807             textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - 1;

  4808         } else {

  4809             // move the start position at the end of possible match

  4810             initializePatternPCETable(strsrch, status);

  4811             for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.PCELength - 1; nPCEs++) {

  4812                 int64_t pce = ucol_nextProcessed(strsrch->textIter, NULL, NULL, status);

  4813                 if (pce == UCOL_PROCESSED_NULLORDER) {

  4814                     // at the end of the text

  4815                     break;

  4816                 }

  4817             }

  4818             if (U_FAILURE(*status)) {

  4819                 setMatchNotFound(strsrch);

  4820                 return FALSE;

  4821             }

  4822             textOffset = ucol_getOffset(strsrch->textIter);

  4823         }

  4824     } else {

  4825         textOffset = ucol_getOffset(strsrch->textIter);

  4826     }

  4828     int32_t start = -1;

  4829     int32_t end = -1;

  4831     if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) {

  4832         strsrch->search->matchedIndex = start;

  4833         strsrch->search->matchedLength = end - start;

  4834         return TRUE;

  4835     } else {

  4836         setMatchNotFound(strsrch);

  4837         return FALSE;

  4838     }

  4839 #endif

  4840 }

  4842 #endif /* #if !UCONFIG_NO_COLLATION */

The Tor Browser / file revision

intl/icu/source/i18n/usearch.cpp@6474c204b198

intl/icu/source/i18n/usearch.cpp