intl/icu/source/i18n/usearch.cpp

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

     1 /*
     2 **********************************************************************
     3 *   Copyright (C) 2001-2011 IBM and others. All rights reserved.
     4 **********************************************************************
     5 *   Date        Name        Description
     6 *  07/02/2001   synwee      Creation.
     7 **********************************************************************
     8 */
    10 #include "unicode/utypes.h"
    12 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION
    14 #include "unicode/usearch.h"
    15 #include "unicode/ustring.h"
    16 #include "unicode/uchar.h"
    17 #include "unicode/utf16.h"
    18 #include "normalizer2impl.h"
    19 #include "ucol_imp.h"
    20 #include "usrchimp.h"
    21 #include "cmemory.h"
    22 #include "ucln_in.h"
    23 #include "uassert.h"
    24 #include "ustr_imp.h"
    26 U_NAMESPACE_USE
    28 // don't use Boyer-Moore
    29 // (and if we decide to turn this on again there are several new TODOs that will need to be addressed)
    30 #define BOYER_MOORE 0
    32 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
    34 // internal definition ---------------------------------------------------
    36 #define LAST_BYTE_MASK_          0xFF
    37 #define SECOND_LAST_BYTE_SHIFT_  8
    38 #define SUPPLEMENTARY_MIN_VALUE_ 0x10000
    40 static const Normalizer2Impl *g_nfcImpl = NULL;
    42 // internal methods -------------------------------------------------
    44 /**
    45 * Fast collation element iterator setOffset.
    46 * This function does not check for bounds.
    47 * @param coleiter collation element iterator
    48 * @param offset to set
    49 */
    50 static
    51 inline void setColEIterOffset(UCollationElements *elems,
    52                       int32_t             offset)
    53 {
    54     collIterate *ci = &(elems->iteratordata_);
    55     ci->pos         = ci->string + offset;
    56     ci->CEpos       = ci->toReturn = ci->extendCEs ? ci->extendCEs : ci->CEs;
    57     if (ci->flags & UCOL_ITER_INNORMBUF) {
    58         ci->flags = ci->origFlags;
    59     }
    60     ci->fcdPosition = NULL;
    62     ci->offsetReturn = NULL;
    63     ci->offsetStore  = ci->offsetBuffer;
    64     ci->offsetRepeatCount = ci->offsetRepeatValue = 0;
    65 }
    67 /**
    68 * Getting the mask for collation strength
    69 * @param strength collation strength
    70 * @return collation element mask
    71 */
    72 static
    73 inline uint32_t getMask(UCollationStrength strength)
    74 {
    75     switch (strength)
    76     {
    77     case UCOL_PRIMARY:
    78         return UCOL_PRIMARYORDERMASK;
    79     case UCOL_SECONDARY:
    80         return UCOL_SECONDARYORDERMASK | UCOL_PRIMARYORDERMASK;
    81     default:
    82         return UCOL_TERTIARYORDERMASK | UCOL_SECONDARYORDERMASK |
    83                UCOL_PRIMARYORDERMASK;
    84     }
    85 }
    87 /**
    88 * This is to squeeze the 21bit ces into a 256 table
    89 * @param ce collation element
    90 * @return collapsed version of the collation element
    91 */
    92 static
    93 inline int hash(uint32_t ce)
    94 {
    95     // the old value UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_ does not work
    96     // well with the new collation where most of the latin 1 characters
    97     // are of the value xx000xxx. their hashes will most of the time be 0
    98     // to be discussed on the hash algo.
    99     return UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_;
   100 }
   102 U_CDECL_BEGIN
   103 static UBool U_CALLCONV
   104 usearch_cleanup(void) {
   105     g_nfcImpl = NULL;
   106     return TRUE;
   107 }
   108 U_CDECL_END
   110 /**
   111 * Initializing the fcd tables.
   112 * Internal method, status assumed to be a success.
   113 * @param status output error if any, caller to check status before calling
   114 *               method, status assumed to be success when passed in.
   115 */
   116 static
   117 inline void initializeFCD(UErrorCode *status)
   118 {
   119     if (g_nfcImpl == NULL) {
   120         g_nfcImpl = Normalizer2Factory::getNFCImpl(*status);
   121         ucln_i18n_registerCleanup(UCLN_I18N_USEARCH, usearch_cleanup);
   122     }
   123 }
   125 /**
   126 * Gets the fcd value for a character at the argument index.
   127 * This method takes into accounts of the supplementary characters.
   128 * @param str UTF16 string where character for fcd retrieval resides
   129 * @param offset position of the character whose fcd is to be retrieved, to be
   130 *               overwritten with the next character position, taking
   131 *               surrogate characters into consideration.
   132 * @param strlength length of the argument string
   133 * @return fcd value
   134 */
   135 static
   136 uint16_t getFCD(const UChar   *str, int32_t *offset,
   137                              int32_t  strlength)
   138 {
   139     const UChar *temp = str + *offset;
   140     uint16_t    result = g_nfcImpl->nextFCD16(temp, str + strlength);
   141     *offset = (int32_t)(temp - str);
   142     return result;
   143 }
   145 /**
   146 * Getting the modified collation elements taking into account the collation
   147 * attributes
   148 * @param strsrch string search data
   149 * @param sourcece
   150 * @return the modified collation element
   151 */
   152 static
   153 inline int32_t getCE(const UStringSearch *strsrch, uint32_t sourcece)
   154 {
   155     // note for tertiary we can't use the collator->tertiaryMask, that
   156     // is a preprocessed mask that takes into account case options. since
   157     // we are only concerned with exact matches, we don't need that.
   158     sourcece &= strsrch->ceMask;
   160     if (strsrch->toShift) {
   161         // alternate handling here, since only the 16 most significant digits
   162         // is only used, we can safely do a compare without masking
   163         // if the ce is a variable, we mask and get only the primary values
   164         // no shifting to quartenary is required since all primary values
   165         // less than variabletop will need to be masked off anyway.
   166         if (strsrch->variableTop > sourcece) {
   167             if (strsrch->strength >= UCOL_QUATERNARY) {
   168                 sourcece &= UCOL_PRIMARYORDERMASK;
   169             }
   170             else {
   171                 sourcece = UCOL_IGNORABLE;
   172             }
   173         }
   174     } else if (strsrch->strength >= UCOL_QUATERNARY && sourcece == UCOL_IGNORABLE) {
   175         sourcece = 0xFFFF;
   176     }
   178     return sourcece;
   179 }
   181 /**
   182 * Allocate a memory and returns NULL if it failed.
   183 * Internal method, status assumed to be a success.
   184 * @param size to allocate
   185 * @param status output error if any, caller to check status before calling
   186 *               method, status assumed to be success when passed in.
   187 * @return newly allocated array, NULL otherwise
   188 */
   189 static
   190 inline void * allocateMemory(uint32_t size, UErrorCode *status)
   191 {
   192     uint32_t *result = (uint32_t *)uprv_malloc(size);
   193     if (result == NULL) {
   194         *status = U_MEMORY_ALLOCATION_ERROR;
   195     }
   196     return result;
   197 }
   199 /**
   200 * Adds a uint32_t value to a destination array.
   201 * Creates a new array if we run out of space. The caller will have to
   202 * manually deallocate the newly allocated array.
   203 * Internal method, status assumed to be success, caller has to check status
   204 * before calling this method. destination not to be NULL and has at least
   205 * size destinationlength.
   206 * @param destination target array
   207 * @param offset destination offset to add value
   208 * @param destinationlength target array size, return value for the new size
   209 * @param value to be added
   210 * @param increments incremental size expected
   211 * @param status output error if any, caller to check status before calling
   212 *               method, status assumed to be success when passed in.
   213 * @return new destination array, destination if there was no new allocation
   214 */
   215 static
   216 inline int32_t * addTouint32_tArray(int32_t    *destination,
   217                                     uint32_t    offset,
   218                                     uint32_t   *destinationlength,
   219                                     uint32_t    value,
   220                                     uint32_t    increments,
   221                                     UErrorCode *status)
   222 {
   223     uint32_t newlength = *destinationlength;
   224     if (offset + 1 == newlength) {
   225         newlength += increments;
   226         int32_t *temp = (int32_t *)allocateMemory(
   227                                          sizeof(int32_t) * newlength, status);
   228         if (U_FAILURE(*status)) {
   229             return NULL;
   230         }
   231         uprv_memcpy(temp, destination, sizeof(int32_t) * offset);
   232         *destinationlength = newlength;
   233         destination        = temp;
   234     }
   235     destination[offset] = value;
   236     return destination;
   237 }
   239 /**
   240 * Adds a uint64_t value to a destination array.
   241 * Creates a new array if we run out of space. The caller will have to
   242 * manually deallocate the newly allocated array.
   243 * Internal method, status assumed to be success, caller has to check status
   244 * before calling this method. destination not to be NULL and has at least
   245 * size destinationlength.
   246 * @param destination target array
   247 * @param offset destination offset to add value
   248 * @param destinationlength target array size, return value for the new size
   249 * @param value to be added
   250 * @param increments incremental size expected
   251 * @param status output error if any, caller to check status before calling
   252 *               method, status assumed to be success when passed in.
   253 * @return new destination array, destination if there was no new allocation
   254 */
   255 static
   256 inline int64_t * addTouint64_tArray(int64_t    *destination,
   257                                     uint32_t    offset,
   258                                     uint32_t   *destinationlength,
   259                                     uint64_t    value,
   260                                     uint32_t    increments,
   261                                     UErrorCode *status)
   262 {
   263     uint32_t newlength = *destinationlength;
   264     if (offset + 1 == newlength) {
   265         newlength += increments;
   266         int64_t *temp = (int64_t *)allocateMemory(
   267                                          sizeof(int64_t) * newlength, status);
   269         if (U_FAILURE(*status)) {
   270             return NULL;
   271         }
   273         uprv_memcpy(temp, destination, sizeof(int64_t) * offset);
   274         *destinationlength = newlength;
   275         destination        = temp;
   276     }
   278     destination[offset] = value;
   280     return destination;
   281 }
   283 /**
   284 * Initializing the ce table for a pattern.
   285 * Stores non-ignorable collation keys.
   286 * Table size will be estimated by the size of the pattern text. Table
   287 * expansion will be perform as we go along. Adding 1 to ensure that the table
   288 * size definitely increases.
   289 * Internal method, status assumed to be a success.
   290 * @param strsrch string search data
   291 * @param status output error if any, caller to check status before calling
   292 *               method, status assumed to be success when passed in.
   293 * @return total number of expansions
   294 */
   295 static
   296 inline uint16_t initializePatternCETable(UStringSearch *strsrch,
   297                                          UErrorCode    *status)
   298 {
   299     UPattern *pattern            = &(strsrch->pattern);
   300     uint32_t  cetablesize        = INITIAL_ARRAY_SIZE_;
   301     int32_t  *cetable            = pattern->CEBuffer;
   302     uint32_t  patternlength      = pattern->textLength;
   303     UCollationElements *coleiter = strsrch->utilIter;
   305     if (coleiter == NULL) {
   306         coleiter = ucol_openElements(strsrch->collator, pattern->text,
   307                                      patternlength, status);
   308         // status will be checked in ucol_next(..) later and if it is an
   309         // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
   310         // returned.
   311         strsrch->utilIter = coleiter;
   312     }
   313     else {
   314         uprv_init_collIterate(strsrch->collator, pattern->text,
   315                          pattern->textLength,
   316                          &coleiter->iteratordata_,
   317                          status);
   318     }
   319     if(U_FAILURE(*status)) {
   320         return 0;
   321     }
   323     if (pattern->CE != cetable && pattern->CE) {
   324         uprv_free(pattern->CE);
   325     }
   327     uint16_t  offset      = 0;
   328     uint16_t  result      = 0;
   329     int32_t   ce;
   331     while ((ce = ucol_next(coleiter, status)) != UCOL_NULLORDER &&
   332            U_SUCCESS(*status)) {
   333         uint32_t newce = getCE(strsrch, ce);
   334         if (newce) {
   335             int32_t *temp = addTouint32_tArray(cetable, offset, &cetablesize,
   336                                   newce,
   337                                   patternlength - ucol_getOffset(coleiter) + 1,
   338                                   status);
   339             if (U_FAILURE(*status)) {
   340                 return 0;
   341             }
   342             offset ++;
   343             if (cetable != temp && cetable != pattern->CEBuffer) {
   344                 uprv_free(cetable);
   345             }
   346             cetable = temp;
   347         }
   348         result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
   349     }
   351     cetable[offset]   = 0;
   352     pattern->CE       = cetable;
   353     pattern->CELength = offset;
   355     return result;
   356 }
   358 /**
   359 * Initializing the pce table for a pattern.
   360 * Stores non-ignorable collation keys.
   361 * Table size will be estimated by the size of the pattern text. Table
   362 * expansion will be perform as we go along. Adding 1 to ensure that the table
   363 * size definitely increases.
   364 * Internal method, status assumed to be a success.
   365 * @param strsrch string search data
   366 * @param status output error if any, caller to check status before calling
   367 *               method, status assumed to be success when passed in.
   368 * @return total number of expansions
   369 */
   370 static
   371 inline uint16_t initializePatternPCETable(UStringSearch *strsrch,
   372                                           UErrorCode    *status)
   373 {
   374     UPattern *pattern            = &(strsrch->pattern);
   375     uint32_t  pcetablesize       = INITIAL_ARRAY_SIZE_;
   376     int64_t  *pcetable           = pattern->PCEBuffer;
   377     uint32_t  patternlength      = pattern->textLength;
   378     UCollationElements *coleiter = strsrch->utilIter;
   380     if (coleiter == NULL) {
   381         coleiter = ucol_openElements(strsrch->collator, pattern->text,
   382                                      patternlength, status);
   383         // status will be checked in ucol_next(..) later and if it is an
   384         // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
   385         // returned.
   386         strsrch->utilIter = coleiter;
   387     } else {
   388         uprv_init_collIterate(strsrch->collator, pattern->text,
   389                               pattern->textLength,
   390                               &coleiter->iteratordata_,
   391                               status);
   392     }
   393     if(U_FAILURE(*status)) {
   394         return 0;
   395     }
   397     if (pattern->PCE != pcetable && pattern->PCE != NULL) {
   398         uprv_free(pattern->PCE);
   399     }
   401     uint16_t  offset = 0;
   402     uint16_t  result = 0;
   403     int64_t   pce;
   405     uprv_init_pce(coleiter);
   407     // ** Should processed CEs be signed or unsigned?
   408     // ** (the rest of the code in this file seems to play fast-and-loose with
   409     // **  whether a CE is signed or unsigned. For example, look at routine above this one.)
   410     while ((pce = ucol_nextProcessed(coleiter, NULL, NULL, status)) != UCOL_PROCESSED_NULLORDER &&
   411            U_SUCCESS(*status)) {
   412         int64_t *temp = addTouint64_tArray(pcetable, offset, &pcetablesize,
   413                               pce,
   414                               patternlength - ucol_getOffset(coleiter) + 1,
   415                               status);
   417         if (U_FAILURE(*status)) {
   418             return 0;
   419         }
   421         offset += 1;
   423         if (pcetable != temp && pcetable != pattern->PCEBuffer) {
   424             uprv_free(pcetable);
   425         }
   427         pcetable = temp;
   428         //result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
   429     }
   431     pcetable[offset]   = 0;
   432     pattern->PCE       = pcetable;
   433     pattern->PCELength = offset;
   435     return result;
   436 }
   438 /**
   439 * Initializes the pattern struct.
   440 * Internal method, status assumed to be success.
   441 * @param strsrch UStringSearch data storage
   442 * @param status output error if any, caller to check status before calling
   443 *               method, status assumed to be success when passed in.
   444 * @return expansionsize the total expansion size of the pattern
   445 */
   446 static
   447 inline int16_t initializePattern(UStringSearch *strsrch, UErrorCode *status)
   448 {
   449           UPattern   *pattern     = &(strsrch->pattern);
   450     const UChar      *patterntext = pattern->text;
   451           int32_t     length      = pattern->textLength;
   452           int32_t index       = 0;
   454     // Since the strength is primary, accents are ignored in the pattern.
   455     if (strsrch->strength == UCOL_PRIMARY) {
   456         pattern->hasPrefixAccents = 0;
   457         pattern->hasSuffixAccents = 0;
   458     } else {
   459         pattern->hasPrefixAccents = getFCD(patterntext, &index, length) >>
   460                                                          SECOND_LAST_BYTE_SHIFT_;
   461         index = length;
   462         U16_BACK_1(patterntext, 0, index);
   463         pattern->hasSuffixAccents = getFCD(patterntext, &index, length) &
   464                                                                  LAST_BYTE_MASK_;
   465     }
   467     // ** HACK **
   468     if (strsrch->pattern.PCE != NULL) {
   469         if (strsrch->pattern.PCE != strsrch->pattern.PCEBuffer) {
   470             uprv_free(strsrch->pattern.PCE);
   471         }
   473         strsrch->pattern.PCE = NULL;
   474     }
   476     // since intializePattern is an internal method status is a success.
   477     return initializePatternCETable(strsrch, status);
   478 }
   480 /**
   481 * Initializing shift tables, with the default values.
   482 * If a corresponding default value is 0, the shift table is not set.
   483 * @param shift table for forwards shift
   484 * @param backshift table for backwards shift
   485 * @param cetable table containing pattern ce
   486 * @param cesize size of the pattern ces
   487 * @param expansionsize total size of the expansions
   488 * @param defaultforward the default forward value
   489 * @param defaultbackward the default backward value
   490 */
   491 static
   492 inline void setShiftTable(int16_t   shift[], int16_t backshift[],
   493                           int32_t  *cetable, int32_t cesize,
   494                           int16_t   expansionsize,
   495                           int16_t   defaultforward,
   496                           int16_t   defaultbackward)
   497 {
   498     // estimate the value to shift. to do that we estimate the smallest
   499     // number of characters to give the relevant ces, ie approximately
   500     // the number of ces minus their expansion, since expansions can come
   501     // from a character.
   502     int32_t count;
   503     for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
   504         shift[count] = defaultforward;
   505     }
   506     cesize --; // down to the last index
   507     for (count = 0; count < cesize; count ++) {
   508         // number of ces from right of array to the count
   509         int temp = defaultforward - count - 1;
   510         shift[hash(cetable[count])] = temp > 1 ? temp : 1;
   511     }
   512     shift[hash(cetable[cesize])] = 1;
   513     // for ignorables we just shift by one. see test examples.
   514     shift[hash(0)] = 1;
   516     for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
   517         backshift[count] = defaultbackward;
   518     }
   519     for (count = cesize; count > 0; count --) {
   520         // the original value count does not seem to work
   521         backshift[hash(cetable[count])] = count > expansionsize ?
   522                                           (int16_t)(count - expansionsize) : 1;
   523     }
   524     backshift[hash(cetable[0])] = 1;
   525     backshift[hash(0)] = 1;
   526 }
   528 /**
   529 * Building of the pattern collation element list and the boyer moore strsrch
   530 * table.
   531 * The canonical match will only be performed after the default match fails.
   532 * For both cases we need to remember the size of the composed and decomposed
   533 * versions of the string. Since the Boyer-Moore shift calculations shifts by
   534 * a number of characters in the text and tries to match the pattern from that
   535 * offset, the shift value can not be too large in case we miss some
   536 * characters. To choose a right shift size, we estimate the NFC form of the
   537 * and use its size as a shift guide. The NFC form should be the small
   538 * possible representation of the pattern. Anyways, we'll err on the smaller
   539 * shift size. Hence the calculation for minlength.
   540 * Canonical match will be performed slightly differently. We'll split the
   541 * pattern into 3 parts, the prefix accents (PA), the middle string bounded by
   542 * the first and last base character (MS), the ending accents (EA). Matches
   543 * will be done on MS first, and only when we match MS then some processing
   544 * will be required for the prefix and end accents in order to determine if
   545 * they match PA and EA. Hence the default shift values
   546 * for the canonical match will take the size of either end's accent into
   547 * consideration. Forwards search will take the end accents into consideration
   548 * for the default shift values and the backwards search will take the prefix
   549 * accents into consideration.
   550 * If pattern has no non-ignorable ce, we return a illegal argument error.
   551 * Internal method, status assumed to be success.
   552 * @param strsrch UStringSearch data storage
   553 * @param status  for output errors if it occurs, status is assumed to be a
   554 *                success when it is passed in.
   555 */
   556 static
   557 inline void initialize(UStringSearch *strsrch, UErrorCode *status)
   558 {
   559     int16_t expandlength  = initializePattern(strsrch, status);
   560     if (U_SUCCESS(*status) && strsrch->pattern.CELength > 0) {
   561         UPattern *pattern = &strsrch->pattern;
   562         int32_t   cesize  = pattern->CELength;
   564         int16_t minlength = cesize > expandlength
   565                             ? (int16_t)cesize - expandlength : 1;
   566         pattern->defaultShiftSize    = minlength;
   567         setShiftTable(pattern->shift, pattern->backShift, pattern->CE,
   568                       cesize, expandlength, minlength, minlength);
   569         return;
   570     }
   571     strsrch->pattern.defaultShiftSize = 0;
   572 }
   574 #if BOYER_MOORE
   575 /**
   576 * Check to make sure that the match length is at the end of the character by
   577 * using the breakiterator.
   578 * @param strsrch string search data
   579 * @param start target text start offset
   580 * @param end target text end offset
   581 */
   582 static
   583 void checkBreakBoundary(const UStringSearch *strsrch, int32_t * /*start*/,
   584                                int32_t *end)
   585 {
   586 #if !UCONFIG_NO_BREAK_ITERATION
   587     UBreakIterator *breakiterator = strsrch->search->internalBreakIter;
   588     if (breakiterator) {
   589         int32_t matchend = *end;
   590         //int32_t matchstart = *start;
   592         if (!ubrk_isBoundary(breakiterator, matchend)) {
   593             *end = ubrk_following(breakiterator, matchend);
   594         }
   596         /* Check the start of the matched text to make sure it doesn't have any accents
   597          * before it.  This code may not be necessary and so it is commented out */
   598         /*if (!ubrk_isBoundary(breakiterator, matchstart) && !ubrk_isBoundary(breakiterator, matchstart-1)) {
   599             *start = ubrk_preceding(breakiterator, matchstart);
   600         }*/
   601     }
   602 #endif
   603 }
   605 /**
   606 * Determine whether the target text in UStringSearch bounded by the offset
   607 * start and end is one or more whole units of text as
   608 * determined by the breakiterator in UStringSearch.
   609 * @param strsrch string search data
   610 * @param start target text start offset
   611 * @param end target text end offset
   612 */
   613 static
   614 UBool isBreakUnit(const UStringSearch *strsrch, int32_t start,
   615                                int32_t    end)
   616 {
   617 #if !UCONFIG_NO_BREAK_ITERATION
   618     UBreakIterator *breakiterator = strsrch->search->breakIter;
   619     //TODO: Add here.
   620     if (breakiterator) {
   621         int32_t startindex = ubrk_first(breakiterator);
   622         int32_t endindex   = ubrk_last(breakiterator);
   624         // out-of-range indexes are never boundary positions
   625         if (start < startindex || start > endindex ||
   626             end < startindex || end > endindex) {
   627             return FALSE;
   628         }
   629         // otherwise, we can use following() on the position before the
   630         // specified one and return true of the position we get back is the
   631         // one the user specified
   632         UBool result = (start == startindex ||
   633                 ubrk_following(breakiterator, start - 1) == start) &&
   634                (end == endindex ||
   635                 ubrk_following(breakiterator, end - 1) == end);
   636         if (result) {
   637             // iterates the individual ces
   638                   UCollationElements *coleiter  = strsrch->utilIter;
   639             const UChar              *text      = strsrch->search->text +
   640                                                                       start;
   641                   UErrorCode          status    = U_ZERO_ERROR;
   642             ucol_setText(coleiter, text, end - start, &status);
   643             for (int32_t count = 0; count < strsrch->pattern.CELength;
   644                  count ++) {
   645                 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
   646                 if (ce == UCOL_IGNORABLE) {
   647                     count --;
   648                     continue;
   649                 }
   650                 if (U_FAILURE(status) || ce != strsrch->pattern.CE[count]) {
   651                     return FALSE;
   652                 }
   653             }
   654             int32_t nextce = ucol_next(coleiter, &status);
   655             while (ucol_getOffset(coleiter) == (end - start)
   656                    && getCE(strsrch, nextce) == UCOL_IGNORABLE) {
   657                 nextce = ucol_next(coleiter, &status);
   658             }
   659             if (ucol_getOffset(coleiter) == (end - start)
   660                 && nextce != UCOL_NULLORDER) {
   661                 // extra collation elements at the end of the match
   662                 return FALSE;
   663             }
   664         }
   665         return result;
   666     }
   667 #endif
   668     return TRUE;
   669 }
   671 /**
   672 * Getting the next base character offset if current offset is an accent,
   673 * or the current offset if the current character contains a base character.
   674 * accents the following base character will be returned
   675 * @param text string
   676 * @param textoffset current offset
   677 * @param textlength length of text string
   678 * @return the next base character or the current offset
   679 *         if the current character is contains a base character.
   680 */
   681 static
   682 inline int32_t getNextBaseOffset(const UChar       *text,
   683                                            int32_t  textoffset,
   684                                            int32_t      textlength)
   685 {
   686     if (textoffset < textlength) {
   687         int32_t temp = textoffset;
   688         if (getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
   689             while (temp < textlength) {
   690                 int32_t result = temp;
   691                 if ((getFCD(text, &temp, textlength) >>
   692                      SECOND_LAST_BYTE_SHIFT_) == 0) {
   693                     return result;
   694                 }
   695             }
   696             return textlength;
   697         }
   698     }
   699     return textoffset;
   700 }
   702 /**
   703 * Gets the next base character offset depending on the string search pattern
   704 * data
   705 * @param strsrch string search data
   706 * @param textoffset current offset, one offset away from the last character
   707 *                   to search for.
   708 * @return start index of the next base character or the current offset
   709 *         if the current character is contains a base character.
   710 */
   711 static
   712 inline int32_t getNextUStringSearchBaseOffset(UStringSearch *strsrch,
   713                                                   int32_t    textoffset)
   714 {
   715     int32_t textlength = strsrch->search->textLength;
   716     if (strsrch->pattern.hasSuffixAccents &&
   717         textoffset < textlength) {
   718               int32_t  temp       = textoffset;
   719         const UChar       *text       = strsrch->search->text;
   720         U16_BACK_1(text, 0, temp);
   721         if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
   722             return getNextBaseOffset(text, textoffset, textlength);
   723         }
   724     }
   725     return textoffset;
   726 }
   728 /**
   729 * Shifting the collation element iterator position forward to prepare for
   730 * a following match. If the last character is a unsafe character, we'll only
   731 * shift by 1 to capture contractions, normalization etc.
   732 * Internal method, status assumed to be success.
   733 * @param text strsrch string search data
   734 * @param textoffset start text position to do search
   735 * @param ce the text ce which failed the match.
   736 * @param patternceindex index of the ce within the pattern ce buffer which
   737 *        failed the match
   738 * @return final offset
   739 */
   740 static
   741 inline int32_t shiftForward(UStringSearch *strsrch,
   742                                 int32_t    textoffset,
   743                                 int32_t       ce,
   744                                 int32_t        patternceindex)
   745 {
   746     UPattern *pattern = &(strsrch->pattern);
   747     if (ce != UCOL_NULLORDER) {
   748         int32_t shift = pattern->shift[hash(ce)];
   749         // this is to adjust for characters in the middle of the
   750         // substring for matching that failed.
   751         int32_t adjust = pattern->CELength - patternceindex;
   752         if (adjust > 1 && shift >= adjust) {
   753             shift -= adjust - 1;
   754         }
   755         textoffset += shift;
   756     }
   757     else {
   758         textoffset += pattern->defaultShiftSize;
   759     }
   761     textoffset = getNextUStringSearchBaseOffset(strsrch, textoffset);
   762     // check for unsafe characters
   763     // * if it is the start or middle of a contraction: to be done after
   764     //   a initial match is found
   765     // * thai or lao base consonant character: similar to contraction
   766     // * high surrogate character: similar to contraction
   767     // * next character is a accent: shift to the next base character
   768     return textoffset;
   769 }
   770 #endif // #if BOYER_MOORE
   772 /**
   773 * sets match not found
   774 * @param strsrch string search data
   775 */
   776 static
   777 inline void setMatchNotFound(UStringSearch *strsrch)
   778 {
   779     // this method resets the match result regardless of the error status.
   780     strsrch->search->matchedIndex = USEARCH_DONE;
   781     strsrch->search->matchedLength = 0;
   782     if (strsrch->search->isForwardSearching) {
   783         setColEIterOffset(strsrch->textIter, strsrch->search->textLength);
   784     }
   785     else {
   786         setColEIterOffset(strsrch->textIter, 0);
   787     }
   788 }
   790 #if BOYER_MOORE
   791 /**
   792 * Gets the offset to the next safe point in text.
   793 * ie. not the middle of a contraction, swappable characters or supplementary
   794 * characters.
   795 * @param collator collation sata
   796 * @param text string to work with
   797 * @param textoffset offset in string
   798 * @param textlength length of text string
   799 * @return offset to the next safe character
   800 */
   801 static
   802 inline int32_t getNextSafeOffset(const UCollator   *collator,
   803                                      const UChar       *text,
   804                                            int32_t  textoffset,
   805                                            int32_t      textlength)
   806 {
   807     int32_t result = textoffset; // first contraction character
   808     while (result != textlength && ucol_unsafeCP(text[result], collator)) {
   809         result ++;
   810     }
   811     return result;
   812 }
   814 /**
   815 * This checks for accents in the potential match started with a .
   816 * composite character.
   817 * This is really painful... we have to check that composite character do not
   818 * have any extra accents. We have to normalize the potential match and find
   819 * the immediate decomposed character before the match.
   820 * The first composite character would have been taken care of by the fcd
   821 * checks in checkForwardExactMatch.
   822 * This is the slow path after the fcd of the first character and
   823 * the last character has been checked by checkForwardExactMatch and we
   824 * determine that the potential match has extra non-ignorable preceding
   825 * ces.
   826 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
   827 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA
   828 * Note here that accents checking are slow and cautioned in the API docs.
   829 * Internal method, status assumed to be a success, caller should check status
   830 * before calling this method
   831 * @param strsrch string search data
   832 * @param start index of the potential unfriendly composite character
   833 * @param end index of the potential unfriendly composite character
   834 * @param status output error status if any.
   835 * @return TRUE if there is non-ignorable accents before at the beginning
   836 *              of the match, FALSE otherwise.
   837 */
   839 static
   840 UBool checkExtraMatchAccents(const UStringSearch *strsrch, int32_t start,
   841                                    int32_t    end,
   842                                    UErrorCode    *status)
   843 {
   844     UBool result = FALSE;
   845     if (strsrch->pattern.hasPrefixAccents) {
   846               int32_t  length = end - start;
   847               int32_t  offset = 0;
   848         const UChar       *text   = strsrch->search->text + start;
   850         U16_FWD_1(text, offset, length);
   851         // we are only concerned with the first composite character
   852         if (unorm_quickCheck(text, offset, UNORM_NFD, status) == UNORM_NO) {
   853             int32_t safeoffset = getNextSafeOffset(strsrch->collator,
   854                                                        text, 0, length);
   855             if (safeoffset != length) {
   856                 safeoffset ++;
   857             }
   858             UChar   *norm = NULL;
   859             UChar    buffer[INITIAL_ARRAY_SIZE_];
   860             int32_t  size = unorm_normalize(text, safeoffset, UNORM_NFD, 0,
   861                                             buffer, INITIAL_ARRAY_SIZE_,
   862                                             status);
   863             if (U_FAILURE(*status)) {
   864                 return FALSE;
   865             }
   866             if (size >= INITIAL_ARRAY_SIZE_) {
   867                 norm = (UChar *)allocateMemory((size + 1) * sizeof(UChar),
   868                                                status);
   869                 // if allocation failed, status will be set to
   870                 // U_MEMORY_ALLOCATION_ERROR and unorm_normalize internally
   871                 // checks for it.
   872                 size = unorm_normalize(text, safeoffset, UNORM_NFD, 0, norm,
   873                                        size, status);
   874                 if (U_FAILURE(*status) && norm != NULL) {
   875                     uprv_free(norm);
   876                     return FALSE;
   877                 }
   878             }
   879             else {
   880                 norm = buffer;
   881             }
   883             UCollationElements *coleiter  = strsrch->utilIter;
   884             ucol_setText(coleiter, norm, size, status);
   885             uint32_t            firstce   = strsrch->pattern.CE[0];
   886             UBool               ignorable = TRUE;
   887             uint32_t            ce        = UCOL_IGNORABLE;
   888             while (U_SUCCESS(*status) && ce != firstce && ce != (uint32_t)UCOL_NULLORDER) {
   889                 offset = ucol_getOffset(coleiter);
   890                 if (ce != firstce && ce != UCOL_IGNORABLE) {
   891                     ignorable = FALSE;
   892                 }
   893                 ce = ucol_next(coleiter, status);
   894             }
   895             UChar32 codepoint;
   896             U16_PREV(norm, 0, offset, codepoint);
   897             result = !ignorable && (u_getCombiningClass(codepoint) != 0);
   899             if (norm != buffer) {
   900                 uprv_free(norm);
   901             }
   902         }
   903     }
   905     return result;
   906 }
   908 /**
   909 * Used by exact matches, checks if there are accents before the match.
   910 * This is really painful... we have to check that composite characters at
   911 * the start of the matches have to not have any extra accents.
   912 * We check the FCD of the character first, if it starts with an accent and
   913 * the first pattern ce does not match the first ce of the character, we bail.
   914 * Otherwise we try normalizing the first composite
   915 * character and find the immediate decomposed character before the match to
   916 * see if it is an non-ignorable accent.
   917 * Now normalizing the first composite character is enough because we ensure
   918 * that when the match is passed in here with extra beginning ces, the
   919 * first or last ce that match has to occur within the first character.
   920 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
   921 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA
   922 * Note here that accents checking are slow and cautioned in the API docs.
   923 * @param strsrch string search data
   924 * @param start offset
   925 * @param end offset
   926 * @return TRUE if there are accents on either side of the match,
   927 *         FALSE otherwise
   928 */
   929 static
   930 UBool hasAccentsBeforeMatch(const UStringSearch *strsrch, int32_t start,
   931                                   int32_t    end)
   932 {
   933     if (strsrch->pattern.hasPrefixAccents) {
   934         UCollationElements *coleiter  = strsrch->textIter;
   935         UErrorCode          status    = U_ZERO_ERROR;
   936         // we have been iterating forwards previously
   937         uint32_t            ignorable = TRUE;
   938         int32_t             firstce   = strsrch->pattern.CE[0];
   940         setColEIterOffset(coleiter, start);
   941         int32_t ce  = getCE(strsrch, ucol_next(coleiter, &status));
   942         if (U_FAILURE(status)) {
   943             return TRUE;
   944         }
   945         while (ce != firstce) {
   946             if (ce != UCOL_IGNORABLE) {
   947                 ignorable = FALSE;
   948             }
   949             ce = getCE(strsrch, ucol_next(coleiter, &status));
   950             if (U_FAILURE(status) || ce == UCOL_NULLORDER) {
   951                 return TRUE;
   952             }
   953         }
   954         if (!ignorable && inNormBuf(coleiter)) {
   955             // within normalization buffer, discontiguous handled here
   956             return TRUE;
   957         }
   959         // within text
   960         int32_t temp = start;
   961         // original code
   962         // accent = (getFCD(strsrch->search->text, &temp,
   963         //                  strsrch->search->textLength)
   964         //            >> SECOND_LAST_BYTE_SHIFT_);
   965         // however this code does not work well with VC7 .net in release mode.
   966         // maybe the inlines for getFCD combined with shifting has bugs in
   967         // VC7. anyways this is a work around.
   968         UBool accent = getFCD(strsrch->search->text, &temp,
   969                               strsrch->search->textLength) > 0xFF;
   970         if (!accent) {
   971             return checkExtraMatchAccents(strsrch, start, end, &status);
   972         }
   973         if (!ignorable) {
   974             return TRUE;
   975         }
   976         if (start > 0) {
   977             temp = start;
   978             U16_BACK_1(strsrch->search->text, 0, temp);
   979             if (getFCD(strsrch->search->text, &temp,
   980                        strsrch->search->textLength) & LAST_BYTE_MASK_) {
   981                 setColEIterOffset(coleiter, start);
   982                 ce = ucol_previous(coleiter, &status);
   983                 if (U_FAILURE(status) ||
   984                     (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE)) {
   985                     return TRUE;
   986                 }
   987             }
   988         }
   989     }
   991     return FALSE;
   992 }
   994 /**
   995 * Used by exact matches, checks if there are accents bounding the match.
   996 * Note this is the initial boundary check. If the potential match
   997 * starts or ends with composite characters, the accents in those
   998 * characters will be determined later.
   999 * Not doing backwards iteration here, since discontiguos contraction for
  1000 * backwards collation element iterator, use up too many characters.
  1001 * E.g. looking for \u030A ring in \u01FA A ring above and acute,
  1002 * should fail since there is a acute at the end of \u01FA
  1003 * Note here that accents checking are slow and cautioned in the API docs.
  1004 * @param strsrch string search data
  1005 * @param start offset of match
  1006 * @param end end offset of the match
  1007 * @return TRUE if there are accents on either side of the match,
  1008 *         FALSE otherwise
  1009 */
  1010 static
  1011 UBool hasAccentsAfterMatch(const UStringSearch *strsrch, int32_t start,
  1012                                  int32_t    end)
  1014     if (strsrch->pattern.hasSuffixAccents) {
  1015         const UChar       *text       = strsrch->search->text;
  1016               int32_t  temp       = end;
  1017               int32_t      textlength = strsrch->search->textLength;
  1018         U16_BACK_1(text, 0, temp);
  1019         if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
  1020             int32_t             firstce  = strsrch->pattern.CE[0];
  1021             UCollationElements *coleiter = strsrch->textIter;
  1022             UErrorCode          status   = U_ZERO_ERROR;
  1023             int32_t ce;
  1024             setColEIterOffset(coleiter, start);
  1025             while ((ce = getCE(strsrch, ucol_next(coleiter, &status))) != firstce) {
  1026                 if (U_FAILURE(status) || ce == UCOL_NULLORDER) {
  1027                     return TRUE;
  1030             int32_t count = 1;
  1031             while (count < strsrch->pattern.CELength) {
  1032                 if (getCE(strsrch, ucol_next(coleiter, &status))
  1033                     == UCOL_IGNORABLE) {
  1034                     // Thai can give an ignorable here.
  1035                     count --;
  1037                 if (U_FAILURE(status)) {
  1038                     return TRUE;
  1040                 count ++;
  1043             ce = ucol_next(coleiter, &status);
  1044             if (U_FAILURE(status)) {
  1045                 return TRUE;
  1047             if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {
  1048                 ce = getCE(strsrch, ce);
  1050             if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {
  1051                 if (ucol_getOffset(coleiter) <= end) {
  1052                     return TRUE;
  1054                 if (getFCD(text, &end, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
  1055                     return TRUE;
  1060     return FALSE;
  1062 #endif // #if BOYER_MOORE
  1064 /**
  1065 * Checks if the offset runs out of the text string
  1066 * @param offset
  1067 * @param textlength of the text string
  1068 * @return TRUE if offset is out of bounds, FALSE otherwise
  1069 */
  1070 static
  1071 inline UBool isOutOfBounds(int32_t textlength, int32_t offset)
  1073     return offset < 0 || offset > textlength;
  1076 /**
  1077 * Checks for identical match
  1078 * @param strsrch string search data
  1079 * @param start offset of possible match
  1080 * @param end offset of possible match
  1081 * @return TRUE if identical match is found
  1082 */
  1083 static
  1084 inline UBool checkIdentical(const UStringSearch *strsrch, int32_t start,
  1085                                   int32_t    end)
  1087     if (strsrch->strength != UCOL_IDENTICAL) {
  1088         return TRUE;
  1091     // Note: We could use Normalizer::compare() or similar, but for short strings
  1092     // which may not be in FCD it might be faster to just NFD them.
  1093     UErrorCode status = U_ZERO_ERROR;
  1094     UnicodeString t2, p2;
  1095     strsrch->nfd->normalize(
  1096         UnicodeString(FALSE, strsrch->search->text + start, end - start), t2, status);
  1097     strsrch->nfd->normalize(
  1098         UnicodeString(FALSE, strsrch->pattern.text, strsrch->pattern.textLength), p2, status);
  1099     // return FALSE if NFD failed
  1100     return U_SUCCESS(status) && t2 == p2;
  1103 #if BOYER_MOORE
  1104 /**
  1105 * Checks to see if the match is repeated
  1106 * @param strsrch string search data
  1107 * @param start new match start index
  1108 * @param end new match end index
  1109 * @return TRUE if the the match is repeated, FALSE otherwise
  1110 */
  1111 static
  1112 inline UBool checkRepeatedMatch(UStringSearch *strsrch,
  1113                                 int32_t    start,
  1114                                 int32_t    end)
  1116     int32_t lastmatchindex = strsrch->search->matchedIndex;
  1117     UBool       result;
  1118     if (lastmatchindex == USEARCH_DONE) {
  1119         return FALSE;
  1121     if (strsrch->search->isForwardSearching) {
  1122         result = start <= lastmatchindex;
  1124     else {
  1125         result = start >= lastmatchindex;
  1127     if (!result && !strsrch->search->isOverlap) {
  1128         if (strsrch->search->isForwardSearching) {
  1129             result = start < lastmatchindex + strsrch->search->matchedLength;
  1131         else {
  1132             result = end > lastmatchindex;
  1135     return result;
  1138 /**
  1139 * Gets the collation element iterator's current offset.
  1140 * @param coleiter collation element iterator
  1141 * @param forwards flag TRUE if we are moving in th forwards direction
  1142 * @return current offset
  1143 */
  1144 static
  1145 inline int32_t getColElemIterOffset(const UCollationElements *coleiter,
  1146                                               UBool               forwards)
  1148     int32_t result = ucol_getOffset(coleiter);
  1149     // intricacies of the the backwards collation element iterator
  1150     if (FALSE && !forwards && inNormBuf(coleiter) && !isFCDPointerNull(coleiter)) {
  1151         result ++;
  1153     return result;
  1156 /**
  1157 * Checks match for contraction.
  1158 * If the match ends with a partial contraction we fail.
  1159 * If the match starts too far off (because of backwards iteration) we try to
  1160 * chip off the extra characters depending on whether a breakiterator has
  1161 * been used.
  1162 * Internal method, error assumed to be success, caller has to check status
  1163 * before calling this method.
  1164 * @param strsrch string search data
  1165 * @param start offset of potential match, to be modified if necessary
  1166 * @param end offset of potential match, to be modified if necessary
  1167 * @param status output error status if any
  1168 * @return TRUE if match passes the contraction test, FALSE otherwise
  1169 */
  1171 static
  1172 UBool checkNextExactContractionMatch(UStringSearch *strsrch,
  1173                                      int32_t   *start,
  1174                                      int32_t   *end, UErrorCode  *status)
  1176           UCollationElements *coleiter   = strsrch->textIter;
  1177           int32_t             textlength = strsrch->search->textLength;
  1178           int32_t             temp       = *start;
  1179     const UCollator          *collator   = strsrch->collator;
  1180     const UChar              *text       = strsrch->search->text;
  1181     // This part checks if either ends of the match contains potential
  1182     // contraction. If so we'll have to iterate through them
  1183     // The start contraction needs to be checked since ucol_previous dumps
  1184     // all characters till the first safe character into the buffer.
  1185     // *start + 1 is used to test for the unsafe characters instead of *start
  1186     // because ucol_prev takes all unsafe characters till the first safe
  1187     // character ie *start. so by testing *start + 1, we can estimate if
  1188     // excess prefix characters has been included in the potential search
  1189     // results.
  1190     if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) ||
  1191         (*start + 1 < textlength
  1192          && ucol_unsafeCP(text[*start + 1], collator))) {
  1193         int32_t expansion  = getExpansionPrefix(coleiter);
  1194         UBool   expandflag = expansion > 0;
  1195         setColEIterOffset(coleiter, *start);
  1196         while (expansion > 0) {
  1197             // getting rid of the redundant ce, caused by setOffset.
  1198             // since backward contraction/expansion may have extra ces if we
  1199             // are in the normalization buffer, hasAccentsBeforeMatch would
  1200             // have taken care of it.
  1201             // E.g. the character \u01FA will have an expansion of 3, but if
  1202             // we are only looking for acute and ring \u030A and \u0301, we'll
  1203             // have to skip the first ce in the expansion buffer.
  1204             ucol_next(coleiter, status);
  1205             if (U_FAILURE(*status)) {
  1206                 return FALSE;
  1208             if (ucol_getOffset(coleiter) != temp) {
  1209                 *start = temp;
  1210                 temp  = ucol_getOffset(coleiter);
  1212             expansion --;
  1215         int32_t  *patternce       = strsrch->pattern.CE;
  1216         int32_t   patterncelength = strsrch->pattern.CELength;
  1217         int32_t   count           = 0;
  1218         while (count < patterncelength) {
  1219             int32_t ce = getCE(strsrch, ucol_next(coleiter, status));
  1220             if (ce == UCOL_IGNORABLE) {
  1221                 continue;
  1223             if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {
  1224                 *start = temp;
  1225                 temp   = ucol_getOffset(coleiter);
  1227             if (U_FAILURE(*status) || ce != patternce[count]) {
  1228                 (*end) ++;
  1229                 *end = getNextUStringSearchBaseOffset(strsrch, *end);
  1230                 return FALSE;
  1232             count ++;
  1235     return TRUE;
  1238 /**
  1239 * Checks and sets the match information if found.
  1240 * Checks
  1241 * <ul>
  1242 * <li> the potential match does not repeat the previous match
  1243 * <li> boundaries are correct
  1244 * <li> exact matches has no extra accents
  1245 * <li> identical matchesb
  1246 * <li> potential match does not end in the middle of a contraction
  1247 * <\ul>
  1248 * Otherwise the offset will be shifted to the next character.
  1249 * Internal method, status assumed to be success, caller has to check status
  1250 * before calling this method.
  1251 * @param strsrch string search data
  1252 * @param textoffset offset in the collation element text. the returned value
  1253 *        will be the truncated end offset of the match or the new start
  1254 *        search offset.
  1255 * @param status output error status if any
  1256 * @return TRUE if the match is valid, FALSE otherwise
  1257 */
  1258 static
  1259 inline UBool checkNextExactMatch(UStringSearch *strsrch,
  1260                                  int32_t   *textoffset, UErrorCode *status)
  1262     UCollationElements *coleiter = strsrch->textIter;
  1263     int32_t         start    = getColElemIterOffset(coleiter, FALSE);
  1265     if (!checkNextExactContractionMatch(strsrch, &start, textoffset, status)) {
  1266         return FALSE;
  1269     // this totally matches, however we need to check if it is repeating
  1270     if (!isBreakUnit(strsrch, start, *textoffset) ||
  1271         checkRepeatedMatch(strsrch, start, *textoffset) ||
  1272         hasAccentsBeforeMatch(strsrch, start, *textoffset) ||
  1273         !checkIdentical(strsrch, start, *textoffset) ||
  1274         hasAccentsAfterMatch(strsrch, start, *textoffset)) {
  1276         (*textoffset) ++;
  1277         *textoffset = getNextUStringSearchBaseOffset(strsrch, *textoffset);
  1278         return FALSE;
  1281     //Add breakiterator boundary check for primary strength search.
  1282     if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) {
  1283         checkBreakBoundary(strsrch, &start, textoffset);
  1286     // totally match, we will get rid of the ending ignorables.
  1287     strsrch->search->matchedIndex  = start;
  1288     strsrch->search->matchedLength = *textoffset - start;
  1289     return TRUE;
  1292 /**
  1293 * Getting the previous base character offset, or the current offset if the
  1294 * current character is a base character
  1295 * @param text string
  1296 * @param textoffset one offset after the current character
  1297 * @return the offset of the next character after the base character or the first
  1298 *         composed character with accents
  1299 */
  1300 static
  1301 inline int32_t getPreviousBaseOffset(const UChar       *text,
  1302                                                int32_t  textoffset)
  1304     if (textoffset > 0) {
  1305         for (;;) {
  1306             int32_t result = textoffset;
  1307             U16_BACK_1(text, 0, textoffset);
  1308             int32_t temp = textoffset;
  1309             uint16_t fcd = getFCD(text, &temp, result);
  1310             if ((fcd >> SECOND_LAST_BYTE_SHIFT_) == 0) {
  1311                 if (fcd & LAST_BYTE_MASK_) {
  1312                     return textoffset;
  1314                 return result;
  1316             if (textoffset == 0) {
  1317                 return 0;
  1321     return textoffset;
  1324 /**
  1325 * Getting the indexes of the accents that are not blocked in the argument
  1326 * accent array
  1327 * @param accents array of accents in nfd terminated by a 0.
  1328 * @param accentsindex array of indexes of the accents that are not blocked
  1329 */
  1330 static
  1331 inline int getUnblockedAccentIndex(UChar *accents, int32_t *accentsindex)
  1333     int32_t index     = 0;
  1334     int32_t     length    = u_strlen(accents);
  1335     UChar32     codepoint = 0;
  1336     int         cclass    = 0;
  1337     int         result    = 0;
  1338     int32_t temp;
  1339     while (index < length) {
  1340         temp = index;
  1341         U16_NEXT(accents, index, length, codepoint);
  1342         if (u_getCombiningClass(codepoint) != cclass) {
  1343             cclass        = u_getCombiningClass(codepoint);
  1344             accentsindex[result] = temp;
  1345             result ++;
  1348     accentsindex[result] = length;
  1349     return result;
  1352 /**
  1353 * Appends 3 UChar arrays to a destination array.
  1354 * Creates a new array if we run out of space. The caller will have to
  1355 * manually deallocate the newly allocated array.
  1356 * Internal method, status assumed to be success, caller has to check status
  1357 * before calling this method. destination not to be NULL and has at least
  1358 * size destinationlength.
  1359 * @param destination target array
  1360 * @param destinationlength target array size, returning the appended length
  1361 * @param source1 null-terminated first array
  1362 * @param source2 second array
  1363 * @param source2length length of seond array
  1364 * @param source3 null-terminated third array
  1365 * @param status error status if any
  1366 * @return new destination array, destination if there was no new allocation
  1367 */
  1368 static
  1369 inline UChar * addToUCharArray(      UChar      *destination,
  1370                                      int32_t    *destinationlength,
  1371                                const UChar      *source1,
  1372                                const UChar      *source2,
  1373                                      int32_t     source2length,
  1374                                const UChar      *source3,
  1375                                      UErrorCode *status)
  1377     int32_t source1length = source1 ? u_strlen(source1) : 0;
  1378     int32_t source3length = source3 ? u_strlen(source3) : 0;
  1379     if (*destinationlength < source1length + source2length + source3length +
  1380                                                                            1)
  1382         destination = (UChar *)allocateMemory(
  1383           (source1length + source2length + source3length + 1) * sizeof(UChar),
  1384           status);
  1385         // if error allocating memory, status will be
  1386         // U_MEMORY_ALLOCATION_ERROR
  1387         if (U_FAILURE(*status)) {
  1388             *destinationlength = 0;
  1389             return NULL;
  1392     if (source1length != 0) {
  1393         uprv_memcpy(destination, source1, sizeof(UChar) * source1length);
  1395     if (source2length != 0) {
  1396         uprv_memcpy(destination + source1length, source2,
  1397                     sizeof(UChar) * source2length);
  1399     if (source3length != 0) {
  1400         uprv_memcpy(destination + source1length + source2length, source3,
  1401                     sizeof(UChar) * source3length);
  1403     *destinationlength = source1length + source2length + source3length;
  1404     return destination;
  1407 /**
  1408 * Running through a collation element iterator to see if the contents matches
  1409 * pattern in string search data
  1410 * @param strsrch string search data
  1411 * @param coleiter collation element iterator
  1412 * @return TRUE if a match if found, FALSE otherwise
  1413 */
  1414 static
  1415 inline UBool checkCollationMatch(const UStringSearch      *strsrch,
  1416                                        UCollationElements *coleiter)
  1418     int         patternceindex = strsrch->pattern.CELength;
  1419     int32_t    *patternce      = strsrch->pattern.CE;
  1420     UErrorCode  status = U_ZERO_ERROR;
  1421     while (patternceindex > 0) {
  1422         int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
  1423         if (ce == UCOL_IGNORABLE) {
  1424             continue;
  1426         if (U_FAILURE(status) || ce != *patternce) {
  1427             return FALSE;
  1429         patternce ++;
  1430         patternceindex --;
  1432     return TRUE;
  1435 /**
  1436 * Rearranges the front accents to try matching.
  1437 * Prefix accents in the text will be grouped according to their combining
  1438 * class and the groups will be mixed and matched to try find the perfect
  1439 * match with the pattern.
  1440 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
  1441 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
  1442 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
  1443 *         "\u0301\u0325".
  1444 * step 2: check if any of the generated substrings matches the pattern.
  1445 * Internal method, status is assumed to be success, caller has to check status
  1446 * before calling this method.
  1447 * @param strsrch string search match
  1448 * @param start first offset of the accents to start searching
  1449 * @param end start of the last accent set
  1450 * @param status output error status if any
  1451 * @return USEARCH_DONE if a match is not found, otherwise return the starting
  1452 *         offset of the match. Note this start includes all preceding accents.
  1453 */
  1454 static
  1455 int32_t doNextCanonicalPrefixMatch(UStringSearch *strsrch,
  1456                                        int32_t    start,
  1457                                        int32_t    end,
  1458                                        UErrorCode    *status)
  1460     const UChar       *text       = strsrch->search->text;
  1461           int32_t      textlength = strsrch->search->textLength;
  1462           int32_t  tempstart  = start;
  1464     if ((getFCD(text, &tempstart, textlength) & LAST_BYTE_MASK_) == 0) {
  1465         // die... failed at a base character
  1466         return USEARCH_DONE;
  1469     int32_t offset = getNextBaseOffset(text, tempstart, textlength);
  1470     start = getPreviousBaseOffset(text, tempstart);
  1472     UChar       accents[INITIAL_ARRAY_SIZE_];
  1473     // normalizing the offensive string
  1474     unorm_normalize(text + start, offset - start, UNORM_NFD, 0, accents,
  1475                     INITIAL_ARRAY_SIZE_, status);
  1476     if (U_FAILURE(*status)) {
  1477         return USEARCH_DONE;
  1480     int32_t         accentsindex[INITIAL_ARRAY_SIZE_];
  1481     int32_t         accentsize = getUnblockedAccentIndex(accents,
  1482                                                                  accentsindex);
  1483     int32_t         count      = (2 << (accentsize - 1)) - 1;
  1484     UChar               buffer[INITIAL_ARRAY_SIZE_];
  1485     UCollationElements *coleiter   = strsrch->utilIter;
  1486     while (U_SUCCESS(*status) && count > 0) {
  1487         UChar *rearrange = strsrch->canonicalPrefixAccents;
  1488         // copy the base characters
  1489         for (int k = 0; k < accentsindex[0]; k ++) {
  1490             *rearrange ++ = accents[k];
  1492         // forming all possible canonical rearrangement by dropping
  1493         // sets of accents
  1494         for (int i = 0; i <= accentsize - 1; i ++) {
  1495             int32_t mask = 1 << (accentsize - i - 1);
  1496             if (count & mask) {
  1497                 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
  1498                     *rearrange ++ = accents[j];
  1502         *rearrange = 0;
  1503         int32_t  matchsize = INITIAL_ARRAY_SIZE_;
  1504         UChar   *match     = addToUCharArray(buffer, &matchsize,
  1505                                            strsrch->canonicalPrefixAccents,
  1506                                            strsrch->search->text + offset,
  1507                                            end - offset,
  1508                                            strsrch->canonicalSuffixAccents,
  1509                                            status);
  1511         // if status is a failure, ucol_setText does nothing.
  1512         // run the collator iterator through this match
  1513         ucol_setText(coleiter, match, matchsize, status);
  1514         if (U_SUCCESS(*status)) {
  1515             if (checkCollationMatch(strsrch, coleiter)) {
  1516                 if (match != buffer) {
  1517                     uprv_free(match);
  1519                 return start;
  1522         count --;
  1524     return USEARCH_DONE;
  1527 /**
  1528 * Gets the offset to the safe point in text before textoffset.
  1529 * ie. not the middle of a contraction, swappable characters or supplementary
  1530 * characters.
  1531 * @param collator collation sata
  1532 * @param text string to work with
  1533 * @param textoffset offset in string
  1534 * @param textlength length of text string
  1535 * @return offset to the previous safe character
  1536 */
  1537 static
  1538 inline uint32_t getPreviousSafeOffset(const UCollator   *collator,
  1539                                       const UChar       *text,
  1540                                             int32_t  textoffset)
  1542     int32_t result = textoffset; // first contraction character
  1543     while (result != 0 && ucol_unsafeCP(text[result - 1], collator)) {
  1544         result --;
  1546     if (result != 0) {
  1547         // the first contraction character is consider unsafe here
  1548         result --;
  1550     return result;
  1553 /**
  1554 * Cleaning up after we passed the safe zone
  1555 * @param strsrch string search data
  1556 * @param safetext safe text array
  1557 * @param safebuffer safe text buffer
  1558 * @param coleiter collation element iterator for safe text
  1559 */
  1560 static
  1561 inline void cleanUpSafeText(const UStringSearch *strsrch, UChar *safetext,
  1562                                   UChar         *safebuffer)
  1564     if (safetext != safebuffer && safetext != strsrch->canonicalSuffixAccents)
  1566        uprv_free(safetext);
  1570 /**
  1571 * Take the rearranged end accents and tries matching. If match failed at
  1572 * a seperate preceding set of accents (seperated from the rearranged on by
  1573 * at least a base character) then we rearrange the preceding accents and
  1574 * tries matching again.
  1575 * We allow skipping of the ends of the accent set if the ces do not match.
  1576 * However if the failure is found before the accent set, it fails.
  1577 * Internal method, status assumed to be success, caller has to check status
  1578 * before calling this method.
  1579 * @param strsrch string search data
  1580 * @param textoffset of the start of the rearranged accent
  1581 * @param status output error status if any
  1582 * @return USEARCH_DONE if a match is not found, otherwise return the starting
  1583 *         offset of the match. Note this start includes all preceding accents.
  1584 */
  1585 static
  1586 int32_t doNextCanonicalSuffixMatch(UStringSearch *strsrch,
  1587                                        int32_t    textoffset,
  1588                                        UErrorCode    *status)
  1590     const UChar              *text           = strsrch->search->text;
  1591     const UCollator          *collator       = strsrch->collator;
  1592           int32_t             safelength     = 0;
  1593           UChar              *safetext;
  1594           int32_t             safetextlength;
  1595           UChar               safebuffer[INITIAL_ARRAY_SIZE_];
  1596           UCollationElements *coleiter       = strsrch->utilIter;
  1597           int32_t         safeoffset     = textoffset;
  1599     if (textoffset != 0 && ucol_unsafeCP(strsrch->canonicalSuffixAccents[0],
  1600                                          collator)) {
  1601         safeoffset     = getPreviousSafeOffset(collator, text, textoffset);
  1602         safelength     = textoffset - safeoffset;
  1603         safetextlength = INITIAL_ARRAY_SIZE_;
  1604         safetext       = addToUCharArray(safebuffer, &safetextlength, NULL,
  1605                                          text + safeoffset, safelength,
  1606                                          strsrch->canonicalSuffixAccents,
  1607                                          status);
  1609     else {
  1610         safetextlength = u_strlen(strsrch->canonicalSuffixAccents);
  1611         safetext       = strsrch->canonicalSuffixAccents;
  1614     // if status is a failure, ucol_setText does nothing
  1615     ucol_setText(coleiter, safetext, safetextlength, status);
  1616     // status checked in loop below
  1618     int32_t  *ce        = strsrch->pattern.CE;
  1619     int32_t   celength  = strsrch->pattern.CELength;
  1620     int       ceindex   = celength - 1;
  1621     UBool     isSafe    = TRUE; // indication flag for position in safe zone
  1623     while (ceindex >= 0) {
  1624         int32_t textce = ucol_previous(coleiter, status);
  1625         if (U_FAILURE(*status)) {
  1626             if (isSafe) {
  1627                 cleanUpSafeText(strsrch, safetext, safebuffer);
  1629             return USEARCH_DONE;
  1631         if (textce == UCOL_NULLORDER) {
  1632             // check if we have passed the safe buffer
  1633             if (coleiter == strsrch->textIter) {
  1634                 cleanUpSafeText(strsrch, safetext, safebuffer);
  1635                 return USEARCH_DONE;
  1637             cleanUpSafeText(strsrch, safetext, safebuffer);
  1638             safetext = safebuffer;
  1639             coleiter = strsrch->textIter;
  1640             setColEIterOffset(coleiter, safeoffset);
  1641             // status checked at the start of the loop
  1642             isSafe = FALSE;
  1643             continue;
  1645         textce = getCE(strsrch, textce);
  1646         if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
  1647             // do the beginning stuff
  1648             int32_t failedoffset = getColElemIterOffset(coleiter, FALSE);
  1649             if (isSafe && failedoffset >= safelength) {
  1650                 // alas... no hope. failed at rearranged accent set
  1651                 cleanUpSafeText(strsrch, safetext, safebuffer);
  1652                 return USEARCH_DONE;
  1654             else {
  1655                 if (isSafe) {
  1656                     failedoffset += safeoffset;
  1657                     cleanUpSafeText(strsrch, safetext, safebuffer);
  1660                 // try rearranging the front accents
  1661                 int32_t result = doNextCanonicalPrefixMatch(strsrch,
  1662                                         failedoffset, textoffset, status);
  1663                 if (result != USEARCH_DONE) {
  1664                     // if status is a failure, ucol_setOffset does nothing
  1665                     setColEIterOffset(strsrch->textIter, result);
  1667                 if (U_FAILURE(*status)) {
  1668                     return USEARCH_DONE;
  1670                 return result;
  1673         if (textce == ce[ceindex]) {
  1674             ceindex --;
  1677     // set offset here
  1678     if (isSafe) {
  1679         int32_t result     = getColElemIterOffset(coleiter, FALSE);
  1680         // sets the text iterator here with the correct expansion and offset
  1681         int32_t    leftoverces = getExpansionPrefix(coleiter);
  1682         cleanUpSafeText(strsrch, safetext, safebuffer);
  1683         if (result >= safelength) {
  1684             result = textoffset;
  1686         else {
  1687             result += safeoffset;
  1689         setColEIterOffset(strsrch->textIter, result);
  1690         strsrch->textIter->iteratordata_.toReturn =
  1691                        setExpansionPrefix(strsrch->textIter, leftoverces);
  1692         return result;
  1695     return ucol_getOffset(coleiter);
  1698 /**
  1699 * Trying out the substring and sees if it can be a canonical match.
  1700 * This will try normalizing the end accents and arranging them into canonical
  1701 * equivalents and check their corresponding ces with the pattern ce.
  1702 * Suffix accents in the text will be grouped according to their combining
  1703 * class and the groups will be mixed and matched to try find the perfect
  1704 * match with the pattern.
  1705 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
  1706 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
  1707 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
  1708 *         "\u0301\u0325".
  1709 * step 2: check if any of the generated substrings matches the pattern.
  1710 * Internal method, status assumed to be success, caller has to check status
  1711 * before calling this method.
  1712 * @param strsrch string search data
  1713 * @param textoffset end offset in the collation element text that ends with
  1714 *                   the accents to be rearranged
  1715 * @param status error status if any
  1716 * @return TRUE if the match is valid, FALSE otherwise
  1717 */
  1718 static
  1719 UBool doNextCanonicalMatch(UStringSearch *strsrch,
  1720                            int32_t    textoffset,
  1721                            UErrorCode    *status)
  1723     const UChar       *text = strsrch->search->text;
  1724           int32_t  temp = textoffset;
  1725     U16_BACK_1(text, 0, temp);
  1726     if ((getFCD(text, &temp, textoffset) & LAST_BYTE_MASK_) == 0) {
  1727         UCollationElements *coleiter = strsrch->textIter;
  1728         int32_t         offset   = getColElemIterOffset(coleiter, FALSE);
  1729         if (strsrch->pattern.hasPrefixAccents) {
  1730             offset = doNextCanonicalPrefixMatch(strsrch, offset, textoffset,
  1731                                                 status);
  1732             if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
  1733                 setColEIterOffset(coleiter, offset);
  1734                 return TRUE;
  1737         return FALSE;
  1740     if (!strsrch->pattern.hasSuffixAccents) {
  1741         return FALSE;
  1744     UChar       accents[INITIAL_ARRAY_SIZE_];
  1745     // offset to the last base character in substring to search
  1746     int32_t baseoffset = getPreviousBaseOffset(text, textoffset);
  1747     // normalizing the offensive string
  1748     unorm_normalize(text + baseoffset, textoffset - baseoffset, UNORM_NFD,
  1749                                0, accents, INITIAL_ARRAY_SIZE_, status);
  1750     // status checked in loop below
  1752     int32_t accentsindex[INITIAL_ARRAY_SIZE_];
  1753     int32_t size = getUnblockedAccentIndex(accents, accentsindex);
  1755     // 2 power n - 1 plus the full set of accents
  1756     int32_t  count = (2 << (size - 1)) - 1;
  1757     while (U_SUCCESS(*status) && count > 0) {
  1758         UChar *rearrange = strsrch->canonicalSuffixAccents;
  1759         // copy the base characters
  1760         for (int k = 0; k < accentsindex[0]; k ++) {
  1761             *rearrange ++ = accents[k];
  1763         // forming all possible canonical rearrangement by dropping
  1764         // sets of accents
  1765         for (int i = 0; i <= size - 1; i ++) {
  1766             int32_t mask = 1 << (size - i - 1);
  1767             if (count & mask) {
  1768                 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
  1769                     *rearrange ++ = accents[j];
  1773         *rearrange = 0;
  1774         int32_t offset = doNextCanonicalSuffixMatch(strsrch, baseoffset,
  1775                                                         status);
  1776         if (offset != USEARCH_DONE) {
  1777             return TRUE; // match found
  1779         count --;
  1781     return FALSE;
  1784 /**
  1785 * Gets the previous base character offset depending on the string search
  1786 * pattern data
  1787 * @param strsrch string search data
  1788 * @param textoffset current offset, current character
  1789 * @return the offset of the next character after this base character or itself
  1790 *         if it is a composed character with accents
  1791 */
  1792 static
  1793 inline int32_t getPreviousUStringSearchBaseOffset(UStringSearch *strsrch,
  1794                                                       int32_t textoffset)
  1796     if (strsrch->pattern.hasPrefixAccents && textoffset > 0) {
  1797         const UChar       *text = strsrch->search->text;
  1798               int32_t  offset = textoffset;
  1799         if (getFCD(text, &offset, strsrch->search->textLength) >>
  1800                                                    SECOND_LAST_BYTE_SHIFT_) {
  1801             return getPreviousBaseOffset(text, textoffset);
  1804     return textoffset;
  1807 /**
  1808 * Checks match for contraction.
  1809 * If the match ends with a partial contraction we fail.
  1810 * If the match starts too far off (because of backwards iteration) we try to
  1811 * chip off the extra characters
  1812 * Internal method, status assumed to be success, caller has to check status
  1813 * before calling this method.
  1814 * @param strsrch string search data
  1815 * @param start offset of potential match, to be modified if necessary
  1816 * @param end offset of potential match, to be modified if necessary
  1817 * @param status output error status if any
  1818 * @return TRUE if match passes the contraction test, FALSE otherwise
  1819 */
  1820 static
  1821 UBool checkNextCanonicalContractionMatch(UStringSearch *strsrch,
  1822                                          int32_t   *start,
  1823                                          int32_t   *end,
  1824                                          UErrorCode    *status)
  1826           UCollationElements *coleiter   = strsrch->textIter;
  1827           int32_t             textlength = strsrch->search->textLength;
  1828           int32_t         temp       = *start;
  1829     const UCollator          *collator   = strsrch->collator;
  1830     const UChar              *text       = strsrch->search->text;
  1831     // This part checks if either ends of the match contains potential
  1832     // contraction. If so we'll have to iterate through them
  1833     if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) ||
  1834         (*start + 1 < textlength
  1835          && ucol_unsafeCP(text[*start + 1], collator))) {
  1836         int32_t expansion  = getExpansionPrefix(coleiter);
  1837         UBool   expandflag = expansion > 0;
  1838         setColEIterOffset(coleiter, *start);
  1839         while (expansion > 0) {
  1840             // getting rid of the redundant ce, caused by setOffset.
  1841             // since backward contraction/expansion may have extra ces if we
  1842             // are in the normalization buffer, hasAccentsBeforeMatch would
  1843             // have taken care of it.
  1844             // E.g. the character \u01FA will have an expansion of 3, but if
  1845             // we are only looking for acute and ring \u030A and \u0301, we'll
  1846             // have to skip the first ce in the expansion buffer.
  1847             ucol_next(coleiter, status);
  1848             if (U_FAILURE(*status)) {
  1849                 return FALSE;
  1851             if (ucol_getOffset(coleiter) != temp) {
  1852                 *start = temp;
  1853                 temp  = ucol_getOffset(coleiter);
  1855             expansion --;
  1858         int32_t  *patternce       = strsrch->pattern.CE;
  1859         int32_t   patterncelength = strsrch->pattern.CELength;
  1860         int32_t   count           = 0;
  1861         int32_t   textlength      = strsrch->search->textLength;
  1862         while (count < patterncelength) {
  1863             int32_t ce = getCE(strsrch, ucol_next(coleiter, status));
  1864             // status checked below, note that if status is a failure
  1865             // ucol_next returns UCOL_NULLORDER
  1866             if (ce == UCOL_IGNORABLE) {
  1867                 continue;
  1869             if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {
  1870                 *start = temp;
  1871                 temp   = ucol_getOffset(coleiter);
  1874             if (count == 0 && ce != patternce[0]) {
  1875                 // accents may have extra starting ces, this occurs when a
  1876                 // pure accent pattern is matched without rearrangement
  1877                 // text \u0325\u0300 and looking for \u0300
  1878                 int32_t expected = patternce[0];
  1879                 if (getFCD(text, start, textlength) & LAST_BYTE_MASK_) {
  1880                     ce = getCE(strsrch, ucol_next(coleiter, status));
  1881                     while (U_SUCCESS(*status) && ce != expected &&
  1882                            ce != UCOL_NULLORDER &&
  1883                            ucol_getOffset(coleiter) <= *end) {
  1884                         ce = getCE(strsrch, ucol_next(coleiter, status));
  1888             if (U_FAILURE(*status) || ce != patternce[count]) {
  1889                 (*end) ++;
  1890                 *end = getNextUStringSearchBaseOffset(strsrch, *end);
  1891                 return FALSE;
  1893             count ++;
  1896     return TRUE;
  1899 /**
  1900 * Checks and sets the match information if found.
  1901 * Checks
  1902 * <ul>
  1903 * <li> the potential match does not repeat the previous match
  1904 * <li> boundaries are correct
  1905 * <li> potential match does not end in the middle of a contraction
  1906 * <li> identical matches
  1907 * <\ul>
  1908 * Otherwise the offset will be shifted to the next character.
  1909 * Internal method, status assumed to be success, caller has to check the
  1910 * status before calling this method.
  1911 * @param strsrch string search data
  1912 * @param textoffset offset in the collation element text. the returned value
  1913 *        will be the truncated end offset of the match or the new start
  1914 *        search offset.
  1915 * @param status output error status if any
  1916 * @return TRUE if the match is valid, FALSE otherwise
  1917 */
  1918 static
  1919 inline UBool checkNextCanonicalMatch(UStringSearch *strsrch,
  1920                                      int32_t   *textoffset,
  1921                                      UErrorCode    *status)
  1923     // to ensure that the start and ends are not composite characters
  1924     UCollationElements *coleiter = strsrch->textIter;
  1925     // if we have a canonical accent match
  1926     if ((strsrch->pattern.hasSuffixAccents &&
  1927         strsrch->canonicalSuffixAccents[0]) ||
  1928         (strsrch->pattern.hasPrefixAccents &&
  1929         strsrch->canonicalPrefixAccents[0])) {
  1930         strsrch->search->matchedIndex  = getPreviousUStringSearchBaseOffset(
  1931                                                     strsrch,
  1932                                                     ucol_getOffset(coleiter));
  1933         strsrch->search->matchedLength = *textoffset -
  1934                                                 strsrch->search->matchedIndex;
  1935         return TRUE;
  1938     int32_t start = getColElemIterOffset(coleiter, FALSE);
  1939     if (!checkNextCanonicalContractionMatch(strsrch, &start, textoffset,
  1940                                             status) || U_FAILURE(*status)) {
  1941         return FALSE;
  1944     start = getPreviousUStringSearchBaseOffset(strsrch, start);
  1945     // this totally matches, however we need to check if it is repeating
  1946     if (checkRepeatedMatch(strsrch, start, *textoffset) ||
  1947         !isBreakUnit(strsrch, start, *textoffset) ||
  1948         !checkIdentical(strsrch, start, *textoffset)) {
  1949         (*textoffset) ++;
  1950         *textoffset = getNextBaseOffset(strsrch->search->text, *textoffset,
  1951                                         strsrch->search->textLength);
  1952         return FALSE;
  1955     strsrch->search->matchedIndex  = start;
  1956     strsrch->search->matchedLength = *textoffset - start;
  1957     return TRUE;
  1960 /**
  1961 * Shifting the collation element iterator position forward to prepare for
  1962 * a preceding match. If the first character is a unsafe character, we'll only
  1963 * shift by 1 to capture contractions, normalization etc.
  1964 * Internal method, status assumed to be success, caller has to check status
  1965 * before calling this method.
  1966 * @param text strsrch string search data
  1967 * @param textoffset start text position to do search
  1968 * @param ce the text ce which failed the match.
  1969 * @param patternceindex index of the ce within the pattern ce buffer which
  1970 *        failed the match
  1971 * @return final offset
  1972 */
  1973 static
  1974 inline int32_t reverseShift(UStringSearch *strsrch,
  1975                                 int32_t    textoffset,
  1976                                 int32_t       ce,
  1977                                 int32_t        patternceindex)
  1979     if (strsrch->search->isOverlap) {
  1980         if (textoffset != strsrch->search->textLength) {
  1981             textoffset --;
  1983         else {
  1984             textoffset -= strsrch->pattern.defaultShiftSize;
  1987     else {
  1988         if (ce != UCOL_NULLORDER) {
  1989             int32_t shift = strsrch->pattern.backShift[hash(ce)];
  1991             // this is to adjust for characters in the middle of the substring
  1992             // for matching that failed.
  1993             int32_t adjust = patternceindex;
  1994             if (adjust > 1 && shift > adjust) {
  1995                 shift -= adjust - 1;
  1997             textoffset -= shift;
  1999         else {
  2000             textoffset -= strsrch->pattern.defaultShiftSize;
  2003     textoffset = getPreviousUStringSearchBaseOffset(strsrch, textoffset);
  2004     return textoffset;
  2007 /**
  2008 * Checks match for contraction.
  2009 * If the match starts with a partial contraction we fail.
  2010 * Internal method, status assumed to be success, caller has to check status
  2011 * before calling this method.
  2012 * @param strsrch string search data
  2013 * @param start offset of potential match, to be modified if necessary
  2014 * @param end offset of potential match, to be modified if necessary
  2015 * @param status output error status if any
  2016 * @return TRUE if match passes the contraction test, FALSE otherwise
  2017 */
  2018 static
  2019 UBool checkPreviousExactContractionMatch(UStringSearch *strsrch,
  2020                                      int32_t   *start,
  2021                                      int32_t   *end, UErrorCode  *status)
  2023           UCollationElements *coleiter   = strsrch->textIter;
  2024           int32_t             textlength = strsrch->search->textLength;
  2025           int32_t             temp       = *end;
  2026     const UCollator          *collator   = strsrch->collator;
  2027     const UChar              *text       = strsrch->search->text;
  2028     // This part checks if either if the start of the match contains potential
  2029     // contraction. If so we'll have to iterate through them
  2030     // Since we used ucol_next while previously looking for the potential
  2031     // match, this guarantees that our end will not be a partial contraction,
  2032     // or a partial supplementary character.
  2033     if (*start < textlength && ucol_unsafeCP(text[*start], collator)) {
  2034         int32_t expansion  = getExpansionSuffix(coleiter);
  2035         UBool   expandflag = expansion > 0;
  2036         setColEIterOffset(coleiter, *end);
  2037         while (U_SUCCESS(*status) && expansion > 0) {
  2038             // getting rid of the redundant ce
  2039             // since forward contraction/expansion may have extra ces
  2040             // if we are in the normalization buffer, hasAccentsBeforeMatch
  2041             // would have taken care of it.
  2042             // E.g. the character \u01FA will have an expansion of 3, but if
  2043             // we are only looking for A ring A\u030A, we'll have to skip the
  2044             // last ce in the expansion buffer
  2045             ucol_previous(coleiter, status);
  2046             if (U_FAILURE(*status)) {
  2047                 return FALSE;
  2049             if (ucol_getOffset(coleiter) != temp) {
  2050                 *end = temp;
  2051                 temp  = ucol_getOffset(coleiter);
  2053             expansion --;
  2056         int32_t  *patternce       = strsrch->pattern.CE;
  2057         int32_t   patterncelength = strsrch->pattern.CELength;
  2058         int32_t   count           = patterncelength;
  2059         while (count > 0) {
  2060             int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
  2061             // status checked below, note that if status is a failure
  2062             // ucol_previous returns UCOL_NULLORDER
  2063             if (ce == UCOL_IGNORABLE) {
  2064                 continue;
  2066             if (expandflag && count == 0 &&
  2067                 getColElemIterOffset(coleiter, FALSE) != temp) {
  2068                 *end = temp;
  2069                 temp  = ucol_getOffset(coleiter);
  2071             if (U_FAILURE(*status) || ce != patternce[count - 1]) {
  2072                 (*start) --;
  2073                 *start = getPreviousBaseOffset(text, *start);
  2074                 return FALSE;
  2076             count --;
  2079     return TRUE;
  2082 /**
  2083 * Checks and sets the match information if found.
  2084 * Checks
  2085 * <ul>
  2086 * <li> the current match does not repeat the last match
  2087 * <li> boundaries are correct
  2088 * <li> exact matches has no extra accents
  2089 * <li> identical matches
  2090 * <\ul>
  2091 * Otherwise the offset will be shifted to the preceding character.
  2092 * Internal method, status assumed to be success, caller has to check status
  2093 * before calling this method.
  2094 * @param strsrch string search data
  2095 * @param collator
  2096 * @param coleiter collation element iterator
  2097 * @param text string
  2098 * @param textoffset offset in the collation element text. the returned value
  2099 *        will be the truncated start offset of the match or the new start
  2100 *        search offset.
  2101 * @param status output error status if any
  2102 * @return TRUE if the match is valid, FALSE otherwise
  2103 */
  2104 static
  2105 inline UBool checkPreviousExactMatch(UStringSearch *strsrch,
  2106                                      int32_t   *textoffset,
  2107                                      UErrorCode    *status)
  2109     // to ensure that the start and ends are not composite characters
  2110     int32_t end = ucol_getOffset(strsrch->textIter);
  2111     if (!checkPreviousExactContractionMatch(strsrch, textoffset, &end, status)
  2112         || U_FAILURE(*status)) {
  2113             return FALSE;
  2116     // this totally matches, however we need to check if it is repeating
  2117     // the old match
  2118     if (checkRepeatedMatch(strsrch, *textoffset, end) ||
  2119         !isBreakUnit(strsrch, *textoffset, end) ||
  2120         hasAccentsBeforeMatch(strsrch, *textoffset, end) ||
  2121         !checkIdentical(strsrch, *textoffset, end) ||
  2122         hasAccentsAfterMatch(strsrch, *textoffset, end)) {
  2123         (*textoffset) --;
  2124         *textoffset = getPreviousBaseOffset(strsrch->search->text,
  2125                                             *textoffset);
  2126         return FALSE;
  2129     //Add breakiterator boundary check for primary strength search.
  2130     if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) {
  2131         checkBreakBoundary(strsrch, textoffset, &end);
  2134     strsrch->search->matchedIndex = *textoffset;
  2135     strsrch->search->matchedLength = end - *textoffset;
  2136     return TRUE;
  2139 /**
  2140 * Rearranges the end accents to try matching.
  2141 * Suffix accents in the text will be grouped according to their combining
  2142 * class and the groups will be mixed and matched to try find the perfect
  2143 * match with the pattern.
  2144 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
  2145 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
  2146 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
  2147 *         "\u0301\u0325".
  2148 * step 2: check if any of the generated substrings matches the pattern.
  2149 * Internal method, status assumed to be success, user has to check status
  2150 * before calling this method.
  2151 * @param strsrch string search match
  2152 * @param start offset of the first base character
  2153 * @param end start of the last accent set
  2154 * @param status only error status if any
  2155 * @return USEARCH_DONE if a match is not found, otherwise return the ending
  2156 *         offset of the match. Note this start includes all following accents.
  2157 */
  2158 static
  2159 int32_t doPreviousCanonicalSuffixMatch(UStringSearch *strsrch,
  2160                                            int32_t    start,
  2161                                            int32_t    end,
  2162                                            UErrorCode    *status)
  2164     const UChar       *text       = strsrch->search->text;
  2165           int32_t  tempend    = end;
  2167     U16_BACK_1(text, 0, tempend);
  2168     if (!(getFCD(text, &tempend, strsrch->search->textLength) &
  2169                                                            LAST_BYTE_MASK_)) {
  2170         // die... failed at a base character
  2171         return USEARCH_DONE;
  2173     end = getNextBaseOffset(text, end, strsrch->search->textLength);
  2175     if (U_SUCCESS(*status)) {
  2176         UChar       accents[INITIAL_ARRAY_SIZE_];
  2177         int32_t offset = getPreviousBaseOffset(text, end);
  2178         // normalizing the offensive string
  2179         unorm_normalize(text + offset, end - offset, UNORM_NFD, 0, accents,
  2180                         INITIAL_ARRAY_SIZE_, status);
  2182         int32_t         accentsindex[INITIAL_ARRAY_SIZE_];
  2183         int32_t         accentsize = getUnblockedAccentIndex(accents,
  2184                                                          accentsindex);
  2185         int32_t         count      = (2 << (accentsize - 1)) - 1;
  2186         UChar               buffer[INITIAL_ARRAY_SIZE_];
  2187         UCollationElements *coleiter = strsrch->utilIter;
  2188         while (U_SUCCESS(*status) && count > 0) {
  2189             UChar *rearrange = strsrch->canonicalSuffixAccents;
  2190             // copy the base characters
  2191             for (int k = 0; k < accentsindex[0]; k ++) {
  2192                 *rearrange ++ = accents[k];
  2194             // forming all possible canonical rearrangement by dropping
  2195             // sets of accents
  2196             for (int i = 0; i <= accentsize - 1; i ++) {
  2197                 int32_t mask = 1 << (accentsize - i - 1);
  2198                 if (count & mask) {
  2199                     for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
  2200                         *rearrange ++ = accents[j];
  2204             *rearrange = 0;
  2205             int32_t  matchsize = INITIAL_ARRAY_SIZE_;
  2206             UChar   *match     = addToUCharArray(buffer, &matchsize,
  2207                                            strsrch->canonicalPrefixAccents,
  2208                                            strsrch->search->text + start,
  2209                                            offset - start,
  2210                                            strsrch->canonicalSuffixAccents,
  2211                                            status);
  2213             // run the collator iterator through this match
  2214             // if status is a failure ucol_setText does nothing
  2215             ucol_setText(coleiter, match, matchsize, status);
  2216             if (U_SUCCESS(*status)) {
  2217                 if (checkCollationMatch(strsrch, coleiter)) {
  2218                     if (match != buffer) {
  2219                         uprv_free(match);
  2221                     return end;
  2224             count --;
  2227     return USEARCH_DONE;
  2230 /**
  2231 * Take the rearranged start accents and tries matching. If match failed at
  2232 * a seperate following set of accents (seperated from the rearranged on by
  2233 * at least a base character) then we rearrange the preceding accents and
  2234 * tries matching again.
  2235 * We allow skipping of the ends of the accent set if the ces do not match.
  2236 * However if the failure is found before the accent set, it fails.
  2237 * Internal method, status assumed to be success, caller has to check status
  2238 * before calling this method.
  2239 * @param strsrch string search data
  2240 * @param textoffset of the ends of the rearranged accent
  2241 * @param status output error status if any
  2242 * @return USEARCH_DONE if a match is not found, otherwise return the ending
  2243 *         offset of the match. Note this start includes all following accents.
  2244 */
  2245 static
  2246 int32_t doPreviousCanonicalPrefixMatch(UStringSearch *strsrch,
  2247                                            int32_t    textoffset,
  2248                                            UErrorCode    *status)
  2250     const UChar       *text       = strsrch->search->text;
  2251     const UCollator   *collator   = strsrch->collator;
  2252           int32_t      safelength = 0;
  2253           UChar       *safetext;
  2254           int32_t      safetextlength;
  2255           UChar        safebuffer[INITIAL_ARRAY_SIZE_];
  2256           int32_t  safeoffset = textoffset;
  2258     if (textoffset &&
  2259         ucol_unsafeCP(strsrch->canonicalPrefixAccents[
  2260                                  u_strlen(strsrch->canonicalPrefixAccents) - 1
  2261                                          ], collator)) {
  2262         safeoffset     = getNextSafeOffset(collator, text, textoffset,
  2263                                            strsrch->search->textLength);
  2264         safelength     = safeoffset - textoffset;
  2265         safetextlength = INITIAL_ARRAY_SIZE_;
  2266         safetext       = addToUCharArray(safebuffer, &safetextlength,
  2267                                          strsrch->canonicalPrefixAccents,
  2268                                          text + textoffset, safelength,
  2269                                          NULL, status);
  2271     else {
  2272         safetextlength = u_strlen(strsrch->canonicalPrefixAccents);
  2273         safetext       = strsrch->canonicalPrefixAccents;
  2276     UCollationElements *coleiter = strsrch->utilIter;
  2277      // if status is a failure, ucol_setText does nothing
  2278     ucol_setText(coleiter, safetext, safetextlength, status);
  2279     // status checked in loop below
  2281     int32_t  *ce           = strsrch->pattern.CE;
  2282     int32_t   celength     = strsrch->pattern.CELength;
  2283     int       ceindex      = 0;
  2284     UBool     isSafe       = TRUE; // safe zone indication flag for position
  2285     int32_t   prefixlength = u_strlen(strsrch->canonicalPrefixAccents);
  2287     while (ceindex < celength) {
  2288         int32_t textce = ucol_next(coleiter, status);
  2289         if (U_FAILURE(*status)) {
  2290             if (isSafe) {
  2291                 cleanUpSafeText(strsrch, safetext, safebuffer);
  2293             return USEARCH_DONE;
  2295         if (textce == UCOL_NULLORDER) {
  2296             // check if we have passed the safe buffer
  2297             if (coleiter == strsrch->textIter) {
  2298                 cleanUpSafeText(strsrch, safetext, safebuffer);
  2299                 return USEARCH_DONE;
  2301             cleanUpSafeText(strsrch, safetext, safebuffer);
  2302             safetext = safebuffer;
  2303             coleiter = strsrch->textIter;
  2304             setColEIterOffset(coleiter, safeoffset);
  2305             // status checked at the start of the loop
  2306             isSafe = FALSE;
  2307             continue;
  2309         textce = getCE(strsrch, textce);
  2310         if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
  2311             // do the beginning stuff
  2312             int32_t failedoffset = ucol_getOffset(coleiter);
  2313             if (isSafe && failedoffset <= prefixlength) {
  2314                 // alas... no hope. failed at rearranged accent set
  2315                 cleanUpSafeText(strsrch, safetext, safebuffer);
  2316                 return USEARCH_DONE;
  2318             else {
  2319                 if (isSafe) {
  2320                     failedoffset = safeoffset - failedoffset;
  2321                     cleanUpSafeText(strsrch, safetext, safebuffer);
  2324                 // try rearranging the end accents
  2325                 int32_t result = doPreviousCanonicalSuffixMatch(strsrch,
  2326                                         textoffset, failedoffset, status);
  2327                 if (result != USEARCH_DONE) {
  2328                     // if status is a failure, ucol_setOffset does nothing
  2329                     setColEIterOffset(strsrch->textIter, result);
  2331                 if (U_FAILURE(*status)) {
  2332                     return USEARCH_DONE;
  2334                 return result;
  2337         if (textce == ce[ceindex]) {
  2338             ceindex ++;
  2341     // set offset here
  2342     if (isSafe) {
  2343         int32_t result      = ucol_getOffset(coleiter);
  2344         // sets the text iterator here with the correct expansion and offset
  2345         int32_t     leftoverces = getExpansionSuffix(coleiter);
  2346         cleanUpSafeText(strsrch, safetext, safebuffer);
  2347         if (result <= prefixlength) {
  2348             result = textoffset;
  2350         else {
  2351             result = textoffset + (safeoffset - result);
  2353         setColEIterOffset(strsrch->textIter, result);
  2354         setExpansionSuffix(strsrch->textIter, leftoverces);
  2355         return result;
  2358     return ucol_getOffset(coleiter);
  2361 /**
  2362 * Trying out the substring and sees if it can be a canonical match.
  2363 * This will try normalizing the starting accents and arranging them into
  2364 * canonical equivalents and check their corresponding ces with the pattern ce.
  2365 * Prefix accents in the text will be grouped according to their combining
  2366 * class and the groups will be mixed and matched to try find the perfect
  2367 * match with the pattern.
  2368 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
  2369 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
  2370 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
  2371 *         "\u0301\u0325".
  2372 * step 2: check if any of the generated substrings matches the pattern.
  2373 * Internal method, status assumed to be success, caller has to check status
  2374 * before calling this method.
  2375 * @param strsrch string search data
  2376 * @param textoffset start offset in the collation element text that starts
  2377 *                   with the accents to be rearranged
  2378 * @param status output error status if any
  2379 * @return TRUE if the match is valid, FALSE otherwise
  2380 */
  2381 static
  2382 UBool doPreviousCanonicalMatch(UStringSearch *strsrch,
  2383                                int32_t    textoffset,
  2384                                UErrorCode    *status)
  2386     const UChar       *text       = strsrch->search->text;
  2387           int32_t  temp       = textoffset;
  2388           int32_t      textlength = strsrch->search->textLength;
  2389     if ((getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) == 0) {
  2390         UCollationElements *coleiter = strsrch->textIter;
  2391         int32_t         offset   = ucol_getOffset(coleiter);
  2392         if (strsrch->pattern.hasSuffixAccents) {
  2393             offset = doPreviousCanonicalSuffixMatch(strsrch, textoffset,
  2394                                                     offset, status);
  2395             if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
  2396                 setColEIterOffset(coleiter, offset);
  2397                 return TRUE;
  2400         return FALSE;
  2403     if (!strsrch->pattern.hasPrefixAccents) {
  2404         return FALSE;
  2407     UChar       accents[INITIAL_ARRAY_SIZE_];
  2408     // offset to the last base character in substring to search
  2409     int32_t baseoffset = getNextBaseOffset(text, textoffset, textlength);
  2410     // normalizing the offensive string
  2411     unorm_normalize(text + textoffset, baseoffset - textoffset, UNORM_NFD,
  2412                                0, accents, INITIAL_ARRAY_SIZE_, status);
  2413     // status checked in loop
  2415     int32_t accentsindex[INITIAL_ARRAY_SIZE_];
  2416     int32_t size = getUnblockedAccentIndex(accents, accentsindex);
  2418     // 2 power n - 1 plus the full set of accents
  2419     int32_t  count = (2 << (size - 1)) - 1;
  2420     while (U_SUCCESS(*status) && count > 0) {
  2421         UChar *rearrange = strsrch->canonicalPrefixAccents;
  2422         // copy the base characters
  2423         for (int k = 0; k < accentsindex[0]; k ++) {
  2424             *rearrange ++ = accents[k];
  2426         // forming all possible canonical rearrangement by dropping
  2427         // sets of accents
  2428         for (int i = 0; i <= size - 1; i ++) {
  2429             int32_t mask = 1 << (size - i - 1);
  2430             if (count & mask) {
  2431                 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
  2432                     *rearrange ++ = accents[j];
  2436         *rearrange = 0;
  2437         int32_t offset = doPreviousCanonicalPrefixMatch(strsrch,
  2438                                                           baseoffset, status);
  2439         if (offset != USEARCH_DONE) {
  2440             return TRUE; // match found
  2442         count --;
  2444     return FALSE;
  2447 /**
  2448 * Checks match for contraction.
  2449 * If the match starts with a partial contraction we fail.
  2450 * Internal method, status assumed to be success, caller has to check status
  2451 * before calling this method.
  2452 * @param strsrch string search data
  2453 * @param start offset of potential match, to be modified if necessary
  2454 * @param end offset of potential match, to be modified if necessary
  2455 * @param status only error status if any
  2456 * @return TRUE if match passes the contraction test, FALSE otherwise
  2457 */
  2458 static
  2459 UBool checkPreviousCanonicalContractionMatch(UStringSearch *strsrch,
  2460                                      int32_t   *start,
  2461                                      int32_t   *end, UErrorCode  *status)
  2463           UCollationElements *coleiter   = strsrch->textIter;
  2464           int32_t             textlength = strsrch->search->textLength;
  2465           int32_t         temp       = *end;
  2466     const UCollator          *collator   = strsrch->collator;
  2467     const UChar              *text       = strsrch->search->text;
  2468     // This part checks if either if the start of the match contains potential
  2469     // contraction. If so we'll have to iterate through them
  2470     // Since we used ucol_next while previously looking for the potential
  2471     // match, this guarantees that our end will not be a partial contraction,
  2472     // or a partial supplementary character.
  2473     if (*start < textlength && ucol_unsafeCP(text[*start], collator)) {
  2474         int32_t expansion  = getExpansionSuffix(coleiter);
  2475         UBool   expandflag = expansion > 0;
  2476         setColEIterOffset(coleiter, *end);
  2477         while (expansion > 0) {
  2478             // getting rid of the redundant ce
  2479             // since forward contraction/expansion may have extra ces
  2480             // if we are in the normalization buffer, hasAccentsBeforeMatch
  2481             // would have taken care of it.
  2482             // E.g. the character \u01FA will have an expansion of 3, but if
  2483             // we are only looking for A ring A\u030A, we'll have to skip the
  2484             // last ce in the expansion buffer
  2485             ucol_previous(coleiter, status);
  2486             if (U_FAILURE(*status)) {
  2487                 return FALSE;
  2489             if (ucol_getOffset(coleiter) != temp) {
  2490                 *end = temp;
  2491                 temp  = ucol_getOffset(coleiter);
  2493             expansion --;
  2496         int32_t  *patternce       = strsrch->pattern.CE;
  2497         int32_t   patterncelength = strsrch->pattern.CELength;
  2498         int32_t   count           = patterncelength;
  2499         while (count > 0) {
  2500             int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
  2501             // status checked below, note that if status is a failure
  2502             // ucol_previous returns UCOL_NULLORDER
  2503             if (ce == UCOL_IGNORABLE) {
  2504                 continue;
  2506             if (expandflag && count == 0 &&
  2507                 getColElemIterOffset(coleiter, FALSE) != temp) {
  2508                 *end = temp;
  2509                 temp  = ucol_getOffset(coleiter);
  2511             if (count == patterncelength &&
  2512                 ce != patternce[patterncelength - 1]) {
  2513                 // accents may have extra starting ces, this occurs when a
  2514                 // pure accent pattern is matched without rearrangement
  2515                 int32_t    expected = patternce[patterncelength - 1];
  2516                 U16_BACK_1(text, 0, *end);
  2517                 if (getFCD(text, end, textlength) & LAST_BYTE_MASK_) {
  2518                     ce = getCE(strsrch, ucol_previous(coleiter, status));
  2519                     while (U_SUCCESS(*status) && ce != expected &&
  2520                            ce != UCOL_NULLORDER &&
  2521                            ucol_getOffset(coleiter) <= *start) {
  2522                         ce = getCE(strsrch, ucol_previous(coleiter, status));
  2526             if (U_FAILURE(*status) || ce != patternce[count - 1]) {
  2527                 (*start) --;
  2528                 *start = getPreviousBaseOffset(text, *start);
  2529                 return FALSE;
  2531             count --;
  2534     return TRUE;
  2537 /**
  2538 * Checks and sets the match information if found.
  2539 * Checks
  2540 * <ul>
  2541 * <li> the potential match does not repeat the previous match
  2542 * <li> boundaries are correct
  2543 * <li> potential match does not end in the middle of a contraction
  2544 * <li> identical matches
  2545 * <\ul>
  2546 * Otherwise the offset will be shifted to the next character.
  2547 * Internal method, status assumed to be success, caller has to check status
  2548 * before calling this method.
  2549 * @param strsrch string search data
  2550 * @param textoffset offset in the collation element text. the returned value
  2551 *        will be the truncated start offset of the match or the new start
  2552 *        search offset.
  2553 * @param status only error status if any
  2554 * @return TRUE if the match is valid, FALSE otherwise
  2555 */
  2556 static
  2557 inline UBool checkPreviousCanonicalMatch(UStringSearch *strsrch,
  2558                                          int32_t   *textoffset,
  2559                                          UErrorCode    *status)
  2561     // to ensure that the start and ends are not composite characters
  2562     UCollationElements *coleiter = strsrch->textIter;
  2563     // if we have a canonical accent match
  2564     if ((strsrch->pattern.hasSuffixAccents &&
  2565         strsrch->canonicalSuffixAccents[0]) ||
  2566         (strsrch->pattern.hasPrefixAccents &&
  2567         strsrch->canonicalPrefixAccents[0])) {
  2568         strsrch->search->matchedIndex  = *textoffset;
  2569         strsrch->search->matchedLength =
  2570             getNextUStringSearchBaseOffset(strsrch,
  2571                                       getColElemIterOffset(coleiter, FALSE))
  2572             - *textoffset;
  2573         return TRUE;
  2576     int32_t end = ucol_getOffset(coleiter);
  2577     if (!checkPreviousCanonicalContractionMatch(strsrch, textoffset, &end,
  2578                                                 status) ||
  2579          U_FAILURE(*status)) {
  2580         return FALSE;
  2583     end = getNextUStringSearchBaseOffset(strsrch, end);
  2584     // this totally matches, however we need to check if it is repeating
  2585     if (checkRepeatedMatch(strsrch, *textoffset, end) ||
  2586         !isBreakUnit(strsrch, *textoffset, end) ||
  2587         !checkIdentical(strsrch, *textoffset, end)) {
  2588         (*textoffset) --;
  2589         *textoffset = getPreviousBaseOffset(strsrch->search->text,
  2590                                             *textoffset);
  2591         return FALSE;
  2594     strsrch->search->matchedIndex  = *textoffset;
  2595     strsrch->search->matchedLength = end - *textoffset;
  2596     return TRUE;
  2598 #endif // #if BOYER_MOORE
  2600 // constructors and destructor -------------------------------------------
  2602 U_CAPI UStringSearch * U_EXPORT2 usearch_open(const UChar *pattern,
  2603                                           int32_t         patternlength,
  2604                                     const UChar          *text,
  2605                                           int32_t         textlength,
  2606                                     const char           *locale,
  2607                                           UBreakIterator *breakiter,
  2608                                           UErrorCode     *status)
  2610     if (U_FAILURE(*status)) {
  2611         return NULL;
  2613 #if UCONFIG_NO_BREAK_ITERATION
  2614     if (breakiter != NULL) {
  2615         *status = U_UNSUPPORTED_ERROR;
  2616         return NULL;
  2618 #endif
  2619     if (locale) {
  2620         // ucol_open internally checks for status
  2621         UCollator     *collator = ucol_open(locale, status);
  2622         // pattern, text checks are done in usearch_openFromCollator
  2623         UStringSearch *result   = usearch_openFromCollator(pattern,
  2624                                               patternlength, text, textlength,
  2625                                               collator, breakiter, status);
  2627         if (result == NULL || U_FAILURE(*status)) {
  2628             if (collator) {
  2629                 ucol_close(collator);
  2631             return NULL;
  2633         else {
  2634             result->ownCollator = TRUE;
  2636         return result;
  2638     *status = U_ILLEGAL_ARGUMENT_ERROR;
  2639     return NULL;
  2642 U_CAPI UStringSearch * U_EXPORT2 usearch_openFromCollator(
  2643                                   const UChar          *pattern,
  2644                                         int32_t         patternlength,
  2645                                   const UChar          *text,
  2646                                         int32_t         textlength,
  2647                                   const UCollator      *collator,
  2648                                         UBreakIterator *breakiter,
  2649                                         UErrorCode     *status)
  2651     if (U_FAILURE(*status)) {
  2652         return NULL;
  2654 #if UCONFIG_NO_BREAK_ITERATION
  2655     if (breakiter != NULL) {
  2656         *status = U_UNSUPPORTED_ERROR;
  2657         return NULL;
  2659 #endif
  2660     if (pattern == NULL || text == NULL || collator == NULL) {
  2661         *status = U_ILLEGAL_ARGUMENT_ERROR;
  2662         return NULL;
  2665     // string search does not really work when numeric collation is turned on
  2666     if(ucol_getAttribute(collator, UCOL_NUMERIC_COLLATION, status) == UCOL_ON) {
  2667         *status = U_UNSUPPORTED_ERROR;
  2668         return NULL;
  2671     if (U_SUCCESS(*status)) {
  2672         initializeFCD(status);
  2673         if (U_FAILURE(*status)) {
  2674             return NULL;
  2677         UStringSearch *result;
  2678         if (textlength == -1) {
  2679             textlength = u_strlen(text);
  2681         if (patternlength == -1) {
  2682             patternlength = u_strlen(pattern);
  2684         if (textlength <= 0 || patternlength <= 0) {
  2685             *status = U_ILLEGAL_ARGUMENT_ERROR;
  2686             return NULL;
  2689         result = (UStringSearch *)uprv_malloc(sizeof(UStringSearch));
  2690         if (result == NULL) {
  2691             *status = U_MEMORY_ALLOCATION_ERROR;
  2692             return NULL;
  2695         result->collator    = collator;
  2696         result->strength    = ucol_getStrength(collator);
  2697         result->ceMask      = getMask(result->strength);
  2698         result->toShift     =
  2699              ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
  2700                                                             UCOL_SHIFTED;
  2701         result->variableTop = ucol_getVariableTop(collator, status);
  2703         result->nfd         = Normalizer2Factory::getNFDInstance(*status);
  2705         if (U_FAILURE(*status)) {
  2706             uprv_free(result);
  2707             return NULL;
  2710         result->search             = (USearch *)uprv_malloc(sizeof(USearch));
  2711         if (result->search == NULL) {
  2712             *status = U_MEMORY_ALLOCATION_ERROR;
  2713             uprv_free(result);
  2714             return NULL;
  2717         result->search->text       = text;
  2718         result->search->textLength = textlength;
  2720         result->pattern.text       = pattern;
  2721         result->pattern.textLength = patternlength;
  2722         result->pattern.CE         = NULL;
  2723         result->pattern.PCE        = NULL;
  2725         result->search->breakIter  = breakiter;
  2726 #if !UCONFIG_NO_BREAK_ITERATION
  2727         result->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(result->collator, ULOC_VALID_LOCALE, status), text, textlength, status);
  2728         if (breakiter) {
  2729             ubrk_setText(breakiter, text, textlength, status);
  2731 #endif
  2733         result->ownCollator           = FALSE;
  2734         result->search->matchedLength = 0;
  2735         result->search->matchedIndex  = USEARCH_DONE;
  2736         result->utilIter              = NULL;
  2737         result->textIter              = ucol_openElements(collator, text,
  2738                                                           textlength, status);
  2739         if (U_FAILURE(*status)) {
  2740             usearch_close(result);
  2741             return NULL;
  2744         result->search->isOverlap          = FALSE;
  2745         result->search->isCanonicalMatch   = FALSE;
  2746         result->search->elementComparisonType = 0;
  2747         result->search->isForwardSearching = TRUE;
  2748         result->search->reset              = TRUE;
  2750         initialize(result, status);
  2752         if (U_FAILURE(*status)) {
  2753             usearch_close(result);
  2754             return NULL;
  2757         return result;
  2759     return NULL;
  2762 U_CAPI void U_EXPORT2 usearch_close(UStringSearch *strsrch)
  2764     if (strsrch) {
  2765         if (strsrch->pattern.CE != strsrch->pattern.CEBuffer &&
  2766             strsrch->pattern.CE) {
  2767             uprv_free(strsrch->pattern.CE);
  2770         if (strsrch->pattern.PCE != NULL &&
  2771             strsrch->pattern.PCE != strsrch->pattern.PCEBuffer) {
  2772             uprv_free(strsrch->pattern.PCE);
  2775         ucol_closeElements(strsrch->textIter);
  2776         ucol_closeElements(strsrch->utilIter);
  2778         if (strsrch->ownCollator && strsrch->collator) {
  2779             ucol_close((UCollator *)strsrch->collator);
  2782 #if !UCONFIG_NO_BREAK_ITERATION
  2783         if (strsrch->search->internalBreakIter) {
  2784             ubrk_close(strsrch->search->internalBreakIter);
  2786 #endif
  2788         uprv_free(strsrch->search);
  2789         uprv_free(strsrch);
  2793 // set and get methods --------------------------------------------------
  2795 U_CAPI void U_EXPORT2 usearch_setOffset(UStringSearch *strsrch,
  2796                                         int32_t    position,
  2797                                         UErrorCode    *status)
  2799     if (U_SUCCESS(*status) && strsrch) {
  2800         if (isOutOfBounds(strsrch->search->textLength, position)) {
  2801             *status = U_INDEX_OUTOFBOUNDS_ERROR;
  2803         else {
  2804             setColEIterOffset(strsrch->textIter, position);
  2806         strsrch->search->matchedIndex  = USEARCH_DONE;
  2807         strsrch->search->matchedLength = 0;
  2808         strsrch->search->reset         = FALSE;
  2812 U_CAPI int32_t U_EXPORT2 usearch_getOffset(const UStringSearch *strsrch)
  2814     if (strsrch) {
  2815         int32_t result = ucol_getOffset(strsrch->textIter);
  2816         if (isOutOfBounds(strsrch->search->textLength, result)) {
  2817             return USEARCH_DONE;
  2819         return result;
  2821     return USEARCH_DONE;
  2824 U_CAPI void U_EXPORT2 usearch_setAttribute(UStringSearch *strsrch,
  2825                                  USearchAttribute attribute,
  2826                                  USearchAttributeValue value,
  2827                                  UErrorCode *status)
  2829     if (U_SUCCESS(*status) && strsrch) {
  2830         switch (attribute)
  2832         case USEARCH_OVERLAP :
  2833             strsrch->search->isOverlap = (value == USEARCH_ON ? TRUE : FALSE);
  2834             break;
  2835         case USEARCH_CANONICAL_MATCH :
  2836             strsrch->search->isCanonicalMatch = (value == USEARCH_ON ? TRUE :
  2837                                                                       FALSE);
  2838             break;
  2839         case USEARCH_ELEMENT_COMPARISON :
  2840             if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) {
  2841                 strsrch->search->elementComparisonType = (int16_t)value;
  2842             } else {
  2843                 strsrch->search->elementComparisonType = 0;
  2845             break;
  2846         case USEARCH_ATTRIBUTE_COUNT :
  2847         default:
  2848             *status = U_ILLEGAL_ARGUMENT_ERROR;
  2851     if (value == USEARCH_ATTRIBUTE_VALUE_COUNT) {
  2852         *status = U_ILLEGAL_ARGUMENT_ERROR;
  2856 U_CAPI USearchAttributeValue U_EXPORT2 usearch_getAttribute(
  2857                                                 const UStringSearch *strsrch,
  2858                                                 USearchAttribute attribute)
  2860     if (strsrch) {
  2861         switch (attribute) {
  2862         case USEARCH_OVERLAP :
  2863             return (strsrch->search->isOverlap == TRUE ? USEARCH_ON :
  2864                                                         USEARCH_OFF);
  2865         case USEARCH_CANONICAL_MATCH :
  2866             return (strsrch->search->isCanonicalMatch == TRUE ? USEARCH_ON :
  2867                                                                USEARCH_OFF);
  2868         case USEARCH_ELEMENT_COMPARISON :
  2870                 int16_t value = strsrch->search->elementComparisonType;
  2871                 if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) {
  2872                     return (USearchAttributeValue)value;
  2873                 } else {
  2874                     return USEARCH_STANDARD_ELEMENT_COMPARISON;
  2877         case USEARCH_ATTRIBUTE_COUNT :
  2878             return USEARCH_DEFAULT;
  2881     return USEARCH_DEFAULT;
  2884 U_CAPI int32_t U_EXPORT2 usearch_getMatchedStart(
  2885                                                 const UStringSearch *strsrch)
  2887     if (strsrch == NULL) {
  2888         return USEARCH_DONE;
  2890     return strsrch->search->matchedIndex;
  2894 U_CAPI int32_t U_EXPORT2 usearch_getMatchedText(const UStringSearch *strsrch,
  2895                                             UChar         *result,
  2896                                             int32_t        resultCapacity,
  2897                                             UErrorCode    *status)
  2899     if (U_FAILURE(*status)) {
  2900         return USEARCH_DONE;
  2902     if (strsrch == NULL || resultCapacity < 0 || (resultCapacity > 0 &&
  2903         result == NULL)) {
  2904         *status = U_ILLEGAL_ARGUMENT_ERROR;
  2905         return USEARCH_DONE;
  2908     int32_t     copylength = strsrch->search->matchedLength;
  2909     int32_t copyindex  = strsrch->search->matchedIndex;
  2910     if (copyindex == USEARCH_DONE) {
  2911         u_terminateUChars(result, resultCapacity, 0, status);
  2912         return USEARCH_DONE;
  2915     if (resultCapacity < copylength) {
  2916         copylength = resultCapacity;
  2918     if (copylength > 0) {
  2919         uprv_memcpy(result, strsrch->search->text + copyindex,
  2920                     copylength * sizeof(UChar));
  2922     return u_terminateUChars(result, resultCapacity,
  2923                              strsrch->search->matchedLength, status);
  2926 U_CAPI int32_t U_EXPORT2 usearch_getMatchedLength(
  2927                                               const UStringSearch *strsrch)
  2929     if (strsrch) {
  2930         return strsrch->search->matchedLength;
  2932     return USEARCH_DONE;
  2935 #if !UCONFIG_NO_BREAK_ITERATION
  2937 U_CAPI void U_EXPORT2 usearch_setBreakIterator(UStringSearch  *strsrch,
  2938                                                UBreakIterator *breakiter,
  2939                                                UErrorCode     *status)
  2941     if (U_SUCCESS(*status) && strsrch) {
  2942         strsrch->search->breakIter = breakiter;
  2943         if (breakiter) {
  2944             ubrk_setText(breakiter, strsrch->search->text,
  2945                          strsrch->search->textLength, status);
  2950 U_CAPI const UBreakIterator* U_EXPORT2
  2951 usearch_getBreakIterator(const UStringSearch *strsrch)
  2953     if (strsrch) {
  2954         return strsrch->search->breakIter;
  2956     return NULL;
  2959 #endif
  2961 U_CAPI void U_EXPORT2 usearch_setText(      UStringSearch *strsrch,
  2962                                       const UChar         *text,
  2963                                             int32_t        textlength,
  2964                                             UErrorCode    *status)
  2966     if (U_SUCCESS(*status)) {
  2967         if (strsrch == NULL || text == NULL || textlength < -1 ||
  2968             textlength == 0) {
  2969             *status = U_ILLEGAL_ARGUMENT_ERROR;
  2971         else {
  2972             if (textlength == -1) {
  2973                 textlength = u_strlen(text);
  2975             strsrch->search->text       = text;
  2976             strsrch->search->textLength = textlength;
  2977             ucol_setText(strsrch->textIter, text, textlength, status);
  2978             strsrch->search->matchedIndex  = USEARCH_DONE;
  2979             strsrch->search->matchedLength = 0;
  2980             strsrch->search->reset         = TRUE;
  2981 #if !UCONFIG_NO_BREAK_ITERATION
  2982             if (strsrch->search->breakIter != NULL) {
  2983                 ubrk_setText(strsrch->search->breakIter, text,
  2984                              textlength, status);
  2986             ubrk_setText(strsrch->search->internalBreakIter, text, textlength, status);
  2987 #endif
  2992 U_CAPI const UChar * U_EXPORT2 usearch_getText(const UStringSearch *strsrch,
  2993                                                      int32_t       *length)
  2995     if (strsrch) {
  2996         *length = strsrch->search->textLength;
  2997         return strsrch->search->text;
  2999     return NULL;
  3002 U_CAPI void U_EXPORT2 usearch_setCollator(      UStringSearch *strsrch,
  3003                                           const UCollator     *collator,
  3004                                                 UErrorCode    *status)
  3006     if (U_SUCCESS(*status)) {
  3007         if (collator == NULL) {
  3008             *status = U_ILLEGAL_ARGUMENT_ERROR;
  3009             return;
  3012         if (strsrch) {
  3013             if (strsrch->ownCollator && (strsrch->collator != collator)) {
  3014                 ucol_close((UCollator *)strsrch->collator);
  3015                 strsrch->ownCollator = FALSE;
  3017             strsrch->collator    = collator;
  3018             strsrch->strength    = ucol_getStrength(collator);
  3019             strsrch->ceMask      = getMask(strsrch->strength);
  3020 #if !UCONFIG_NO_BREAK_ITERATION
  3021             ubrk_close(strsrch->search->internalBreakIter);
  3022             strsrch->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(collator, ULOC_VALID_LOCALE, status),
  3023                                                      strsrch->search->text, strsrch->search->textLength, status);
  3024 #endif
  3025             // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
  3026             strsrch->toShift     =
  3027                ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
  3028                                                                 UCOL_SHIFTED;
  3029             // if status is a failure, ucol_getVariableTop returns 0
  3030             strsrch->variableTop = ucol_getVariableTop(collator, status);
  3031             if (U_SUCCESS(*status)) {
  3032                 initialize(strsrch, status);
  3033                 if (U_SUCCESS(*status)) {
  3034                     /* free offset buffer to avoid memory leak before initializing. */
  3035                     ucol_freeOffsetBuffer(&(strsrch->textIter->iteratordata_));
  3036                     uprv_init_collIterate(collator, strsrch->search->text,
  3037                                           strsrch->search->textLength,
  3038                                           &(strsrch->textIter->iteratordata_),
  3039                                           status);
  3040                     strsrch->utilIter->iteratordata_.coll = collator;
  3045         // **** are these calls needed?
  3046         // **** we call uprv_init_pce in initializePatternPCETable
  3047         // **** and the CEBuffer constructor...
  3048 #if 0
  3049         uprv_init_pce(strsrch->textIter);
  3050         uprv_init_pce(strsrch->utilIter);
  3051 #endif
  3055 U_CAPI UCollator * U_EXPORT2 usearch_getCollator(const UStringSearch *strsrch)
  3057     if (strsrch) {
  3058         return (UCollator *)strsrch->collator;
  3060     return NULL;
  3063 U_CAPI void U_EXPORT2 usearch_setPattern(      UStringSearch *strsrch,
  3064                                          const UChar         *pattern,
  3065                                                int32_t        patternlength,
  3066                                                UErrorCode    *status)
  3068     if (U_SUCCESS(*status)) {
  3069         if (strsrch == NULL || pattern == NULL) {
  3070             *status = U_ILLEGAL_ARGUMENT_ERROR;
  3072         else {
  3073             if (patternlength == -1) {
  3074                 patternlength = u_strlen(pattern);
  3076             if (patternlength == 0) {
  3077                 *status = U_ILLEGAL_ARGUMENT_ERROR;
  3078                 return;
  3080             strsrch->pattern.text       = pattern;
  3081             strsrch->pattern.textLength = patternlength;
  3082             initialize(strsrch, status);
  3087 U_CAPI const UChar* U_EXPORT2
  3088 usearch_getPattern(const UStringSearch *strsrch,
  3089                    int32_t       *length)
  3091     if (strsrch) {
  3092         *length = strsrch->pattern.textLength;
  3093         return strsrch->pattern.text;
  3095     return NULL;
  3098 // miscellanous methods --------------------------------------------------
  3100 U_CAPI int32_t U_EXPORT2 usearch_first(UStringSearch *strsrch,
  3101                                            UErrorCode    *status)
  3103     if (strsrch && U_SUCCESS(*status)) {
  3104         strsrch->search->isForwardSearching = TRUE;
  3105         usearch_setOffset(strsrch, 0, status);
  3106         if (U_SUCCESS(*status)) {
  3107             return usearch_next(strsrch, status);
  3110     return USEARCH_DONE;
  3113 U_CAPI int32_t U_EXPORT2 usearch_following(UStringSearch *strsrch,
  3114                                                int32_t    position,
  3115                                                UErrorCode    *status)
  3117     if (strsrch && U_SUCCESS(*status)) {
  3118         strsrch->search->isForwardSearching = TRUE;
  3119         // position checked in usearch_setOffset
  3120         usearch_setOffset(strsrch, position, status);
  3121         if (U_SUCCESS(*status)) {
  3122             return usearch_next(strsrch, status);
  3125     return USEARCH_DONE;
  3128 U_CAPI int32_t U_EXPORT2 usearch_last(UStringSearch *strsrch,
  3129                                           UErrorCode    *status)
  3131     if (strsrch && U_SUCCESS(*status)) {
  3132         strsrch->search->isForwardSearching = FALSE;
  3133         usearch_setOffset(strsrch, strsrch->search->textLength, status);
  3134         if (U_SUCCESS(*status)) {
  3135             return usearch_previous(strsrch, status);
  3138     return USEARCH_DONE;
  3141 U_CAPI int32_t U_EXPORT2 usearch_preceding(UStringSearch *strsrch,
  3142                                                int32_t    position,
  3143                                                UErrorCode    *status)
  3145     if (strsrch && U_SUCCESS(*status)) {
  3146         strsrch->search->isForwardSearching = FALSE;
  3147         // position checked in usearch_setOffset
  3148         usearch_setOffset(strsrch, position, status);
  3149         if (U_SUCCESS(*status)) {
  3150             return usearch_previous(strsrch, status);
  3153     return USEARCH_DONE;
  3156 /**
  3157 * If a direction switch is required, we'll count the number of ces till the
  3158 * beginning of the collation element iterator and iterate forwards that
  3159 * number of times. This is so that we get to the correct point within the
  3160 * string to continue the search in. Imagine when we are in the middle of the
  3161 * normalization buffer when the change in direction is request. arrrgghh....
  3162 * After searching the offset within the collation element iterator will be
  3163 * shifted to the start of the match. If a match is not found, the offset would
  3164 * have been set to the end of the text string in the collation element
  3165 * iterator.
  3166 * Okay, here's my take on normalization buffer. The only time when there can
  3167 * be 2 matches within the same normalization is when the pattern is consists
  3168 * of all accents. But since the offset returned is from the text string, we
  3169 * should not confuse the caller by returning the second match within the
  3170 * same normalization buffer. If we do, the 2 results will have the same match
  3171 * offsets, and that'll be confusing. I'll return the next match that doesn't
  3172 * fall within the same normalization buffer. Note this does not affect the
  3173 * results of matches spanning the text and the normalization buffer.
  3174 * The position to start searching is taken from the collation element
  3175 * iterator. Callers of this API would have to set the offset in the collation
  3176 * element iterator before using this method.
  3177 */
  3178 U_CAPI int32_t U_EXPORT2 usearch_next(UStringSearch *strsrch,
  3179                                           UErrorCode    *status)
  3181     if (U_SUCCESS(*status) && strsrch) {
  3182         // note offset is either equivalent to the start of the previous match
  3183         // or is set by the user
  3184         int32_t      offset       = usearch_getOffset(strsrch);
  3185         USearch     *search       = strsrch->search;
  3186         search->reset             = FALSE;
  3187         int32_t      textlength   = search->textLength;
  3188         if (search->isForwardSearching) {
  3189 #if BOYER_MOORE
  3190             if (offset == textlength
  3191                 || (!search->isOverlap &&
  3192                     (offset + strsrch->pattern.defaultShiftSize > textlength ||
  3193                     (search->matchedIndex != USEARCH_DONE &&
  3194                      offset + search->matchedLength >= textlength)))) {
  3195                 // not enough characters to match
  3196                 setMatchNotFound(strsrch);
  3197                 return USEARCH_DONE;
  3199 #else
  3200             if (offset == textlength ||
  3201                 (! search->isOverlap &&
  3202                 (search->matchedIndex != USEARCH_DONE &&
  3203                 offset + search->matchedLength > textlength))) {
  3204                     // not enough characters to match
  3205                     setMatchNotFound(strsrch);
  3206                     return USEARCH_DONE;
  3208 #endif
  3210         else {
  3211             // switching direction.
  3212             // if matchedIndex == USEARCH_DONE, it means that either a
  3213             // setOffset has been called or that previous ran off the text
  3214             // string. the iterator would have been set to offset 0 if a
  3215             // match is not found.
  3216             search->isForwardSearching = TRUE;
  3217             if (search->matchedIndex != USEARCH_DONE) {
  3218                 // there's no need to set the collation element iterator
  3219                 // the next call to next will set the offset.
  3220                 return search->matchedIndex;
  3224         if (U_SUCCESS(*status)) {
  3225             if (strsrch->pattern.CELength == 0) {
  3226                 if (search->matchedIndex == USEARCH_DONE) {
  3227                     search->matchedIndex = offset;
  3229                 else { // moves by codepoints
  3230                     U16_FWD_1(search->text, search->matchedIndex, textlength);
  3233                 search->matchedLength = 0;
  3234                 setColEIterOffset(strsrch->textIter, search->matchedIndex);
  3235                 // status checked below
  3236                 if (search->matchedIndex == textlength) {
  3237                     search->matchedIndex = USEARCH_DONE;
  3240             else {
  3241                 if (search->matchedLength > 0) {
  3242                     // if matchlength is 0 we are at the start of the iteration
  3243                     if (search->isOverlap) {
  3244                         ucol_setOffset(strsrch->textIter, offset + 1, status);
  3246                     else {
  3247                         ucol_setOffset(strsrch->textIter,
  3248                                        offset + search->matchedLength, status);
  3251                 else {
  3252                     // for boundary check purposes. this will ensure that the
  3253                     // next match will not preceed the current offset
  3254                     // note search->matchedIndex will always be set to something
  3255                     // in the code
  3256                     search->matchedIndex = offset - 1;
  3259                 if (search->isCanonicalMatch) {
  3260                     // can't use exact here since extra accents are allowed.
  3261                     usearch_handleNextCanonical(strsrch, status);
  3263                 else {
  3264                     usearch_handleNextExact(strsrch, status);
  3268             if (U_FAILURE(*status)) {
  3269                 return USEARCH_DONE;
  3272 #if !BOYER_MOORE
  3273             if (search->matchedIndex == USEARCH_DONE) {
  3274                 ucol_setOffset(strsrch->textIter, search->textLength, status);
  3275             } else {
  3276                 ucol_setOffset(strsrch->textIter, search->matchedIndex, status);
  3278 #endif
  3280             return search->matchedIndex;
  3283     return USEARCH_DONE;
  3286 U_CAPI int32_t U_EXPORT2 usearch_previous(UStringSearch *strsrch,
  3287                                               UErrorCode *status)
  3289     if (U_SUCCESS(*status) && strsrch) {
  3290         int32_t offset;
  3291         USearch *search = strsrch->search;
  3292         if (search->reset) {
  3293             offset                     = search->textLength;
  3294             search->isForwardSearching = FALSE;
  3295             search->reset              = FALSE;
  3296             setColEIterOffset(strsrch->textIter, offset);
  3298         else {
  3299             offset = usearch_getOffset(strsrch);
  3302         int32_t matchedindex = search->matchedIndex;
  3303         if (search->isForwardSearching == TRUE) {
  3304             // switching direction.
  3305             // if matchedIndex == USEARCH_DONE, it means that either a
  3306             // setOffset has been called or that next ran off the text
  3307             // string. the iterator would have been set to offset textLength if
  3308             // a match is not found.
  3309             search->isForwardSearching = FALSE;
  3310             if (matchedindex != USEARCH_DONE) {
  3311                 return matchedindex;
  3314         else {
  3315 #if BOYER_MOORE
  3316             if (offset == 0 || matchedindex == 0 ||
  3317                 (!search->isOverlap &&
  3318                     (offset < strsrch->pattern.defaultShiftSize ||
  3319                     (matchedindex != USEARCH_DONE &&
  3320                     matchedindex < strsrch->pattern.defaultShiftSize)))) {
  3321                 // not enough characters to match
  3322                 setMatchNotFound(strsrch);
  3323                 return USEARCH_DONE;
  3325 #else
  3326             // Could check pattern length, but the
  3327             // linear search will do the right thing
  3328             if (offset == 0 || matchedindex == 0) {
  3329                 setMatchNotFound(strsrch);
  3330                 return USEARCH_DONE;
  3332 #endif
  3335         if (U_SUCCESS(*status)) {
  3336             if (strsrch->pattern.CELength == 0) {
  3337                 search->matchedIndex =
  3338                       (matchedindex == USEARCH_DONE ? offset : matchedindex);
  3339                 if (search->matchedIndex == 0) {
  3340                     setMatchNotFound(strsrch);
  3341                     // status checked below
  3343                 else { // move by codepoints
  3344                     U16_BACK_1(search->text, 0, search->matchedIndex);
  3345                     setColEIterOffset(strsrch->textIter, search->matchedIndex);
  3346                     // status checked below
  3347                     search->matchedLength = 0;
  3350             else {
  3351                 if (strsrch->search->isCanonicalMatch) {
  3352                     // can't use exact here since extra accents are allowed.
  3353                     usearch_handlePreviousCanonical(strsrch, status);
  3354                     // status checked below
  3356                 else {
  3357                     usearch_handlePreviousExact(strsrch, status);
  3358                     // status checked below
  3362             if (U_FAILURE(*status)) {
  3363                 return USEARCH_DONE;
  3366             return search->matchedIndex;
  3369     return USEARCH_DONE;
  3374 U_CAPI void U_EXPORT2 usearch_reset(UStringSearch *strsrch)
  3376     /*
  3377     reset is setting the attributes that are already in
  3378     string search, hence all attributes in the collator should
  3379     be retrieved without any problems
  3380     */
  3381     if (strsrch) {
  3382         UErrorCode status            = U_ZERO_ERROR;
  3383         UBool      sameCollAttribute = TRUE;
  3384         uint32_t   ceMask;
  3385         UBool      shift;
  3386         uint32_t   varTop;
  3388         // **** hack to deal w/ how processed CEs encode quaternary ****
  3389         UCollationStrength newStrength = ucol_getStrength(strsrch->collator);
  3390         if ((strsrch->strength < UCOL_QUATERNARY && newStrength >= UCOL_QUATERNARY) ||
  3391             (strsrch->strength >= UCOL_QUATERNARY && newStrength < UCOL_QUATERNARY)) {
  3392                 sameCollAttribute = FALSE;
  3395         strsrch->strength    = ucol_getStrength(strsrch->collator);
  3396         ceMask = getMask(strsrch->strength);
  3397         if (strsrch->ceMask != ceMask) {
  3398             strsrch->ceMask = ceMask;
  3399             sameCollAttribute = FALSE;
  3402         // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
  3403         shift = ucol_getAttribute(strsrch->collator, UCOL_ALTERNATE_HANDLING,
  3404                                   &status) == UCOL_SHIFTED;
  3405         if (strsrch->toShift != shift) {
  3406             strsrch->toShift  = shift;
  3407             sameCollAttribute = FALSE;
  3410         // if status is a failure, ucol_getVariableTop returns 0
  3411         varTop = ucol_getVariableTop(strsrch->collator, &status);
  3412         if (strsrch->variableTop != varTop) {
  3413             strsrch->variableTop = varTop;
  3414             sameCollAttribute    = FALSE;
  3416         if (!sameCollAttribute) {
  3417             initialize(strsrch, &status);
  3419         /* free offset buffer to avoid memory leak before initializing. */
  3420         ucol_freeOffsetBuffer(&(strsrch->textIter->iteratordata_));
  3421         uprv_init_collIterate(strsrch->collator, strsrch->search->text,
  3422                               strsrch->search->textLength,
  3423                               &(strsrch->textIter->iteratordata_),
  3424                               &status);
  3425         strsrch->search->matchedLength      = 0;
  3426         strsrch->search->matchedIndex       = USEARCH_DONE;
  3427         strsrch->search->isOverlap          = FALSE;
  3428         strsrch->search->isCanonicalMatch   = FALSE;
  3429         strsrch->search->elementComparisonType = 0;
  3430         strsrch->search->isForwardSearching = TRUE;
  3431         strsrch->search->reset              = TRUE;
  3435 //
  3436 //  CEI  Collation Element + source text index.
  3437 //       These structs are kept in the circular buffer.
  3438 //
  3439 struct  CEI {
  3440     int64_t ce;
  3441     int32_t lowIndex;
  3442     int32_t highIndex;
  3443 };
  3445 U_NAMESPACE_BEGIN
  3448 //
  3449 //  CEBuffer   A circular buffer of CEs from the text being searched.
  3450 //
  3451 #define   DEFAULT_CEBUFFER_SIZE 96
  3452 #define   CEBUFFER_EXTRA 32
  3453 // Some typical max values to make buffer size more reasonable for asymmetric search.
  3454 // #8694 is for a better long-term solution to allocation of this buffer.
  3455 #define   MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L 8
  3456 #define   MAX_TARGET_IGNORABLES_PER_PAT_OTHER 3
  3457 #define   MIGHT_BE_JAMO_L(c) ((c >= 0x1100 && c <= 0x115E) || (c >= 0x3131 && c <= 0x314E) || (c >= 0x3165 && c <= 0x3186))
  3458 struct CEBuffer {
  3459     CEI                  defBuf[DEFAULT_CEBUFFER_SIZE];
  3460     CEI                 *buf;
  3461     int32_t              bufSize;
  3462     int32_t              firstIx;
  3463     int32_t              limitIx;
  3464     UCollationElements  *ceIter;
  3465     UStringSearch       *strSearch;
  3469                CEBuffer(UStringSearch *ss, UErrorCode *status);
  3470                ~CEBuffer();
  3471    const CEI   *get(int32_t index);
  3472    const CEI   *getPrevious(int32_t index);
  3473 };
  3476 CEBuffer::CEBuffer(UStringSearch *ss, UErrorCode *status) {
  3477     buf = defBuf;
  3478     strSearch = ss;
  3479     bufSize = ss->pattern.PCELength + CEBUFFER_EXTRA;
  3480     if (ss->search->elementComparisonType != 0) {
  3481         const UChar * patText = ss->pattern.text;
  3482         if (patText) {
  3483             const UChar * patTextLimit = patText + ss->pattern.textLength;
  3484             while ( patText < patTextLimit ) {
  3485                 UChar c = *patText++;
  3486                 if (MIGHT_BE_JAMO_L(c)) {
  3487                     bufSize += MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L;
  3488                 } else {
  3489                     // No check for surrogates, we might allocate slightly more buffer than necessary.
  3490                     bufSize += MAX_TARGET_IGNORABLES_PER_PAT_OTHER;
  3495     ceIter    = ss->textIter;
  3496     firstIx = 0;
  3497     limitIx = 0;
  3499     uprv_init_pce(ceIter);
  3501     if (bufSize>DEFAULT_CEBUFFER_SIZE) {
  3502         buf = (CEI *)uprv_malloc(bufSize * sizeof(CEI));
  3503         if (buf == NULL) {
  3504             *status = U_MEMORY_ALLOCATION_ERROR;
  3509 // TODO: add a reset or init function so that allocated
  3510 //       buffers can be retained & reused.
  3512 CEBuffer::~CEBuffer() {
  3513     if (buf != defBuf) {
  3514         uprv_free(buf);
  3519 // Get the CE with the specified index.
  3520 //   Index must be in the range
  3521 //          n-history_size < index < n+1
  3522 //   where n is the largest index to have been fetched by some previous call to this function.
  3523 //   The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
  3524 //
  3525 const CEI *CEBuffer::get(int32_t index) {
  3526     int i = index % bufSize;
  3528     if (index>=firstIx && index<limitIx) {
  3529         // The request was for an entry already in our buffer.
  3530         //  Just return it.
  3531         return &buf[i];
  3534     // Caller is requesting a new, never accessed before, CE.
  3535     //   Verify that it is the next one in sequence, which is all
  3536     //   that is allowed.
  3537     if (index != limitIx) {
  3538         U_ASSERT(FALSE);
  3540         return NULL;
  3543     // Manage the circular CE buffer indexing
  3544     limitIx++;
  3546     if (limitIx - firstIx >= bufSize) {
  3547         // The buffer is full, knock out the lowest-indexed entry.
  3548         firstIx++;
  3551     UErrorCode status = U_ZERO_ERROR;
  3553     buf[i].ce = ucol_nextProcessed(ceIter, &buf[i].lowIndex, &buf[i].highIndex, &status);
  3555     return &buf[i];
  3558 // Get the CE with the specified index.
  3559 //   Index must be in the range
  3560 //          n-history_size < index < n+1
  3561 //   where n is the largest index to have been fetched by some previous call to this function.
  3562 //   The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
  3563 //
  3564 const CEI *CEBuffer::getPrevious(int32_t index) {
  3565     int i = index % bufSize;
  3567     if (index>=firstIx && index<limitIx) {
  3568         // The request was for an entry already in our buffer.
  3569         //  Just return it.
  3570         return &buf[i];
  3573     // Caller is requesting a new, never accessed before, CE.
  3574     //   Verify that it is the next one in sequence, which is all
  3575     //   that is allowed.
  3576     if (index != limitIx) {
  3577         U_ASSERT(FALSE);
  3579         return NULL;
  3582     // Manage the circular CE buffer indexing
  3583     limitIx++;
  3585     if (limitIx - firstIx >= bufSize) {
  3586         // The buffer is full, knock out the lowest-indexed entry.
  3587         firstIx++;
  3590     UErrorCode status = U_ZERO_ERROR;
  3592     buf[i].ce = ucol_previousProcessed(ceIter, &buf[i].lowIndex, &buf[i].highIndex, &status);
  3594     return &buf[i];
  3597 U_NAMESPACE_END
  3600 // #define USEARCH_DEBUG
  3602 #ifdef USEARCH_DEBUG
  3603 #include <stdio.h>
  3604 #include <stdlib.h>
  3605 #endif
  3607 /*
  3608  * Find the next break boundary after startIndex. If the UStringSearch object
  3609  * has an external break iterator, use that. Otherwise use the internal character
  3610  * break iterator.
  3611  */
  3612 static int32_t nextBoundaryAfter(UStringSearch *strsrch, int32_t startIndex) {
  3613 #if 0
  3614     const UChar *text = strsrch->search->text;
  3615     int32_t textLen   = strsrch->search->textLength;
  3617     U_ASSERT(startIndex>=0);
  3618     U_ASSERT(startIndex<=textLen);
  3620     if (startIndex >= textLen) {
  3621         return startIndex;
  3624     UChar32  c;
  3625     int32_t  i = startIndex;
  3626     U16_NEXT(text, i, textLen, c);
  3628     // If we are on a control character, stop without looking for combining marks.
  3629     //    Control characters do not combine.
  3630     int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
  3631     if (gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR) {
  3632         return i;
  3635     // The initial character was not a control, and can thus accept trailing
  3636     //   combining characters.  Advance over however many of them there are.
  3637     int32_t  indexOfLastCharChecked;
  3638     for (;;) {
  3639         indexOfLastCharChecked = i;
  3640         if (i>=textLen) {
  3641             break;
  3643         U16_NEXT(text, i, textLen, c);
  3644         gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
  3645         if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
  3646             break;
  3649     return indexOfLastCharChecked;
  3650 #elif !UCONFIG_NO_BREAK_ITERATION
  3651     UBreakIterator *breakiterator = strsrch->search->breakIter;
  3653     if (breakiterator == NULL) {
  3654         breakiterator = strsrch->search->internalBreakIter;
  3657     if (breakiterator != NULL) {
  3658         return ubrk_following(breakiterator, startIndex);
  3661     return startIndex;
  3662 #else
  3663     // **** or should we use the original code? ****
  3664     return startIndex;
  3665 #endif
  3669 /*
  3670  * Returns TRUE if index is on a break boundary. If the UStringSearch
  3671  * has an external break iterator, test using that, otherwise test
  3672  * using the internal character break iterator.
  3673  */
  3674 static UBool isBreakBoundary(UStringSearch *strsrch, int32_t index) {
  3675 #if 0
  3676     const UChar *text = strsrch->search->text;
  3677     int32_t textLen   = strsrch->search->textLength;
  3679     U_ASSERT(index>=0);
  3680     U_ASSERT(index<=textLen);
  3682     if (index>=textLen || index<=0) {
  3683         return TRUE;
  3686     // If the character at the current index is not a GRAPHEME_EXTEND
  3687     //    then we can not be within a combining sequence.
  3688     UChar32  c;
  3689     U16_GET(text, 0, index, textLen, c);
  3690     int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
  3691     if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
  3692         return TRUE;
  3695     // We are at a combining mark.  If the preceding character is anything
  3696     //   except a CONTROL, CR or LF, we are in a combining sequence.
  3697     U16_PREV(text, 0, index, c);
  3698     gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
  3699     UBool combining =  !(gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR);
  3700     return !combining;
  3701 #elif !UCONFIG_NO_BREAK_ITERATION
  3702     UBreakIterator *breakiterator = strsrch->search->breakIter;
  3704     if (breakiterator == NULL) {
  3705         breakiterator = strsrch->search->internalBreakIter;
  3708     return (breakiterator != NULL && ubrk_isBoundary(breakiterator, index));
  3709 #else
  3710     // **** or use the original code? ****
  3711     return TRUE;
  3712 #endif
  3715 #if 0
  3716 static UBool onBreakBoundaries(const UStringSearch *strsrch, int32_t start, int32_t end)
  3718 #if !UCONFIG_NO_BREAK_ITERATION
  3719     UBreakIterator *breakiterator = strsrch->search->breakIter;
  3721     if (breakiterator != NULL) {
  3722         int32_t startindex = ubrk_first(breakiterator);
  3723         int32_t endindex   = ubrk_last(breakiterator);
  3725         // out-of-range indexes are never boundary positions
  3726         if (start < startindex || start > endindex ||
  3727             end < startindex || end > endindex) {
  3728             return FALSE;
  3731         return ubrk_isBoundary(breakiterator, start) &&
  3732                ubrk_isBoundary(breakiterator, end);
  3734 #endif
  3736     return TRUE;
  3738 #endif
  3740 typedef enum {
  3741     U_CE_MATCH = -1,
  3742     U_CE_NO_MATCH = 0,
  3743     U_CE_SKIP_TARG,
  3744     U_CE_SKIP_PATN
  3745 } UCompareCEsResult;
  3746 #define U_CE_LEVEL2_BASE 0x00000005
  3747 #define U_CE_LEVEL3_BASE 0x00050000
  3749 static UCompareCEsResult compareCE64s(int64_t targCE, int64_t patCE, int16_t compareType) {
  3750     if (targCE == patCE) {
  3751         return U_CE_MATCH;
  3753     if (compareType == 0) {
  3754         return U_CE_NO_MATCH;
  3757     int64_t targCEshifted = targCE >> 32;
  3758     int64_t patCEshifted = patCE >> 32;
  3759     int64_t mask;
  3761     mask = 0xFFFF0000;
  3762     int32_t targLev1 = (int32_t)(targCEshifted & mask);
  3763     int32_t patLev1 = (int32_t)(patCEshifted & mask);
  3764     if ( targLev1 != patLev1 ) {
  3765         if ( targLev1 == 0 ) {
  3766             return U_CE_SKIP_TARG;
  3768         if ( patLev1 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) {
  3769             return U_CE_SKIP_PATN;
  3771         return U_CE_NO_MATCH;
  3774     mask = 0x0000FFFF;
  3775     int32_t targLev2 = (int32_t)(targCEshifted & mask);
  3776     int32_t patLev2 = (int32_t)(patCEshifted & mask);
  3777     if ( targLev2 != patLev2 ) {
  3778         if ( targLev2 == 0 ) {
  3779             return U_CE_SKIP_TARG;
  3781         if ( patLev2 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) {
  3782             return U_CE_SKIP_PATN;
  3784         return (patLev2 == U_CE_LEVEL2_BASE || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev2 == U_CE_LEVEL2_BASE) )?
  3785             U_CE_MATCH: U_CE_NO_MATCH;
  3788     mask = 0xFFFF0000;
  3789     int32_t targLev3 = (int32_t)(targCE & mask);
  3790     int32_t patLev3 = (int32_t)(patCE & mask);
  3791     if ( targLev3 != patLev3 ) {
  3792         return (patLev3 == U_CE_LEVEL3_BASE || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev3 == U_CE_LEVEL3_BASE) )?
  3793             U_CE_MATCH: U_CE_NO_MATCH;
  3796     return U_CE_MATCH;
  3799 #if BOYER_MOORE
  3800 // TODO: #if BOYER_MOORE, need 32-bit version of compareCE64s
  3801 #endif
  3803 U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch  *strsrch,
  3804                                        int32_t        startIdx,
  3805                                        int32_t        *matchStart,
  3806                                        int32_t        *matchLimit,
  3807                                        UErrorCode     *status)
  3809     if (U_FAILURE(*status)) {
  3810         return FALSE;
  3813     // TODO:  reject search patterns beginning with a combining char.
  3815 #ifdef USEARCH_DEBUG
  3816     if (getenv("USEARCH_DEBUG") != NULL) {
  3817         printf("Pattern CEs\n");
  3818         for (int ii=0; ii<strsrch->pattern.CELength; ii++) {
  3819             printf(" %8x", strsrch->pattern.CE[ii]);
  3821         printf("\n");
  3824 #endif
  3825     // Input parameter sanity check.
  3826     //  TODO:  should input indicies clip to the text length
  3827     //         in the same way that UText does.
  3828     if(strsrch->pattern.CELength == 0         ||
  3829        startIdx < 0                           ||
  3830        startIdx > strsrch->search->textLength ||
  3831        strsrch->pattern.CE == NULL) {
  3832            *status = U_ILLEGAL_ARGUMENT_ERROR;
  3833            return FALSE;
  3836     if (strsrch->pattern.PCE == NULL) {
  3837         initializePatternPCETable(strsrch, status);
  3840     ucol_setOffset(strsrch->textIter, startIdx, status);
  3841     CEBuffer ceb(strsrch, status);
  3844     int32_t    targetIx = 0;
  3845     const CEI *targetCEI = NULL;
  3846     int32_t    patIx;
  3847     UBool      found;
  3849     int32_t  mStart = -1;
  3850     int32_t  mLimit = -1;
  3851     int32_t  minLimit;
  3852     int32_t  maxLimit;
  3856     // Outer loop moves over match starting positions in the
  3857     //      target CE space.
  3858     // Here we see the target as a sequence of collation elements, resulting from the following:
  3859     // 1. Target characters were decomposed, and (if appropriate) other compressions and expansions are applied
  3860     //    (for example, digraphs such as IJ may be broken into two characters).
  3861     // 2. An int64_t CE weight is determined for each resulting unit (high 16 bits are primary strength, next
  3862     //    16 bits are secondary, next 16 (the high 16 bits of the low 32-bit half) are tertiary. Any of these
  3863     //    fields that are for strengths below that of the collator are set to 0. If this makes the int64_t
  3864     //    CE weight 0 (as for a combining diacritic with secondary weight when the collator strentgh is primary),
  3865     //    then the CE is deleted, so the following code sees only CEs that are relevant.
  3866     // For each CE, the lowIndex and highIndex correspond to where this CE begins and ends in the original text.
  3867     // If lowIndex==highIndex, either the CE resulted from an expansion/decomposition of one of the original text
  3868     // characters, or the CE marks the limit of the target text (in which case the CE weight is UCOL_PROCESSED_NULLORDER).
  3869     //
  3870     for(targetIx=0; ; targetIx++)
  3872         found = TRUE;
  3873         //  Inner loop checks for a match beginning at each
  3874         //  position from the outer loop.
  3875         int32_t targetIxOffset = 0;
  3876         int64_t patCE = 0;
  3877         // For targetIx > 0, this ceb.get gets a CE that is as far back in the ring buffer
  3878         // (compared to the last CE fetched for the previous targetIx value) as we need to go
  3879         // for this targetIx value, so if it is non-NULL then other ceb.get calls should be OK.
  3880         const CEI *firstCEI = ceb.get(targetIx);
  3881         if (firstCEI == NULL) {
  3882             *status = U_INTERNAL_PROGRAM_ERROR;
  3883             found = FALSE;
  3884             break;
  3887         for (patIx=0; patIx<strsrch->pattern.PCELength; patIx++) {
  3888             patCE = strsrch->pattern.PCE[patIx];
  3889             targetCEI = ceb.get(targetIx+patIx+targetIxOffset);
  3890             //  Compare CE from target string with CE from the pattern.
  3891             //    Note that the target CE will be UCOL_PROCESSED_NULLORDER if we reach the end of input,
  3892             //    which will fail the compare, below.
  3893             UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType);
  3894             if ( ceMatch == U_CE_NO_MATCH ) {
  3895                 found = FALSE;
  3896                 break;
  3897             } else if ( ceMatch > U_CE_NO_MATCH ) {
  3898                 if ( ceMatch == U_CE_SKIP_TARG ) {
  3899                     // redo with same patCE, next targCE
  3900                     patIx--;
  3901                     targetIxOffset++;
  3902                 } else { // ceMatch == U_CE_SKIP_PATN
  3903                     // redo with same targCE, next patCE
  3904                     targetIxOffset--;
  3908         targetIxOffset += strsrch->pattern.PCELength; // this is now the offset in target CE space to end of the match so far
  3910         if (!found && ((targetCEI == NULL) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) {
  3911             // No match at this targetIx.  Try again at the next.
  3912             continue;
  3915         if (!found) {
  3916             // No match at all, we have run off the end of the target text.
  3917             break;
  3921         // We have found a match in CE space.
  3922         // Now determine the bounds in string index space.
  3923         //  There still is a chance of match failure if the CE range not correspond to
  3924         //     an acceptable character range.
  3925         //
  3926         const CEI *lastCEI  = ceb.get(targetIx + targetIxOffset - 1);
  3928         mStart   = firstCEI->lowIndex;
  3929         minLimit = lastCEI->lowIndex;
  3931         // Look at the CE following the match.  If it is UCOL_NULLORDER the match
  3932         //   extended to the end of input, and the match is good.
  3934         // Look at the high and low indices of the CE following the match. If
  3935         // they are the same it means one of two things:
  3936         //    1. The match extended to the last CE from the target text, which is OK, or
  3937         //    2. The last CE that was part of the match is in an expansion that extends
  3938         //       to the first CE after the match. In this case, we reject the match.
  3939         const CEI *nextCEI = 0;
  3940         if (strsrch->search->elementComparisonType == 0) {
  3941             nextCEI  = ceb.get(targetIx + targetIxOffset);
  3942             maxLimit = nextCEI->lowIndex;
  3943             if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) {
  3944                 found = FALSE;
  3946         } else {
  3947             for ( ; ; ++targetIxOffset ) {
  3948                 nextCEI = ceb.get(targetIx + targetIxOffset);
  3949                 maxLimit = nextCEI->lowIndex;
  3950                 // If we are at the end of the target too, match succeeds
  3951                 if (  nextCEI->ce == UCOL_PROCESSED_NULLORDER ) {
  3952                     break;
  3954                 // As long as the next CE has primary weight of 0,
  3955                 // it is part of the last target element matched by the pattern;
  3956                 // make sure it can be part of a match with the last patCE
  3957                 if ( (((nextCEI->ce) >> 32) & 0xFFFF0000UL) == 0 ) {
  3958                     UCompareCEsResult ceMatch = compareCE64s(nextCEI->ce, patCE, strsrch->search->elementComparisonType);
  3959                     if ( ceMatch == U_CE_NO_MATCH || ceMatch == U_CE_SKIP_PATN ) {
  3960                         found = FALSE;
  3961                         break;
  3963                 // If lowIndex == highIndex, this target CE is part of an expansion of the last matched
  3964                 // target element, but it has non-zero primary weight => match fails
  3965                 } else if ( nextCEI->lowIndex == nextCEI->highIndex ) {
  3966                     found = false;
  3967                     break;
  3968                 // Else the target CE is not part of an expansion of the last matched element, match succeeds
  3969                 } else {
  3970                     break;
  3976         // Check for the start of the match being within a combining sequence.
  3977         //   This can happen if the pattern itself begins with a combining char, and
  3978         //   the match found combining marks in the target text that were attached
  3979         //    to something else.
  3980         //   This type of match should be rejected for not completely consuming a
  3981         //   combining sequence.
  3982         if (!isBreakBoundary(strsrch, mStart)) {
  3983             found = FALSE;
  3986         // Check for the start of the match being within an Collation Element Expansion,
  3987         //   meaning that the first char of the match is only partially matched.
  3988         //   With exapnsions, the first CE will report the index of the source
  3989         //   character, and all subsequent (expansions) CEs will report the source index of the
  3990         //    _following_ character.
  3991         int32_t secondIx = firstCEI->highIndex;
  3992         if (mStart == secondIx) {
  3993             found = FALSE;
  3996         //  Advance the match end position to the first acceptable match boundary.
  3997         //    This advances the index over any combining charcters.
  3998         mLimit = maxLimit;
  3999         if (minLimit < maxLimit) {
  4000             // When the last CE's low index is same with its high index, the CE is likely
  4001             // a part of expansion. In this case, the index is located just after the
  4002             // character corresponding to the CEs compared above. If the index is right
  4003             // at the break boundary, move the position to the next boundary will result
  4004             // incorrect match length when there are ignorable characters exist between
  4005             // the position and the next character produces CE(s). See ticket#8482.
  4006             if (minLimit == lastCEI->highIndex && isBreakBoundary(strsrch, minLimit)) {
  4007                 mLimit = minLimit;
  4008             } else {
  4009                 int32_t nba = nextBoundaryAfter(strsrch, minLimit);
  4010                 if (nba >= lastCEI->highIndex) {
  4011                     mLimit = nba;
  4016     #ifdef USEARCH_DEBUG
  4017         if (getenv("USEARCH_DEBUG") != NULL) {
  4018             printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit);
  4020     #endif
  4022         // If advancing to the end of a combining sequence in character indexing space
  4023         //   advanced us beyond the end of the match in CE space, reject this match.
  4024         if (mLimit > maxLimit) {
  4025             found = FALSE;
  4028         if (!isBreakBoundary(strsrch, mLimit)) {
  4029             found = FALSE;
  4032         if (! checkIdentical(strsrch, mStart, mLimit)) {
  4033             found = FALSE;
  4036         if (found) {
  4037             break;
  4041     #ifdef USEARCH_DEBUG
  4042     if (getenv("USEARCH_DEBUG") != NULL) {
  4043         printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx);
  4044         int32_t  lastToPrint = ceb.limitIx+2;
  4045         for (int ii=ceb.firstIx; ii<lastToPrint; ii++) {
  4046             printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex);
  4048         printf("\n%s\n", found? "match found" : "no match");
  4050     #endif
  4052     // All Done.  Store back the match bounds to the caller.
  4053     //
  4054     if (found==FALSE) {
  4055         mLimit = -1;
  4056         mStart = -1;
  4059     if (matchStart != NULL) {
  4060         *matchStart= mStart;
  4063     if (matchLimit != NULL) {
  4064         *matchLimit = mLimit;
  4067     return found;
  4070 U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch  *strsrch,
  4071                                                 int32_t        startIdx,
  4072                                                 int32_t        *matchStart,
  4073                                                 int32_t        *matchLimit,
  4074                                                 UErrorCode     *status)
  4076     if (U_FAILURE(*status)) {
  4077         return FALSE;
  4080     // TODO:  reject search patterns beginning with a combining char.
  4082 #ifdef USEARCH_DEBUG
  4083     if (getenv("USEARCH_DEBUG") != NULL) {
  4084         printf("Pattern CEs\n");
  4085         for (int ii=0; ii<strsrch->pattern.CELength; ii++) {
  4086             printf(" %8x", strsrch->pattern.CE[ii]);
  4088         printf("\n");
  4091 #endif
  4092     // Input parameter sanity check.
  4093     //  TODO:  should input indicies clip to the text length
  4094     //         in the same way that UText does.
  4095     if(strsrch->pattern.CELength == 0         ||
  4096        startIdx < 0                           ||
  4097        startIdx > strsrch->search->textLength ||
  4098        strsrch->pattern.CE == NULL) {
  4099            *status = U_ILLEGAL_ARGUMENT_ERROR;
  4100            return FALSE;
  4103     if (strsrch->pattern.PCE == NULL) {
  4104         initializePatternPCETable(strsrch, status);
  4107     CEBuffer ceb(strsrch, status);
  4108     int32_t    targetIx = 0;
  4110     /*
  4111      * Pre-load the buffer with the CE's for the grapheme
  4112      * after our starting position so that we're sure that
  4113      * we can look at the CE following the match when we
  4114      * check the match boundaries.
  4116      * This will also pre-fetch the first CE that we'll
  4117      * consider for the match.
  4118      */
  4119     if (startIdx < strsrch->search->textLength) {
  4120         UBreakIterator *bi = strsrch->search->internalBreakIter;
  4121         int32_t next = ubrk_following(bi, startIdx);
  4123         ucol_setOffset(strsrch->textIter, next, status);
  4125         for (targetIx = 0; ; targetIx += 1) {
  4126             if (ceb.getPrevious(targetIx)->lowIndex < startIdx) {
  4127                 break;
  4130     } else {
  4131         ucol_setOffset(strsrch->textIter, startIdx, status);
  4135     const CEI *targetCEI = NULL;
  4136     int32_t    patIx;
  4137     UBool      found;
  4139     int32_t  limitIx = targetIx;
  4140     int32_t  mStart = -1;
  4141     int32_t  mLimit = -1;
  4142     int32_t  minLimit;
  4143     int32_t  maxLimit;
  4147     // Outer loop moves over match starting positions in the
  4148     //      target CE space.
  4149     // Here, targetIx values increase toward the beginning of the base text (i.e. we get the text CEs in reverse order).
  4150     // But  patIx is 0 at the beginning of the pattern and increases toward the end.
  4151     // So this loop performs a comparison starting with the end of pattern, and prcessd toward the beginning of the pattern
  4152     // and the beginning of the base text.
  4153     for(targetIx = limitIx; ; targetIx += 1)
  4155         found = TRUE;
  4156         // For targetIx > limitIx, this ceb.getPrevious gets a CE that is as far back in the ring buffer
  4157         // (compared to the last CE fetched for the previous targetIx value) as we need to go
  4158         // for this targetIx value, so if it is non-NULL then other ceb.getPrevious calls should be OK.
  4159         const CEI *lastCEI  = ceb.getPrevious(targetIx);
  4160         if (lastCEI == NULL) {
  4161             *status = U_INTERNAL_PROGRAM_ERROR;
  4162             found = FALSE;
  4163              break;
  4165         //  Inner loop checks for a match beginning at each
  4166         //  position from the outer loop.
  4167         int32_t targetIxOffset = 0;
  4168         for (patIx = strsrch->pattern.PCELength - 1; patIx >= 0; patIx -= 1) {
  4169             int64_t patCE = strsrch->pattern.PCE[patIx];
  4171             targetCEI = ceb.getPrevious(targetIx + strsrch->pattern.PCELength - 1 - patIx + targetIxOffset);
  4172             //  Compare CE from target string with CE from the pattern.
  4173             //    Note that the target CE will be UCOL_NULLORDER if we reach the end of input,
  4174             //    which will fail the compare, below.
  4175             UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType);
  4176             if ( ceMatch == U_CE_NO_MATCH ) {
  4177                 found = FALSE;
  4178                 break;
  4179             } else if ( ceMatch > U_CE_NO_MATCH ) {
  4180                 if ( ceMatch == U_CE_SKIP_TARG ) {
  4181                     // redo with same patCE, next targCE
  4182                     patIx++;
  4183                     targetIxOffset++;
  4184                 } else { // ceMatch == U_CE_SKIP_PATN
  4185                     // redo with same targCE, next patCE
  4186                     targetIxOffset--;
  4191         if (!found && ((targetCEI == NULL) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) {
  4192             // No match at this targetIx.  Try again at the next.
  4193             continue;
  4196         if (!found) {
  4197             // No match at all, we have run off the end of the target text.
  4198             break;
  4202         // We have found a match in CE space.
  4203         // Now determine the bounds in string index space.
  4204         //  There still is a chance of match failure if the CE range not correspond to
  4205         //     an acceptable character range.
  4206         //
  4207         const CEI *firstCEI = ceb.getPrevious(targetIx + strsrch->pattern.PCELength - 1 + targetIxOffset);
  4208         mStart   = firstCEI->lowIndex;
  4210         // Check for the start of the match being within a combining sequence.
  4211         //   This can happen if the pattern itself begins with a combining char, and
  4212         //   the match found combining marks in the target text that were attached
  4213         //    to something else.
  4214         //   This type of match should be rejected for not completely consuming a
  4215         //   combining sequence.
  4216         if (!isBreakBoundary(strsrch, mStart)) {
  4217             found = FALSE;
  4220         // Look at the high index of the first CE in the match. If it's the same as the
  4221         // low index, the first CE in the match is in the middle of an expansion.
  4222         if (mStart == firstCEI->highIndex) {
  4223             found = FALSE;
  4227         minLimit = lastCEI->lowIndex;
  4229         if (targetIx > 0) {
  4230             // Look at the CE following the match.  If it is UCOL_NULLORDER the match
  4231             //   extended to the end of input, and the match is good.
  4233             // Look at the high and low indices of the CE following the match. If
  4234             // they are the same it means one of two things:
  4235             //    1. The match extended to the last CE from the target text, which is OK, or
  4236             //    2. The last CE that was part of the match is in an expansion that extends
  4237             //       to the first CE after the match. In this case, we reject the match.
  4238             const CEI *nextCEI  = ceb.getPrevious(targetIx - 1);
  4240             if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) {
  4241                 found = FALSE;
  4244             mLimit = maxLimit = nextCEI->lowIndex;
  4246             //  Advance the match end position to the first acceptable match boundary.
  4247             //    This advances the index over any combining charcters.
  4248             if (minLimit < maxLimit) {
  4249                 int32_t nba = nextBoundaryAfter(strsrch, minLimit);
  4251                 if (nba >= lastCEI->highIndex) {
  4252                     mLimit = nba;
  4256             // If advancing to the end of a combining sequence in character indexing space
  4257             //   advanced us beyond the end of the match in CE space, reject this match.
  4258             if (mLimit > maxLimit) {
  4259                 found = FALSE;
  4262             // Make sure the end of the match is on a break boundary
  4263             if (!isBreakBoundary(strsrch, mLimit)) {
  4264                 found = FALSE;
  4267         } else {
  4268             // No non-ignorable CEs after this point.
  4269             // The maximum position is detected by boundary after
  4270             // the last non-ignorable CE. Combining sequence
  4271             // across the start index will be truncated.
  4272             int32_t nba = nextBoundaryAfter(strsrch, minLimit);
  4273             mLimit = maxLimit = (nba > 0) && (startIdx > nba) ? nba : startIdx;
  4276     #ifdef USEARCH_DEBUG
  4277         if (getenv("USEARCH_DEBUG") != NULL) {
  4278             printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit);
  4280     #endif
  4283         if (! checkIdentical(strsrch, mStart, mLimit)) {
  4284             found = FALSE;
  4287         if (found) {
  4288             break;
  4292     #ifdef USEARCH_DEBUG
  4293     if (getenv("USEARCH_DEBUG") != NULL) {
  4294         printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx);
  4295         int32_t  lastToPrint = ceb.limitIx+2;
  4296         for (int ii=ceb.firstIx; ii<lastToPrint; ii++) {
  4297             printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex);
  4299         printf("\n%s\n", found? "match found" : "no match");
  4301     #endif
  4303     // All Done.  Store back the match bounds to the caller.
  4304     //
  4305     if (found==FALSE) {
  4306         mLimit = -1;
  4307         mStart = -1;
  4310     if (matchStart != NULL) {
  4311         *matchStart= mStart;
  4314     if (matchLimit != NULL) {
  4315         *matchLimit = mLimit;
  4318     return found;
  4321 // internal use methods declared in usrchimp.h -----------------------------
  4323 UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status)
  4325     if (U_FAILURE(*status)) {
  4326         setMatchNotFound(strsrch);
  4327         return FALSE;
  4330 #if BOYER_MOORE
  4331     UCollationElements *coleiter        = strsrch->textIter;
  4332     int32_t             textlength      = strsrch->search->textLength;
  4333     int32_t            *patternce       = strsrch->pattern.CE;
  4334     int32_t             patterncelength = strsrch->pattern.CELength;
  4335     int32_t             textoffset      = ucol_getOffset(coleiter);
  4337     // status used in setting coleiter offset, since offset is checked in
  4338     // shiftForward before setting the coleiter offset, status never
  4339     // a failure
  4340     textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
  4341                               patterncelength);
  4342     while (textoffset <= textlength)
  4344         uint32_t    patternceindex = patterncelength - 1;
  4345         int32_t     targetce;
  4346         UBool       found          = FALSE;
  4347         int32_t    lastce          = UCOL_NULLORDER;
  4349         setColEIterOffset(coleiter, textoffset);
  4351         for (;;) {
  4352             // finding the last pattern ce match, imagine composite characters
  4353             // for example: search for pattern A in text \u00C0
  4354             // we'll have to skip \u0300 the grave first before we get to A
  4355             targetce = ucol_previous(coleiter, status);
  4356             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
  4357                 found = FALSE;
  4358                 break;
  4360             targetce = getCE(strsrch, targetce);
  4361             if (targetce == UCOL_IGNORABLE && inNormBuf(coleiter)) {
  4362                 // this is for the text \u0315\u0300 that requires
  4363                 // normalization and pattern \u0300, where \u0315 is ignorable
  4364                 continue;
  4366             if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) {
  4367                 lastce = targetce;
  4369             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
  4370             if (targetce == patternce[patternceindex]) {
  4371                 // the first ce can be a contraction
  4372                 found = TRUE;
  4373                 break;
  4375             if (!hasExpansion(coleiter)) {
  4376                 found = FALSE;
  4377                 break;
  4381         //targetce = lastce;
  4383         while (found && patternceindex > 0) {
  4384             lastce = targetce;
  4385             targetce    = ucol_previous(coleiter, status);
  4386             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
  4387                 found = FALSE;
  4388                 break;
  4390             targetce    = getCE(strsrch, targetce);
  4391             if (targetce == UCOL_IGNORABLE) {
  4392                 continue;
  4395             patternceindex --;
  4396             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
  4397             found = found && targetce == patternce[patternceindex];
  4400         targetce = lastce;
  4402         if (!found) {
  4403             if (U_FAILURE(*status)) {
  4404                 break;
  4406             textoffset = shiftForward(strsrch, textoffset, lastce,
  4407                                       patternceindex);
  4408             // status checked at loop.
  4409             patternceindex = patterncelength;
  4410             continue;
  4413         if (checkNextExactMatch(strsrch, &textoffset, status)) {
  4414             // status checked in ucol_setOffset
  4415             setColEIterOffset(coleiter, strsrch->search->matchedIndex);
  4416             return TRUE;
  4419     setMatchNotFound(strsrch);
  4420     return FALSE;
  4421 #else
  4422     int32_t textOffset = ucol_getOffset(strsrch->textIter);
  4423     int32_t start = -1;
  4424     int32_t end = -1;
  4426     if (usearch_search(strsrch, textOffset, &start, &end, status)) {
  4427         strsrch->search->matchedIndex  = start;
  4428         strsrch->search->matchedLength = end - start;
  4429         return TRUE;
  4430     } else {
  4431         setMatchNotFound(strsrch);
  4432         return FALSE;
  4434 #endif
  4437 UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status)
  4439     if (U_FAILURE(*status)) {
  4440         setMatchNotFound(strsrch);
  4441         return FALSE;
  4444 #if BOYER_MOORE
  4445     UCollationElements *coleiter        = strsrch->textIter;
  4446     int32_t             textlength      = strsrch->search->textLength;
  4447     int32_t            *patternce       = strsrch->pattern.CE;
  4448     int32_t             patterncelength = strsrch->pattern.CELength;
  4449     int32_t             textoffset      = ucol_getOffset(coleiter);
  4450     UBool               hasPatternAccents =
  4451        strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents;
  4453     textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
  4454                               patterncelength);
  4455     strsrch->canonicalPrefixAccents[0] = 0;
  4456     strsrch->canonicalSuffixAccents[0] = 0;
  4458     while (textoffset <= textlength)
  4460         int32_t     patternceindex = patterncelength - 1;
  4461         int32_t     targetce;
  4462         UBool       found          = FALSE;
  4463         int32_t     lastce         = UCOL_NULLORDER;
  4465         setColEIterOffset(coleiter, textoffset);
  4467         for (;;) {
  4468             // finding the last pattern ce match, imagine composite characters
  4469             // for example: search for pattern A in text \u00C0
  4470             // we'll have to skip \u0300 the grave first before we get to A
  4471             targetce = ucol_previous(coleiter, status);
  4472             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
  4473                 found = FALSE;
  4474                 break;
  4476             targetce = getCE(strsrch, targetce);
  4477             if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) {
  4478                 lastce = targetce;
  4480             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
  4481             if (targetce == patternce[patternceindex]) {
  4482                 // the first ce can be a contraction
  4483                 found = TRUE;
  4484                 break;
  4486             if (!hasExpansion(coleiter)) {
  4487                 found = FALSE;
  4488                 break;
  4492         while (found && patternceindex > 0) {
  4493             targetce    = ucol_previous(coleiter, status);
  4494             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
  4495                 found = FALSE;
  4496                 break;
  4498             targetce    = getCE(strsrch, targetce);
  4499             if (targetce == UCOL_IGNORABLE) {
  4500                 continue;
  4503             patternceindex --;
  4504             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
  4505             found = found && targetce == patternce[patternceindex];
  4508         // initializing the rearranged accent array
  4509         if (hasPatternAccents && !found) {
  4510             strsrch->canonicalPrefixAccents[0] = 0;
  4511             strsrch->canonicalSuffixAccents[0] = 0;
  4512             if (U_FAILURE(*status)) {
  4513                 break;
  4515             found = doNextCanonicalMatch(strsrch, textoffset, status);
  4518         if (!found) {
  4519             if (U_FAILURE(*status)) {
  4520                 break;
  4522             textoffset = shiftForward(strsrch, textoffset, lastce,
  4523                                       patternceindex);
  4524             // status checked at loop
  4525             patternceindex = patterncelength;
  4526             continue;
  4529         if (checkNextCanonicalMatch(strsrch, &textoffset, status)) {
  4530             setColEIterOffset(coleiter, strsrch->search->matchedIndex);
  4531             return TRUE;
  4534     setMatchNotFound(strsrch);
  4535     return FALSE;
  4536 #else
  4537     int32_t textOffset = ucol_getOffset(strsrch->textIter);
  4538     int32_t start = -1;
  4539     int32_t end = -1;
  4541     if (usearch_search(strsrch, textOffset, &start, &end, status)) {
  4542         strsrch->search->matchedIndex  = start;
  4543         strsrch->search->matchedLength = end - start;
  4544         return TRUE;
  4545     } else {
  4546         setMatchNotFound(strsrch);
  4547         return FALSE;
  4549 #endif
  4552 UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status)
  4554     if (U_FAILURE(*status)) {
  4555         setMatchNotFound(strsrch);
  4556         return FALSE;
  4559 #if BOYER_MOORE
  4560     UCollationElements *coleiter        = strsrch->textIter;
  4561     int32_t            *patternce       = strsrch->pattern.CE;
  4562     int32_t             patterncelength = strsrch->pattern.CELength;
  4563     int32_t             textoffset      = ucol_getOffset(coleiter);
  4565     // shifting it check for setting offset
  4566     // if setOffset is called previously or there was no previous match, we
  4567     // leave the offset as it is.
  4568     if (strsrch->search->matchedIndex != USEARCH_DONE) {
  4569         textoffset = strsrch->search->matchedIndex;
  4572     textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
  4573                               patterncelength);
  4575     while (textoffset >= 0)
  4577         int32_t     patternceindex = 1;
  4578         int32_t     targetce;
  4579         UBool       found          = FALSE;
  4580         int32_t     firstce        = UCOL_NULLORDER;
  4582         // if status is a failure, ucol_setOffset does nothing
  4583         setColEIterOffset(coleiter, textoffset);
  4585         for (;;) {
  4586             // finding the first pattern ce match, imagine composite
  4587             // characters. for example: search for pattern \u0300 in text
  4588             // \u00C0, we'll have to skip A first before we get to
  4589             // \u0300 the grave accent
  4590             targetce = ucol_next(coleiter, status);
  4591             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
  4592                 found = FALSE;
  4593                 break;
  4595             targetce = getCE(strsrch, targetce);
  4596             if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) {
  4597                 firstce = targetce;
  4599             if (targetce == UCOL_IGNORABLE && strsrch->strength != UCOL_PRIMARY) {
  4600                 continue;
  4602             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
  4603             if (targetce == patternce[0]) {
  4604                 found = TRUE;
  4605                 break;
  4607             if (!hasExpansion(coleiter)) {
  4608                 // checking for accents in composite character
  4609                 found = FALSE;
  4610                 break;
  4614         //targetce = firstce;
  4616         while (found && (patternceindex < patterncelength)) {
  4617             firstce = targetce;
  4618             targetce    = ucol_next(coleiter, status);
  4619             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
  4620                 found = FALSE;
  4621                 break;
  4623             targetce    = getCE(strsrch, targetce);
  4624             if (targetce == UCOL_IGNORABLE) {
  4625                 continue;
  4628             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
  4629             found = found && targetce == patternce[patternceindex];
  4630             patternceindex ++;
  4633         targetce = firstce;
  4635         if (!found) {
  4636             if (U_FAILURE(*status)) {
  4637                 break;
  4640             textoffset = reverseShift(strsrch, textoffset, targetce,
  4641                                       patternceindex);
  4642             patternceindex = 0;
  4643             continue;
  4646         if (checkPreviousExactMatch(strsrch, &textoffset, status)) {
  4647             setColEIterOffset(coleiter, textoffset);
  4648             return TRUE;
  4651     setMatchNotFound(strsrch);
  4652     return FALSE;
  4653 #else
  4654     int32_t textOffset;
  4656     if (strsrch->search->isOverlap) {
  4657         if (strsrch->search->matchedIndex != USEARCH_DONE) {
  4658             textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - 1;
  4659         } else {
  4660             // move the start position at the end of possible match
  4661             initializePatternPCETable(strsrch, status);
  4662             for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.PCELength - 1; nPCEs++) {
  4663                 int64_t pce = ucol_nextProcessed(strsrch->textIter, NULL, NULL, status);
  4664                 if (pce == UCOL_PROCESSED_NULLORDER) {
  4665                     // at the end of the text
  4666                     break;
  4669             if (U_FAILURE(*status)) {
  4670                 setMatchNotFound(strsrch);
  4671                 return FALSE;
  4673             textOffset = ucol_getOffset(strsrch->textIter);
  4675     } else {
  4676         textOffset = ucol_getOffset(strsrch->textIter);
  4679     int32_t start = -1;
  4680     int32_t end = -1;
  4682     if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) {
  4683         strsrch->search->matchedIndex = start;
  4684         strsrch->search->matchedLength = end - start;
  4685         return TRUE;
  4686     } else {
  4687         setMatchNotFound(strsrch);
  4688         return FALSE;
  4690 #endif
  4693 UBool usearch_handlePreviousCanonical(UStringSearch *strsrch,
  4694                                       UErrorCode    *status)
  4696     if (U_FAILURE(*status)) {
  4697         setMatchNotFound(strsrch);
  4698         return FALSE;
  4701 #if BOYER_MOORE
  4702     UCollationElements *coleiter        = strsrch->textIter;
  4703     int32_t            *patternce       = strsrch->pattern.CE;
  4704     int32_t             patterncelength = strsrch->pattern.CELength;
  4705     int32_t             textoffset      = ucol_getOffset(coleiter);
  4706     UBool               hasPatternAccents =
  4707        strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents;
  4709     // shifting it check for setting offset
  4710     // if setOffset is called previously or there was no previous match, we
  4711     // leave the offset as it is.
  4712     if (strsrch->search->matchedIndex != USEARCH_DONE) {
  4713         textoffset = strsrch->search->matchedIndex;
  4716     textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
  4717                               patterncelength);
  4718     strsrch->canonicalPrefixAccents[0] = 0;
  4719     strsrch->canonicalSuffixAccents[0] = 0;
  4721     while (textoffset >= 0)
  4723         int32_t     patternceindex = 1;
  4724         int32_t     targetce;
  4725         UBool       found          = FALSE;
  4726         int32_t     firstce        = UCOL_NULLORDER;
  4728         setColEIterOffset(coleiter, textoffset);
  4729         for (;;) {
  4730             // finding the first pattern ce match, imagine composite
  4731             // characters. for example: search for pattern \u0300 in text
  4732             // \u00C0, we'll have to skip A first before we get to
  4733             // \u0300 the grave accent
  4734             targetce = ucol_next(coleiter, status);
  4735             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
  4736                 found = FALSE;
  4737                 break;
  4739             targetce = getCE(strsrch, targetce);
  4740             if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) {
  4741                 firstce = targetce;
  4744             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
  4745             if (targetce == patternce[0]) {
  4746                 // the first ce can be a contraction
  4747                 found = TRUE;
  4748                 break;
  4750             if (!hasExpansion(coleiter)) {
  4751                 // checking for accents in composite character
  4752                 found = FALSE;
  4753                 break;
  4757         targetce = firstce;
  4759         while (found && patternceindex < patterncelength) {
  4760             targetce    = ucol_next(coleiter, status);
  4761             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
  4762                 found = FALSE;
  4763                 break;
  4765             targetce = getCE(strsrch, targetce);
  4766             if (targetce == UCOL_IGNORABLE) {
  4767                 continue;
  4770             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
  4771             found = found && targetce == patternce[patternceindex];
  4772             patternceindex ++;
  4775         // initializing the rearranged accent array
  4776         if (hasPatternAccents && !found) {
  4777             strsrch->canonicalPrefixAccents[0] = 0;
  4778             strsrch->canonicalSuffixAccents[0] = 0;
  4779             if (U_FAILURE(*status)) {
  4780                 break;
  4782             found = doPreviousCanonicalMatch(strsrch, textoffset, status);
  4785         if (!found) {
  4786             if (U_FAILURE(*status)) {
  4787                 break;
  4789             textoffset = reverseShift(strsrch, textoffset, targetce,
  4790                                       patternceindex);
  4791             patternceindex = 0;
  4792             continue;
  4795         if (checkPreviousCanonicalMatch(strsrch, &textoffset, status)) {
  4796             setColEIterOffset(coleiter, textoffset);
  4797             return TRUE;
  4800     setMatchNotFound(strsrch);
  4801     return FALSE;
  4802 #else
  4803     int32_t textOffset;
  4805     if (strsrch->search->isOverlap) {
  4806         if (strsrch->search->matchedIndex != USEARCH_DONE) {
  4807             textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - 1;
  4808         } else {
  4809             // move the start position at the end of possible match
  4810             initializePatternPCETable(strsrch, status);
  4811             for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.PCELength - 1; nPCEs++) {
  4812                 int64_t pce = ucol_nextProcessed(strsrch->textIter, NULL, NULL, status);
  4813                 if (pce == UCOL_PROCESSED_NULLORDER) {
  4814                     // at the end of the text
  4815                     break;
  4818             if (U_FAILURE(*status)) {
  4819                 setMatchNotFound(strsrch);
  4820                 return FALSE;
  4822             textOffset = ucol_getOffset(strsrch->textIter);
  4824     } else {
  4825         textOffset = ucol_getOffset(strsrch->textIter);
  4828     int32_t start = -1;
  4829     int32_t end = -1;
  4831     if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) {
  4832         strsrch->search->matchedIndex = start;
  4833         strsrch->search->matchedLength = end - start;
  4834         return TRUE;
  4835     } else {
  4836         setMatchNotFound(strsrch);
  4837         return FALSE;
  4839 #endif
  4842 #endif /* #if !UCONFIG_NO_COLLATION */

mercurial