michael@0: /* michael@0: ********************************************************************** michael@0: * Copyright (C) 2001-2011 IBM and others. All rights reserved. michael@0: ********************************************************************** michael@0: * Date Name Description michael@0: * 07/02/2001 synwee Creation. michael@0: ********************************************************************** michael@0: */ michael@0: michael@0: #include "unicode/utypes.h" michael@0: michael@0: #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION michael@0: michael@0: #include "unicode/usearch.h" michael@0: #include "unicode/ustring.h" michael@0: #include "unicode/uchar.h" michael@0: #include "unicode/utf16.h" michael@0: #include "normalizer2impl.h" michael@0: #include "ucol_imp.h" michael@0: #include "usrchimp.h" michael@0: #include "cmemory.h" michael@0: #include "ucln_in.h" michael@0: #include "uassert.h" michael@0: #include "ustr_imp.h" michael@0: michael@0: U_NAMESPACE_USE michael@0: michael@0: // don't use Boyer-Moore michael@0: // (and if we decide to turn this on again there are several new TODOs that will need to be addressed) michael@0: #define BOYER_MOORE 0 michael@0: michael@0: #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) michael@0: michael@0: // internal definition --------------------------------------------------- michael@0: michael@0: #define LAST_BYTE_MASK_ 0xFF michael@0: #define SECOND_LAST_BYTE_SHIFT_ 8 michael@0: #define SUPPLEMENTARY_MIN_VALUE_ 0x10000 michael@0: michael@0: static const Normalizer2Impl *g_nfcImpl = NULL; michael@0: michael@0: // internal methods ------------------------------------------------- michael@0: michael@0: /** michael@0: * Fast collation element iterator setOffset. michael@0: * This function does not check for bounds. 
michael@0: * @param coleiter collation element iterator michael@0: * @param offset to set michael@0: */ michael@0: static michael@0: inline void setColEIterOffset(UCollationElements *elems, michael@0: int32_t offset) michael@0: { michael@0: collIterate *ci = &(elems->iteratordata_); michael@0: ci->pos = ci->string + offset; michael@0: ci->CEpos = ci->toReturn = ci->extendCEs ? ci->extendCEs : ci->CEs; michael@0: if (ci->flags & UCOL_ITER_INNORMBUF) { michael@0: ci->flags = ci->origFlags; michael@0: } michael@0: ci->fcdPosition = NULL; michael@0: michael@0: ci->offsetReturn = NULL; michael@0: ci->offsetStore = ci->offsetBuffer; michael@0: ci->offsetRepeatCount = ci->offsetRepeatValue = 0; michael@0: } michael@0: michael@0: /** michael@0: * Getting the mask for collation strength michael@0: * @param strength collation strength michael@0: * @return collation element mask michael@0: */ michael@0: static michael@0: inline uint32_t getMask(UCollationStrength strength) michael@0: { michael@0: switch (strength) michael@0: { michael@0: case UCOL_PRIMARY: michael@0: return UCOL_PRIMARYORDERMASK; michael@0: case UCOL_SECONDARY: michael@0: return UCOL_SECONDARYORDERMASK | UCOL_PRIMARYORDERMASK; michael@0: default: michael@0: return UCOL_TERTIARYORDERMASK | UCOL_SECONDARYORDERMASK | michael@0: UCOL_PRIMARYORDERMASK; michael@0: } michael@0: } michael@0: michael@0: /** michael@0: * This is to squeeze the 21bit ces into a 256 table michael@0: * @param ce collation element michael@0: * @return collapsed version of the collation element michael@0: */ michael@0: static michael@0: inline int hash(uint32_t ce) michael@0: { michael@0: // the old value UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_ does not work michael@0: // well with the new collation where most of the latin 1 characters michael@0: // are of the value xx000xxx. their hashes will most of the time be 0 michael@0: // to be discussed on the hash algo. 
michael@0: return UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_; michael@0: } michael@0: michael@0: U_CDECL_BEGIN michael@0: static UBool U_CALLCONV michael@0: usearch_cleanup(void) { michael@0: g_nfcImpl = NULL; michael@0: return TRUE; michael@0: } michael@0: U_CDECL_END michael@0: michael@0: /** michael@0: * Initializing the fcd tables. michael@0: * Internal method, status assumed to be a success. michael@0: * @param status output error if any, caller to check status before calling michael@0: * method, status assumed to be success when passed in. michael@0: */ michael@0: static michael@0: inline void initializeFCD(UErrorCode *status) michael@0: { michael@0: if (g_nfcImpl == NULL) { michael@0: g_nfcImpl = Normalizer2Factory::getNFCImpl(*status); michael@0: ucln_i18n_registerCleanup(UCLN_I18N_USEARCH, usearch_cleanup); michael@0: } michael@0: } michael@0: michael@0: /** michael@0: * Gets the fcd value for a character at the argument index. michael@0: * This method takes into accounts of the supplementary characters. michael@0: * @param str UTF16 string where character for fcd retrieval resides michael@0: * @param offset position of the character whose fcd is to be retrieved, to be michael@0: * overwritten with the next character position, taking michael@0: * surrogate characters into consideration. 
michael@0: * @param strlength length of the argument string michael@0: * @return fcd value michael@0: */ michael@0: static michael@0: uint16_t getFCD(const UChar *str, int32_t *offset, michael@0: int32_t strlength) michael@0: { michael@0: const UChar *temp = str + *offset; michael@0: uint16_t result = g_nfcImpl->nextFCD16(temp, str + strlength); michael@0: *offset = (int32_t)(temp - str); michael@0: return result; michael@0: } michael@0: michael@0: /** michael@0: * Getting the modified collation elements taking into account the collation michael@0: * attributes michael@0: * @param strsrch string search data michael@0: * @param sourcece michael@0: * @return the modified collation element michael@0: */ michael@0: static michael@0: inline int32_t getCE(const UStringSearch *strsrch, uint32_t sourcece) michael@0: { michael@0: // note for tertiary we can't use the collator->tertiaryMask, that michael@0: // is a preprocessed mask that takes into account case options. since michael@0: // we are only concerned with exact matches, we don't need that. michael@0: sourcece &= strsrch->ceMask; michael@0: michael@0: if (strsrch->toShift) { michael@0: // alternate handling here, since only the 16 most significant digits michael@0: // is only used, we can safely do a compare without masking michael@0: // if the ce is a variable, we mask and get only the primary values michael@0: // no shifting to quartenary is required since all primary values michael@0: // less than variabletop will need to be masked off anyway. 
michael@0: if (strsrch->variableTop > sourcece) { michael@0: if (strsrch->strength >= UCOL_QUATERNARY) { michael@0: sourcece &= UCOL_PRIMARYORDERMASK; michael@0: } michael@0: else { michael@0: sourcece = UCOL_IGNORABLE; michael@0: } michael@0: } michael@0: } else if (strsrch->strength >= UCOL_QUATERNARY && sourcece == UCOL_IGNORABLE) { michael@0: sourcece = 0xFFFF; michael@0: } michael@0: michael@0: return sourcece; michael@0: } michael@0: michael@0: /** michael@0: * Allocate a memory and returns NULL if it failed. michael@0: * Internal method, status assumed to be a success. michael@0: * @param size to allocate michael@0: * @param status output error if any, caller to check status before calling michael@0: * method, status assumed to be success when passed in. michael@0: * @return newly allocated array, NULL otherwise michael@0: */ michael@0: static michael@0: inline void * allocateMemory(uint32_t size, UErrorCode *status) michael@0: { michael@0: uint32_t *result = (uint32_t *)uprv_malloc(size); michael@0: if (result == NULL) { michael@0: *status = U_MEMORY_ALLOCATION_ERROR; michael@0: } michael@0: return result; michael@0: } michael@0: michael@0: /** michael@0: * Adds a uint32_t value to a destination array. michael@0: * Creates a new array if we run out of space. The caller will have to michael@0: * manually deallocate the newly allocated array. michael@0: * Internal method, status assumed to be success, caller has to check status michael@0: * before calling this method. destination not to be NULL and has at least michael@0: * size destinationlength. 
michael@0: * @param destination target array michael@0: * @param offset destination offset to add value michael@0: * @param destinationlength target array size, return value for the new size michael@0: * @param value to be added michael@0: * @param increments incremental size expected michael@0: * @param status output error if any, caller to check status before calling michael@0: * method, status assumed to be success when passed in. michael@0: * @return new destination array, destination if there was no new allocation michael@0: */ michael@0: static michael@0: inline int32_t * addTouint32_tArray(int32_t *destination, michael@0: uint32_t offset, michael@0: uint32_t *destinationlength, michael@0: uint32_t value, michael@0: uint32_t increments, michael@0: UErrorCode *status) michael@0: { michael@0: uint32_t newlength = *destinationlength; michael@0: if (offset + 1 == newlength) { michael@0: newlength += increments; michael@0: int32_t *temp = (int32_t *)allocateMemory( michael@0: sizeof(int32_t) * newlength, status); michael@0: if (U_FAILURE(*status)) { michael@0: return NULL; michael@0: } michael@0: uprv_memcpy(temp, destination, sizeof(int32_t) * offset); michael@0: *destinationlength = newlength; michael@0: destination = temp; michael@0: } michael@0: destination[offset] = value; michael@0: return destination; michael@0: } michael@0: michael@0: /** michael@0: * Adds a uint64_t value to a destination array. michael@0: * Creates a new array if we run out of space. The caller will have to michael@0: * manually deallocate the newly allocated array. michael@0: * Internal method, status assumed to be success, caller has to check status michael@0: * before calling this method. destination not to be NULL and has at least michael@0: * size destinationlength. 
michael@0: * @param destination target array michael@0: * @param offset destination offset to add value michael@0: * @param destinationlength target array size, return value for the new size michael@0: * @param value to be added michael@0: * @param increments incremental size expected michael@0: * @param status output error if any, caller to check status before calling michael@0: * method, status assumed to be success when passed in. michael@0: * @return new destination array, destination if there was no new allocation michael@0: */ michael@0: static michael@0: inline int64_t * addTouint64_tArray(int64_t *destination, michael@0: uint32_t offset, michael@0: uint32_t *destinationlength, michael@0: uint64_t value, michael@0: uint32_t increments, michael@0: UErrorCode *status) michael@0: { michael@0: uint32_t newlength = *destinationlength; michael@0: if (offset + 1 == newlength) { michael@0: newlength += increments; michael@0: int64_t *temp = (int64_t *)allocateMemory( michael@0: sizeof(int64_t) * newlength, status); michael@0: michael@0: if (U_FAILURE(*status)) { michael@0: return NULL; michael@0: } michael@0: michael@0: uprv_memcpy(temp, destination, sizeof(int64_t) * offset); michael@0: *destinationlength = newlength; michael@0: destination = temp; michael@0: } michael@0: michael@0: destination[offset] = value; michael@0: michael@0: return destination; michael@0: } michael@0: michael@0: /** michael@0: * Initializing the ce table for a pattern. michael@0: * Stores non-ignorable collation keys. michael@0: * Table size will be estimated by the size of the pattern text. Table michael@0: * expansion will be perform as we go along. Adding 1 to ensure that the table michael@0: * size definitely increases. michael@0: * Internal method, status assumed to be a success. michael@0: * @param strsrch string search data michael@0: * @param status output error if any, caller to check status before calling michael@0: * method, status assumed to be success when passed in. 
michael@0: * @return total number of expansions michael@0: */ michael@0: static michael@0: inline uint16_t initializePatternCETable(UStringSearch *strsrch, michael@0: UErrorCode *status) michael@0: { michael@0: UPattern *pattern = &(strsrch->pattern); michael@0: uint32_t cetablesize = INITIAL_ARRAY_SIZE_; michael@0: int32_t *cetable = pattern->CEBuffer; michael@0: uint32_t patternlength = pattern->textLength; michael@0: UCollationElements *coleiter = strsrch->utilIter; michael@0: michael@0: if (coleiter == NULL) { michael@0: coleiter = ucol_openElements(strsrch->collator, pattern->text, michael@0: patternlength, status); michael@0: // status will be checked in ucol_next(..) later and if it is an michael@0: // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be michael@0: // returned. michael@0: strsrch->utilIter = coleiter; michael@0: } michael@0: else { michael@0: uprv_init_collIterate(strsrch->collator, pattern->text, michael@0: pattern->textLength, michael@0: &coleiter->iteratordata_, michael@0: status); michael@0: } michael@0: if(U_FAILURE(*status)) { michael@0: return 0; michael@0: } michael@0: michael@0: if (pattern->CE != cetable && pattern->CE) { michael@0: uprv_free(pattern->CE); michael@0: } michael@0: michael@0: uint16_t offset = 0; michael@0: uint16_t result = 0; michael@0: int32_t ce; michael@0: michael@0: while ((ce = ucol_next(coleiter, status)) != UCOL_NULLORDER && michael@0: U_SUCCESS(*status)) { michael@0: uint32_t newce = getCE(strsrch, ce); michael@0: if (newce) { michael@0: int32_t *temp = addTouint32_tArray(cetable, offset, &cetablesize, michael@0: newce, michael@0: patternlength - ucol_getOffset(coleiter) + 1, michael@0: status); michael@0: if (U_FAILURE(*status)) { michael@0: return 0; michael@0: } michael@0: offset ++; michael@0: if (cetable != temp && cetable != pattern->CEBuffer) { michael@0: uprv_free(cetable); michael@0: } michael@0: cetable = temp; michael@0: } michael@0: result += (uint16_t)(ucol_getMaxExpansion(coleiter, 
ce) - 1); michael@0: } michael@0: michael@0: cetable[offset] = 0; michael@0: pattern->CE = cetable; michael@0: pattern->CELength = offset; michael@0: michael@0: return result; michael@0: } michael@0: michael@0: /** michael@0: * Initializing the pce table for a pattern. michael@0: * Stores non-ignorable collation keys. michael@0: * Table size will be estimated by the size of the pattern text. Table michael@0: * expansion will be perform as we go along. Adding 1 to ensure that the table michael@0: * size definitely increases. michael@0: * Internal method, status assumed to be a success. michael@0: * @param strsrch string search data michael@0: * @param status output error if any, caller to check status before calling michael@0: * method, status assumed to be success when passed in. michael@0: * @return total number of expansions michael@0: */ michael@0: static michael@0: inline uint16_t initializePatternPCETable(UStringSearch *strsrch, michael@0: UErrorCode *status) michael@0: { michael@0: UPattern *pattern = &(strsrch->pattern); michael@0: uint32_t pcetablesize = INITIAL_ARRAY_SIZE_; michael@0: int64_t *pcetable = pattern->PCEBuffer; michael@0: uint32_t patternlength = pattern->textLength; michael@0: UCollationElements *coleiter = strsrch->utilIter; michael@0: michael@0: if (coleiter == NULL) { michael@0: coleiter = ucol_openElements(strsrch->collator, pattern->text, michael@0: patternlength, status); michael@0: // status will be checked in ucol_next(..) later and if it is an michael@0: // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be michael@0: // returned. 
michael@0: strsrch->utilIter = coleiter; michael@0: } else { michael@0: uprv_init_collIterate(strsrch->collator, pattern->text, michael@0: pattern->textLength, michael@0: &coleiter->iteratordata_, michael@0: status); michael@0: } michael@0: if(U_FAILURE(*status)) { michael@0: return 0; michael@0: } michael@0: michael@0: if (pattern->PCE != pcetable && pattern->PCE != NULL) { michael@0: uprv_free(pattern->PCE); michael@0: } michael@0: michael@0: uint16_t offset = 0; michael@0: uint16_t result = 0; michael@0: int64_t pce; michael@0: michael@0: uprv_init_pce(coleiter); michael@0: michael@0: // ** Should processed CEs be signed or unsigned? michael@0: // ** (the rest of the code in this file seems to play fast-and-loose with michael@0: // ** whether a CE is signed or unsigned. For example, look at routine above this one.) michael@0: while ((pce = ucol_nextProcessed(coleiter, NULL, NULL, status)) != UCOL_PROCESSED_NULLORDER && michael@0: U_SUCCESS(*status)) { michael@0: int64_t *temp = addTouint64_tArray(pcetable, offset, &pcetablesize, michael@0: pce, michael@0: patternlength - ucol_getOffset(coleiter) + 1, michael@0: status); michael@0: michael@0: if (U_FAILURE(*status)) { michael@0: return 0; michael@0: } michael@0: michael@0: offset += 1; michael@0: michael@0: if (pcetable != temp && pcetable != pattern->PCEBuffer) { michael@0: uprv_free(pcetable); michael@0: } michael@0: michael@0: pcetable = temp; michael@0: //result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1); michael@0: } michael@0: michael@0: pcetable[offset] = 0; michael@0: pattern->PCE = pcetable; michael@0: pattern->PCELength = offset; michael@0: michael@0: return result; michael@0: } michael@0: michael@0: /** michael@0: * Initializes the pattern struct. michael@0: * Internal method, status assumed to be success. 
michael@0: * @param strsrch UStringSearch data storage michael@0: * @param status output error if any, caller to check status before calling michael@0: * method, status assumed to be success when passed in. michael@0: * @return expansionsize the total expansion size of the pattern michael@0: */ michael@0: static michael@0: inline int16_t initializePattern(UStringSearch *strsrch, UErrorCode *status) michael@0: { michael@0: UPattern *pattern = &(strsrch->pattern); michael@0: const UChar *patterntext = pattern->text; michael@0: int32_t length = pattern->textLength; michael@0: int32_t index = 0; michael@0: michael@0: // Since the strength is primary, accents are ignored in the pattern. michael@0: if (strsrch->strength == UCOL_PRIMARY) { michael@0: pattern->hasPrefixAccents = 0; michael@0: pattern->hasSuffixAccents = 0; michael@0: } else { michael@0: pattern->hasPrefixAccents = getFCD(patterntext, &index, length) >> michael@0: SECOND_LAST_BYTE_SHIFT_; michael@0: index = length; michael@0: U16_BACK_1(patterntext, 0, index); michael@0: pattern->hasSuffixAccents = getFCD(patterntext, &index, length) & michael@0: LAST_BYTE_MASK_; michael@0: } michael@0: michael@0: // ** HACK ** michael@0: if (strsrch->pattern.PCE != NULL) { michael@0: if (strsrch->pattern.PCE != strsrch->pattern.PCEBuffer) { michael@0: uprv_free(strsrch->pattern.PCE); michael@0: } michael@0: michael@0: strsrch->pattern.PCE = NULL; michael@0: } michael@0: michael@0: // since intializePattern is an internal method status is a success. michael@0: return initializePatternCETable(strsrch, status); michael@0: } michael@0: michael@0: /** michael@0: * Initializing shift tables, with the default values. michael@0: * If a corresponding default value is 0, the shift table is not set. 
michael@0: * @param shift table for forwards shift michael@0: * @param backshift table for backwards shift michael@0: * @param cetable table containing pattern ce michael@0: * @param cesize size of the pattern ces michael@0: * @param expansionsize total size of the expansions michael@0: * @param defaultforward the default forward value michael@0: * @param defaultbackward the default backward value michael@0: */ michael@0: static michael@0: inline void setShiftTable(int16_t shift[], int16_t backshift[], michael@0: int32_t *cetable, int32_t cesize, michael@0: int16_t expansionsize, michael@0: int16_t defaultforward, michael@0: int16_t defaultbackward) michael@0: { michael@0: // estimate the value to shift. to do that we estimate the smallest michael@0: // number of characters to give the relevant ces, ie approximately michael@0: // the number of ces minus their expansion, since expansions can come michael@0: // from a character. michael@0: int32_t count; michael@0: for (count = 0; count < MAX_TABLE_SIZE_; count ++) { michael@0: shift[count] = defaultforward; michael@0: } michael@0: cesize --; // down to the last index michael@0: for (count = 0; count < cesize; count ++) { michael@0: // number of ces from right of array to the count michael@0: int temp = defaultforward - count - 1; michael@0: shift[hash(cetable[count])] = temp > 1 ? temp : 1; michael@0: } michael@0: shift[hash(cetable[cesize])] = 1; michael@0: // for ignorables we just shift by one. see test examples. michael@0: shift[hash(0)] = 1; michael@0: michael@0: for (count = 0; count < MAX_TABLE_SIZE_; count ++) { michael@0: backshift[count] = defaultbackward; michael@0: } michael@0: for (count = cesize; count > 0; count --) { michael@0: // the original value count does not seem to work michael@0: backshift[hash(cetable[count])] = count > expansionsize ? 
michael@0: (int16_t)(count - expansionsize) : 1; michael@0: } michael@0: backshift[hash(cetable[0])] = 1; michael@0: backshift[hash(0)] = 1; michael@0: } michael@0: michael@0: /** michael@0: * Building of the pattern collation element list and the boyer moore strsrch michael@0: * table. michael@0: * The canonical match will only be performed after the default match fails. michael@0: * For both cases we need to remember the size of the composed and decomposed michael@0: * versions of the string. Since the Boyer-Moore shift calculations shifts by michael@0: * a number of characters in the text and tries to match the pattern from that michael@0: * offset, the shift value can not be too large in case we miss some michael@0: * characters. To choose a right shift size, we estimate the NFC form of the michael@0: * and use its size as a shift guide. The NFC form should be the small michael@0: * possible representation of the pattern. Anyways, we'll err on the smaller michael@0: * shift size. Hence the calculation for minlength. michael@0: * Canonical match will be performed slightly differently. We'll split the michael@0: * pattern into 3 parts, the prefix accents (PA), the middle string bounded by michael@0: * the first and last base character (MS), the ending accents (EA). Matches michael@0: * will be done on MS first, and only when we match MS then some processing michael@0: * will be required for the prefix and end accents in order to determine if michael@0: * they match PA and EA. Hence the default shift values michael@0: * for the canonical match will take the size of either end's accent into michael@0: * consideration. Forwards search will take the end accents into consideration michael@0: * for the default shift values and the backwards search will take the prefix michael@0: * accents into consideration. michael@0: * If pattern has no non-ignorable ce, we return a illegal argument error. michael@0: * Internal method, status assumed to be success. 
michael@0: * @param strsrch UStringSearch data storage michael@0: * @param status for output errors if it occurs, status is assumed to be a michael@0: * success when it is passed in. michael@0: */ michael@0: static michael@0: inline void initialize(UStringSearch *strsrch, UErrorCode *status) michael@0: { michael@0: int16_t expandlength = initializePattern(strsrch, status); michael@0: if (U_SUCCESS(*status) && strsrch->pattern.CELength > 0) { michael@0: UPattern *pattern = &strsrch->pattern; michael@0: int32_t cesize = pattern->CELength; michael@0: michael@0: int16_t minlength = cesize > expandlength michael@0: ? (int16_t)cesize - expandlength : 1; michael@0: pattern->defaultShiftSize = minlength; michael@0: setShiftTable(pattern->shift, pattern->backShift, pattern->CE, michael@0: cesize, expandlength, minlength, minlength); michael@0: return; michael@0: } michael@0: strsrch->pattern.defaultShiftSize = 0; michael@0: } michael@0: michael@0: #if BOYER_MOORE michael@0: /** michael@0: * Check to make sure that the match length is at the end of the character by michael@0: * using the breakiterator. michael@0: * @param strsrch string search data michael@0: * @param start target text start offset michael@0: * @param end target text end offset michael@0: */ michael@0: static michael@0: void checkBreakBoundary(const UStringSearch *strsrch, int32_t * /*start*/, michael@0: int32_t *end) michael@0: { michael@0: #if !UCONFIG_NO_BREAK_ITERATION michael@0: UBreakIterator *breakiterator = strsrch->search->internalBreakIter; michael@0: if (breakiterator) { michael@0: int32_t matchend = *end; michael@0: //int32_t matchstart = *start; michael@0: michael@0: if (!ubrk_isBoundary(breakiterator, matchend)) { michael@0: *end = ubrk_following(breakiterator, matchend); michael@0: } michael@0: michael@0: /* Check the start of the matched text to make sure it doesn't have any accents michael@0: * before it. 
This code may not be necessary and so it is commented out */ michael@0: /*if (!ubrk_isBoundary(breakiterator, matchstart) && !ubrk_isBoundary(breakiterator, matchstart-1)) { michael@0: *start = ubrk_preceding(breakiterator, matchstart); michael@0: }*/ michael@0: } michael@0: #endif michael@0: } michael@0: michael@0: /** michael@0: * Determine whether the target text in UStringSearch bounded by the offset michael@0: * start and end is one or more whole units of text as michael@0: * determined by the breakiterator in UStringSearch. michael@0: * @param strsrch string search data michael@0: * @param start target text start offset michael@0: * @param end target text end offset michael@0: */ michael@0: static michael@0: UBool isBreakUnit(const UStringSearch *strsrch, int32_t start, michael@0: int32_t end) michael@0: { michael@0: #if !UCONFIG_NO_BREAK_ITERATION michael@0: UBreakIterator *breakiterator = strsrch->search->breakIter; michael@0: //TODO: Add here. michael@0: if (breakiterator) { michael@0: int32_t startindex = ubrk_first(breakiterator); michael@0: int32_t endindex = ubrk_last(breakiterator); michael@0: michael@0: // out-of-range indexes are never boundary positions michael@0: if (start < startindex || start > endindex || michael@0: end < startindex || end > endindex) { michael@0: return FALSE; michael@0: } michael@0: // otherwise, we can use following() on the position before the michael@0: // specified one and return true of the position we get back is the michael@0: // one the user specified michael@0: UBool result = (start == startindex || michael@0: ubrk_following(breakiterator, start - 1) == start) && michael@0: (end == endindex || michael@0: ubrk_following(breakiterator, end - 1) == end); michael@0: if (result) { michael@0: // iterates the individual ces michael@0: UCollationElements *coleiter = strsrch->utilIter; michael@0: const UChar *text = strsrch->search->text + michael@0: start; michael@0: UErrorCode status = U_ZERO_ERROR; michael@0: 
ucol_setText(coleiter, text, end - start, &status); michael@0: for (int32_t count = 0; count < strsrch->pattern.CELength; michael@0: count ++) { michael@0: int32_t ce = getCE(strsrch, ucol_next(coleiter, &status)); michael@0: if (ce == UCOL_IGNORABLE) { michael@0: count --; michael@0: continue; michael@0: } michael@0: if (U_FAILURE(status) || ce != strsrch->pattern.CE[count]) { michael@0: return FALSE; michael@0: } michael@0: } michael@0: int32_t nextce = ucol_next(coleiter, &status); michael@0: while (ucol_getOffset(coleiter) == (end - start) michael@0: && getCE(strsrch, nextce) == UCOL_IGNORABLE) { michael@0: nextce = ucol_next(coleiter, &status); michael@0: } michael@0: if (ucol_getOffset(coleiter) == (end - start) michael@0: && nextce != UCOL_NULLORDER) { michael@0: // extra collation elements at the end of the match michael@0: return FALSE; michael@0: } michael@0: } michael@0: return result; michael@0: } michael@0: #endif michael@0: return TRUE; michael@0: } michael@0: michael@0: /** michael@0: * Getting the next base character offset if current offset is an accent, michael@0: * or the current offset if the current character contains a base character. michael@0: * accents the following base character will be returned michael@0: * @param text string michael@0: * @param textoffset current offset michael@0: * @param textlength length of text string michael@0: * @return the next base character or the current offset michael@0: * if the current character is contains a base character. 
michael@0: */ michael@0: static michael@0: inline int32_t getNextBaseOffset(const UChar *text, michael@0: int32_t textoffset, michael@0: int32_t textlength) michael@0: { michael@0: if (textoffset < textlength) { michael@0: int32_t temp = textoffset; michael@0: if (getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) { michael@0: while (temp < textlength) { michael@0: int32_t result = temp; michael@0: if ((getFCD(text, &temp, textlength) >> michael@0: SECOND_LAST_BYTE_SHIFT_) == 0) { michael@0: return result; michael@0: } michael@0: } michael@0: return textlength; michael@0: } michael@0: } michael@0: return textoffset; michael@0: } michael@0: michael@0: /** michael@0: * Gets the next base character offset depending on the string search pattern michael@0: * data michael@0: * @param strsrch string search data michael@0: * @param textoffset current offset, one offset away from the last character michael@0: * to search for. michael@0: * @return start index of the next base character or the current offset michael@0: * if the current character is contains a base character. michael@0: */ michael@0: static michael@0: inline int32_t getNextUStringSearchBaseOffset(UStringSearch *strsrch, michael@0: int32_t textoffset) michael@0: { michael@0: int32_t textlength = strsrch->search->textLength; michael@0: if (strsrch->pattern.hasSuffixAccents && michael@0: textoffset < textlength) { michael@0: int32_t temp = textoffset; michael@0: const UChar *text = strsrch->search->text; michael@0: U16_BACK_1(text, 0, temp); michael@0: if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) { michael@0: return getNextBaseOffset(text, textoffset, textlength); michael@0: } michael@0: } michael@0: return textoffset; michael@0: } michael@0: michael@0: /** michael@0: * Shifting the collation element iterator position forward to prepare for michael@0: * a following match. If the last character is a unsafe character, we'll only michael@0: * shift by 1 to capture contractions, normalization etc. 
michael@0: * Internal method, status assumed to be success. michael@0: * @param text strsrch string search data michael@0: * @param textoffset start text position to do search michael@0: * @param ce the text ce which failed the match. michael@0: * @param patternceindex index of the ce within the pattern ce buffer which michael@0: * failed the match michael@0: * @return final offset michael@0: */ michael@0: static michael@0: inline int32_t shiftForward(UStringSearch *strsrch, michael@0: int32_t textoffset, michael@0: int32_t ce, michael@0: int32_t patternceindex) michael@0: { michael@0: UPattern *pattern = &(strsrch->pattern); michael@0: if (ce != UCOL_NULLORDER) { michael@0: int32_t shift = pattern->shift[hash(ce)]; michael@0: // this is to adjust for characters in the middle of the michael@0: // substring for matching that failed. michael@0: int32_t adjust = pattern->CELength - patternceindex; michael@0: if (adjust > 1 && shift >= adjust) { michael@0: shift -= adjust - 1; michael@0: } michael@0: textoffset += shift; michael@0: } michael@0: else { michael@0: textoffset += pattern->defaultShiftSize; michael@0: } michael@0: michael@0: textoffset = getNextUStringSearchBaseOffset(strsrch, textoffset); michael@0: // check for unsafe characters michael@0: // * if it is the start or middle of a contraction: to be done after michael@0: // a initial match is found michael@0: // * thai or lao base consonant character: similar to contraction michael@0: // * high surrogate character: similar to contraction michael@0: // * next character is a accent: shift to the next base character michael@0: return textoffset; michael@0: } michael@0: #endif // #if BOYER_MOORE michael@0: michael@0: /** michael@0: * sets match not found michael@0: * @param strsrch string search data michael@0: */ michael@0: static michael@0: inline void setMatchNotFound(UStringSearch *strsrch) michael@0: { michael@0: // this method resets the match result regardless of the error status. 
michael@0: strsrch->search->matchedIndex = USEARCH_DONE; michael@0: strsrch->search->matchedLength = 0; michael@0: if (strsrch->search->isForwardSearching) { michael@0: setColEIterOffset(strsrch->textIter, strsrch->search->textLength); michael@0: } michael@0: else { michael@0: setColEIterOffset(strsrch->textIter, 0); michael@0: } michael@0: } michael@0: michael@0: #if BOYER_MOORE michael@0: /** michael@0: * Gets the offset to the next safe point in text. michael@0: * ie. not the middle of a contraction, swappable characters or supplementary michael@0: * characters. michael@0: * @param collator collation sata michael@0: * @param text string to work with michael@0: * @param textoffset offset in string michael@0: * @param textlength length of text string michael@0: * @return offset to the next safe character michael@0: */ michael@0: static michael@0: inline int32_t getNextSafeOffset(const UCollator *collator, michael@0: const UChar *text, michael@0: int32_t textoffset, michael@0: int32_t textlength) michael@0: { michael@0: int32_t result = textoffset; // first contraction character michael@0: while (result != textlength && ucol_unsafeCP(text[result], collator)) { michael@0: result ++; michael@0: } michael@0: return result; michael@0: } michael@0: michael@0: /** michael@0: * This checks for accents in the potential match started with a . michael@0: * composite character. michael@0: * This is really painful... we have to check that composite character do not michael@0: * have any extra accents. We have to normalize the potential match and find michael@0: * the immediate decomposed character before the match. michael@0: * The first composite character would have been taken care of by the fcd michael@0: * checks in checkForwardExactMatch. 
* This is the slow path, taken after the fcd of the first character and of
* the last character has been checked by checkForwardExactMatch and we have
* determined that the potential match has extra non-ignorable preceding
* ces.
* E.g. looking for \u0301 acute in \u01FA A ring above and acute,
* checkExtraMatchAccents should fail since there is a middle ring in \u01FA
* Note here that accent checking is slow and cautioned in the API docs.
* Internal method, status assumed to be a success, caller should check status
* before calling this method
* @param strsrch string search data
* @param start index of the potential unfriendly composite character
* @param end index of the potential unfriendly composite character
* @param status output error status if any.
* @return TRUE if there are non-ignorable accents at the beginning
*              of the match, FALSE otherwise.
michael@0: */ michael@0: michael@0: static michael@0: UBool checkExtraMatchAccents(const UStringSearch *strsrch, int32_t start, michael@0: int32_t end, michael@0: UErrorCode *status) michael@0: { michael@0: UBool result = FALSE; michael@0: if (strsrch->pattern.hasPrefixAccents) { michael@0: int32_t length = end - start; michael@0: int32_t offset = 0; michael@0: const UChar *text = strsrch->search->text + start; michael@0: michael@0: U16_FWD_1(text, offset, length); michael@0: // we are only concerned with the first composite character michael@0: if (unorm_quickCheck(text, offset, UNORM_NFD, status) == UNORM_NO) { michael@0: int32_t safeoffset = getNextSafeOffset(strsrch->collator, michael@0: text, 0, length); michael@0: if (safeoffset != length) { michael@0: safeoffset ++; michael@0: } michael@0: UChar *norm = NULL; michael@0: UChar buffer[INITIAL_ARRAY_SIZE_]; michael@0: int32_t size = unorm_normalize(text, safeoffset, UNORM_NFD, 0, michael@0: buffer, INITIAL_ARRAY_SIZE_, michael@0: status); michael@0: if (U_FAILURE(*status)) { michael@0: return FALSE; michael@0: } michael@0: if (size >= INITIAL_ARRAY_SIZE_) { michael@0: norm = (UChar *)allocateMemory((size + 1) * sizeof(UChar), michael@0: status); michael@0: // if allocation failed, status will be set to michael@0: // U_MEMORY_ALLOCATION_ERROR and unorm_normalize internally michael@0: // checks for it. 
michael@0: size = unorm_normalize(text, safeoffset, UNORM_NFD, 0, norm, michael@0: size, status); michael@0: if (U_FAILURE(*status) && norm != NULL) { michael@0: uprv_free(norm); michael@0: return FALSE; michael@0: } michael@0: } michael@0: else { michael@0: norm = buffer; michael@0: } michael@0: michael@0: UCollationElements *coleiter = strsrch->utilIter; michael@0: ucol_setText(coleiter, norm, size, status); michael@0: uint32_t firstce = strsrch->pattern.CE[0]; michael@0: UBool ignorable = TRUE; michael@0: uint32_t ce = UCOL_IGNORABLE; michael@0: while (U_SUCCESS(*status) && ce != firstce && ce != (uint32_t)UCOL_NULLORDER) { michael@0: offset = ucol_getOffset(coleiter); michael@0: if (ce != firstce && ce != UCOL_IGNORABLE) { michael@0: ignorable = FALSE; michael@0: } michael@0: ce = ucol_next(coleiter, status); michael@0: } michael@0: UChar32 codepoint; michael@0: U16_PREV(norm, 0, offset, codepoint); michael@0: result = !ignorable && (u_getCombiningClass(codepoint) != 0); michael@0: michael@0: if (norm != buffer) { michael@0: uprv_free(norm); michael@0: } michael@0: } michael@0: } michael@0: michael@0: return result; michael@0: } michael@0: michael@0: /** michael@0: * Used by exact matches, checks if there are accents before the match. michael@0: * This is really painful... we have to check that composite characters at michael@0: * the start of the matches have to not have any extra accents. michael@0: * We check the FCD of the character first, if it starts with an accent and michael@0: * the first pattern ce does not match the first ce of the character, we bail. michael@0: * Otherwise we try normalizing the first composite michael@0: * character and find the immediate decomposed character before the match to michael@0: * see if it is an non-ignorable accent. 
michael@0: * Now normalizing the first composite character is enough because we ensure michael@0: * that when the match is passed in here with extra beginning ces, the michael@0: * first or last ce that match has to occur within the first character. michael@0: * E.g. looking for \u0301 acute in \u01FA A ring above and acute, michael@0: * checkExtraMatchAccent should fail since there is a middle ring in \u01FA michael@0: * Note here that accents checking are slow and cautioned in the API docs. michael@0: * @param strsrch string search data michael@0: * @param start offset michael@0: * @param end offset michael@0: * @return TRUE if there are accents on either side of the match, michael@0: * FALSE otherwise michael@0: */ michael@0: static michael@0: UBool hasAccentsBeforeMatch(const UStringSearch *strsrch, int32_t start, michael@0: int32_t end) michael@0: { michael@0: if (strsrch->pattern.hasPrefixAccents) { michael@0: UCollationElements *coleiter = strsrch->textIter; michael@0: UErrorCode status = U_ZERO_ERROR; michael@0: // we have been iterating forwards previously michael@0: uint32_t ignorable = TRUE; michael@0: int32_t firstce = strsrch->pattern.CE[0]; michael@0: michael@0: setColEIterOffset(coleiter, start); michael@0: int32_t ce = getCE(strsrch, ucol_next(coleiter, &status)); michael@0: if (U_FAILURE(status)) { michael@0: return TRUE; michael@0: } michael@0: while (ce != firstce) { michael@0: if (ce != UCOL_IGNORABLE) { michael@0: ignorable = FALSE; michael@0: } michael@0: ce = getCE(strsrch, ucol_next(coleiter, &status)); michael@0: if (U_FAILURE(status) || ce == UCOL_NULLORDER) { michael@0: return TRUE; michael@0: } michael@0: } michael@0: if (!ignorable && inNormBuf(coleiter)) { michael@0: // within normalization buffer, discontiguous handled here michael@0: return TRUE; michael@0: } michael@0: michael@0: // within text michael@0: int32_t temp = start; michael@0: // original code michael@0: // accent = (getFCD(strsrch->search->text, &temp, michael@0: // 
strsrch->search->textLength) michael@0: // >> SECOND_LAST_BYTE_SHIFT_); michael@0: // however this code does not work well with VC7 .net in release mode. michael@0: // maybe the inlines for getFCD combined with shifting has bugs in michael@0: // VC7. anyways this is a work around. michael@0: UBool accent = getFCD(strsrch->search->text, &temp, michael@0: strsrch->search->textLength) > 0xFF; michael@0: if (!accent) { michael@0: return checkExtraMatchAccents(strsrch, start, end, &status); michael@0: } michael@0: if (!ignorable) { michael@0: return TRUE; michael@0: } michael@0: if (start > 0) { michael@0: temp = start; michael@0: U16_BACK_1(strsrch->search->text, 0, temp); michael@0: if (getFCD(strsrch->search->text, &temp, michael@0: strsrch->search->textLength) & LAST_BYTE_MASK_) { michael@0: setColEIterOffset(coleiter, start); michael@0: ce = ucol_previous(coleiter, &status); michael@0: if (U_FAILURE(status) || michael@0: (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE)) { michael@0: return TRUE; michael@0: } michael@0: } michael@0: } michael@0: } michael@0: michael@0: return FALSE; michael@0: } michael@0: michael@0: /** michael@0: * Used by exact matches, checks if there are accents bounding the match. michael@0: * Note this is the initial boundary check. If the potential match michael@0: * starts or ends with composite characters, the accents in those michael@0: * characters will be determined later. michael@0: * Not doing backwards iteration here, since discontiguos contraction for michael@0: * backwards collation element iterator, use up too many characters. michael@0: * E.g. looking for \u030A ring in \u01FA A ring above and acute, michael@0: * should fail since there is a acute at the end of \u01FA michael@0: * Note here that accents checking are slow and cautioned in the API docs. 
michael@0: * @param strsrch string search data michael@0: * @param start offset of match michael@0: * @param end end offset of the match michael@0: * @return TRUE if there are accents on either side of the match, michael@0: * FALSE otherwise michael@0: */ michael@0: static michael@0: UBool hasAccentsAfterMatch(const UStringSearch *strsrch, int32_t start, michael@0: int32_t end) michael@0: { michael@0: if (strsrch->pattern.hasSuffixAccents) { michael@0: const UChar *text = strsrch->search->text; michael@0: int32_t temp = end; michael@0: int32_t textlength = strsrch->search->textLength; michael@0: U16_BACK_1(text, 0, temp); michael@0: if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) { michael@0: int32_t firstce = strsrch->pattern.CE[0]; michael@0: UCollationElements *coleiter = strsrch->textIter; michael@0: UErrorCode status = U_ZERO_ERROR; michael@0: int32_t ce; michael@0: setColEIterOffset(coleiter, start); michael@0: while ((ce = getCE(strsrch, ucol_next(coleiter, &status))) != firstce) { michael@0: if (U_FAILURE(status) || ce == UCOL_NULLORDER) { michael@0: return TRUE; michael@0: } michael@0: } michael@0: int32_t count = 1; michael@0: while (count < strsrch->pattern.CELength) { michael@0: if (getCE(strsrch, ucol_next(coleiter, &status)) michael@0: == UCOL_IGNORABLE) { michael@0: // Thai can give an ignorable here. 
michael@0: count --; michael@0: } michael@0: if (U_FAILURE(status)) { michael@0: return TRUE; michael@0: } michael@0: count ++; michael@0: } michael@0: michael@0: ce = ucol_next(coleiter, &status); michael@0: if (U_FAILURE(status)) { michael@0: return TRUE; michael@0: } michael@0: if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) { michael@0: ce = getCE(strsrch, ce); michael@0: } michael@0: if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) { michael@0: if (ucol_getOffset(coleiter) <= end) { michael@0: return TRUE; michael@0: } michael@0: if (getFCD(text, &end, textlength) >> SECOND_LAST_BYTE_SHIFT_) { michael@0: return TRUE; michael@0: } michael@0: } michael@0: } michael@0: } michael@0: return FALSE; michael@0: } michael@0: #endif // #if BOYER_MOORE michael@0: michael@0: /** michael@0: * Checks if the offset runs out of the text string michael@0: * @param offset michael@0: * @param textlength of the text string michael@0: * @return TRUE if offset is out of bounds, FALSE otherwise michael@0: */ michael@0: static michael@0: inline UBool isOutOfBounds(int32_t textlength, int32_t offset) michael@0: { michael@0: return offset < 0 || offset > textlength; michael@0: } michael@0: michael@0: /** michael@0: * Checks for identical match michael@0: * @param strsrch string search data michael@0: * @param start offset of possible match michael@0: * @param end offset of possible match michael@0: * @return TRUE if identical match is found michael@0: */ michael@0: static michael@0: inline UBool checkIdentical(const UStringSearch *strsrch, int32_t start, michael@0: int32_t end) michael@0: { michael@0: if (strsrch->strength != UCOL_IDENTICAL) { michael@0: return TRUE; michael@0: } michael@0: michael@0: // Note: We could use Normalizer::compare() or similar, but for short strings michael@0: // which may not be in FCD it might be faster to just NFD them. 
michael@0: UErrorCode status = U_ZERO_ERROR; michael@0: UnicodeString t2, p2; michael@0: strsrch->nfd->normalize( michael@0: UnicodeString(FALSE, strsrch->search->text + start, end - start), t2, status); michael@0: strsrch->nfd->normalize( michael@0: UnicodeString(FALSE, strsrch->pattern.text, strsrch->pattern.textLength), p2, status); michael@0: // return FALSE if NFD failed michael@0: return U_SUCCESS(status) && t2 == p2; michael@0: } michael@0: michael@0: #if BOYER_MOORE michael@0: /** michael@0: * Checks to see if the match is repeated michael@0: * @param strsrch string search data michael@0: * @param start new match start index michael@0: * @param end new match end index michael@0: * @return TRUE if the the match is repeated, FALSE otherwise michael@0: */ michael@0: static michael@0: inline UBool checkRepeatedMatch(UStringSearch *strsrch, michael@0: int32_t start, michael@0: int32_t end) michael@0: { michael@0: int32_t lastmatchindex = strsrch->search->matchedIndex; michael@0: UBool result; michael@0: if (lastmatchindex == USEARCH_DONE) { michael@0: return FALSE; michael@0: } michael@0: if (strsrch->search->isForwardSearching) { michael@0: result = start <= lastmatchindex; michael@0: } michael@0: else { michael@0: result = start >= lastmatchindex; michael@0: } michael@0: if (!result && !strsrch->search->isOverlap) { michael@0: if (strsrch->search->isForwardSearching) { michael@0: result = start < lastmatchindex + strsrch->search->matchedLength; michael@0: } michael@0: else { michael@0: result = end > lastmatchindex; michael@0: } michael@0: } michael@0: return result; michael@0: } michael@0: michael@0: /** michael@0: * Gets the collation element iterator's current offset. 
michael@0: * @param coleiter collation element iterator michael@0: * @param forwards flag TRUE if we are moving in th forwards direction michael@0: * @return current offset michael@0: */ michael@0: static michael@0: inline int32_t getColElemIterOffset(const UCollationElements *coleiter, michael@0: UBool forwards) michael@0: { michael@0: int32_t result = ucol_getOffset(coleiter); michael@0: // intricacies of the the backwards collation element iterator michael@0: if (FALSE && !forwards && inNormBuf(coleiter) && !isFCDPointerNull(coleiter)) { michael@0: result ++; michael@0: } michael@0: return result; michael@0: } michael@0: michael@0: /** michael@0: * Checks match for contraction. michael@0: * If the match ends with a partial contraction we fail. michael@0: * If the match starts too far off (because of backwards iteration) we try to michael@0: * chip off the extra characters depending on whether a breakiterator has michael@0: * been used. michael@0: * Internal method, error assumed to be success, caller has to check status michael@0: * before calling this method. michael@0: * @param strsrch string search data michael@0: * @param start offset of potential match, to be modified if necessary michael@0: * @param end offset of potential match, to be modified if necessary michael@0: * @param status output error status if any michael@0: * @return TRUE if match passes the contraction test, FALSE otherwise michael@0: */ michael@0: michael@0: static michael@0: UBool checkNextExactContractionMatch(UStringSearch *strsrch, michael@0: int32_t *start, michael@0: int32_t *end, UErrorCode *status) michael@0: { michael@0: UCollationElements *coleiter = strsrch->textIter; michael@0: int32_t textlength = strsrch->search->textLength; michael@0: int32_t temp = *start; michael@0: const UCollator *collator = strsrch->collator; michael@0: const UChar *text = strsrch->search->text; michael@0: // This part checks if either ends of the match contains potential michael@0: // contraction. 
If so we'll have to iterate through them michael@0: // The start contraction needs to be checked since ucol_previous dumps michael@0: // all characters till the first safe character into the buffer. michael@0: // *start + 1 is used to test for the unsafe characters instead of *start michael@0: // because ucol_prev takes all unsafe characters till the first safe michael@0: // character ie *start. so by testing *start + 1, we can estimate if michael@0: // excess prefix characters has been included in the potential search michael@0: // results. michael@0: if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) || michael@0: (*start + 1 < textlength michael@0: && ucol_unsafeCP(text[*start + 1], collator))) { michael@0: int32_t expansion = getExpansionPrefix(coleiter); michael@0: UBool expandflag = expansion > 0; michael@0: setColEIterOffset(coleiter, *start); michael@0: while (expansion > 0) { michael@0: // getting rid of the redundant ce, caused by setOffset. michael@0: // since backward contraction/expansion may have extra ces if we michael@0: // are in the normalization buffer, hasAccentsBeforeMatch would michael@0: // have taken care of it. michael@0: // E.g. the character \u01FA will have an expansion of 3, but if michael@0: // we are only looking for acute and ring \u030A and \u0301, we'll michael@0: // have to skip the first ce in the expansion buffer. 
michael@0: ucol_next(coleiter, status); michael@0: if (U_FAILURE(*status)) { michael@0: return FALSE; michael@0: } michael@0: if (ucol_getOffset(coleiter) != temp) { michael@0: *start = temp; michael@0: temp = ucol_getOffset(coleiter); michael@0: } michael@0: expansion --; michael@0: } michael@0: michael@0: int32_t *patternce = strsrch->pattern.CE; michael@0: int32_t patterncelength = strsrch->pattern.CELength; michael@0: int32_t count = 0; michael@0: while (count < patterncelength) { michael@0: int32_t ce = getCE(strsrch, ucol_next(coleiter, status)); michael@0: if (ce == UCOL_IGNORABLE) { michael@0: continue; michael@0: } michael@0: if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) { michael@0: *start = temp; michael@0: temp = ucol_getOffset(coleiter); michael@0: } michael@0: if (U_FAILURE(*status) || ce != patternce[count]) { michael@0: (*end) ++; michael@0: *end = getNextUStringSearchBaseOffset(strsrch, *end); michael@0: return FALSE; michael@0: } michael@0: count ++; michael@0: } michael@0: } michael@0: return TRUE; michael@0: } michael@0: michael@0: /** michael@0: * Checks and sets the match information if found. michael@0: * Checks michael@0: *