michael@0: /* michael@0: ********************************************************************** michael@0: * Copyright (C) 2001-2011 IBM and others. All rights reserved. michael@0: ********************************************************************** michael@0: * Date Name Description michael@0: * 08/13/2001 synwee Creation. michael@0: ********************************************************************** michael@0: */ michael@0: #ifndef USRCHIMP_H michael@0: #define USRCHIMP_H michael@0: michael@0: #include "unicode/utypes.h" michael@0: michael@0: #if !UCONFIG_NO_COLLATION michael@0: michael@0: #include "unicode/normalizer2.h" michael@0: #include "unicode/ucol.h" michael@0: #include "unicode/ucoleitr.h" michael@0: #include "unicode/ubrk.h" michael@0: michael@0: #define INITIAL_ARRAY_SIZE_ 256 michael@0: #define MAX_TABLE_SIZE_ 257 michael@0: michael@0: struct USearch { michael@0: // required since collation element iterator does not have a getText API michael@0: const UChar *text; michael@0: int32_t textLength; // exact length michael@0: UBool isOverlap; michael@0: UBool isCanonicalMatch; michael@0: int16_t elementComparisonType; michael@0: UBreakIterator *internalBreakIter; //internal character breakiterator michael@0: UBreakIterator *breakIter; michael@0: // value USEARCH_DONE is the default value michael@0: // if we are not at the start of the text or the end of the text, michael@0: // depending on the iteration direction and matchedIndex is USEARCH_DONE michael@0: // it means that we can't find any more matches in that particular direction michael@0: int32_t matchedIndex; michael@0: int32_t matchedLength; michael@0: UBool isForwardSearching; michael@0: UBool reset; michael@0: }; michael@0: michael@0: struct UPattern { michael@0: const UChar *text; michael@0: int32_t textLength; // exact length michael@0: // length required for backwards ce comparison michael@0: int32_t CELength; michael@0: int32_t *CE; michael@0: int32_t CEBuffer[INITIAL_ARRAY_SIZE_]; michael@0: int32_t PCELength; michael@0: int64_t *PCE; michael@0: int64_t PCEBuffer[INITIAL_ARRAY_SIZE_]; michael@0: UBool hasPrefixAccents; michael@0: UBool hasSuffixAccents; michael@0: int16_t defaultShiftSize; michael@0: int16_t shift[MAX_TABLE_SIZE_]; michael@0: int16_t backShift[MAX_TABLE_SIZE_]; michael@0: }; michael@0: michael@0: struct UStringSearch { michael@0: struct USearch *search; michael@0: struct UPattern pattern; michael@0: const UCollator *collator; michael@0: const icu::Normalizer2 *nfd; michael@0: // positions within the collation element iterator is used to determine michael@0: // if we are at the start of the text. michael@0: UCollationElements *textIter; michael@0: // utility collation element, used throughout program for temporary michael@0: // iteration. michael@0: UCollationElements *utilIter; michael@0: UBool ownCollator; michael@0: UCollationStrength strength; michael@0: uint32_t ceMask; michael@0: uint32_t variableTop; michael@0: UBool toShift; michael@0: UChar canonicalPrefixAccents[INITIAL_ARRAY_SIZE_]; michael@0: UChar canonicalSuffixAccents[INITIAL_ARRAY_SIZE_]; michael@0: }; michael@0: michael@0: /** michael@0: * Exact matches without checking for the ends for extra accents. michael@0: * The match after the position within the collation element iterator is to be michael@0: * found. michael@0: * After a match is found the offset in the collation element iterator will be michael@0: * shifted to the start of the match. michael@0: * Implementation note: michael@0: * For tertiary we can't use the collator->tertiaryMask, that is a michael@0: * preprocessed mask that takes into account case options. since we are only michael@0: * concerned with exact matches, we don't need that. michael@0: * Alternate handling - since only the 16 most significant digits is only used, michael@0: * we can safely do a compare without masking if the ce is a variable, we mask michael@0: * and get only the primary values no shifting to quartenary is required since michael@0: * all primary values less than variabletop will need to be masked off anyway. michael@0: * If the end character is composite and the pattern ce does not match the text michael@0: * ce, we skip it until we find a match in the end composite character or when michael@0: * it has passed the character. This is so that we can match pattern "a" with michael@0: * the text "\u00e6" michael@0: * @param strsrch string search data michael@0: * @param status error status if any michael@0: * @return TRUE if an exact match is found, FALSE otherwise michael@0: */ michael@0: U_CFUNC michael@0: UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status); michael@0: michael@0: /** michael@0: * Canonical matches. michael@0: * According to the definition, matches found here will include the whole span michael@0: * of beginning and ending accents if it overlaps that region. michael@0: * @param strsrch string search data michael@0: * @param status error status if any michael@0: * @return TRUE if a canonical match is found, FALSE otherwise michael@0: */ michael@0: U_CFUNC michael@0: UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status); michael@0: michael@0: /** michael@0: * Gets the previous match. michael@0: * Comments follows from handleNextExact michael@0: * @param strsrch string search data michael@0: * @param status error status if any michael@0: * @return True if a exact math is found, FALSE otherwise. michael@0: */ michael@0: U_CFUNC michael@0: UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status); michael@0: michael@0: /** michael@0: * Canonical matches. michael@0: * According to the definition, matches found here will include the whole span michael@0: * of beginning and ending accents if it overlaps that region. michael@0: * @param strsrch string search data michael@0: * @param status error status if any michael@0: * @return TRUE if a canonical match is found, FALSE otherwise michael@0: */ michael@0: U_CFUNC michael@0: UBool usearch_handlePreviousCanonical(UStringSearch *strsrch, michael@0: UErrorCode *status); michael@0: michael@0: #endif /* #if !UCONFIG_NO_COLLATION */ michael@0: michael@0: #endif