intl/icu/source/i18n/usrchimp.h

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

michael@0 1 /*
michael@0 2 **********************************************************************
michael@0 3 * Copyright (C) 2001-2011 IBM and others. All rights reserved.
michael@0 4 **********************************************************************
michael@0 5 * Date Name Description
michael@0 6 * 08/13/2001 synwee Creation.
michael@0 7 **********************************************************************
michael@0 8 */
michael@0 9 #ifndef USRCHIMP_H
michael@0 10 #define USRCHIMP_H
michael@0 11
michael@0 12 #include "unicode/utypes.h"
michael@0 13
michael@0 14 #if !UCONFIG_NO_COLLATION
michael@0 15
michael@0 16 #include "unicode/normalizer2.h"
michael@0 17 #include "unicode/ucol.h"
michael@0 18 #include "unicode/ucoleitr.h"
michael@0 19 #include "unicode/ubrk.h"
michael@0 20
michael@0 21 #define INITIAL_ARRAY_SIZE_ 256 /* element count of CEBuffer, PCEBuffer, canonicalPrefixAccents and canonicalSuffixAccents */
michael@0 22 #define MAX_TABLE_SIZE_ 257 /* element count of the shift and backShift tables in UPattern */
michael@0 23
michael@0 24 struct USearch { // search state: the text, option flags, break iterators and the current match index/length
michael@0 25 // required since collation element iterator does not have a getText API
michael@0 26 const UChar *text;
michael@0 27 int32_t textLength; // exact length
michael@0 28 UBool isOverlap; // NOTE(review): presumably controls whether overlapping matches are reported -- confirm in usearch.cpp
michael@0 29 UBool isCanonicalMatch; // NOTE(review): presumably selects the *Canonical handlers over the *Exact ones -- confirm
michael@0 30 int16_t elementComparisonType;
michael@0 31 UBreakIterator *internalBreakIter; // internal character break iterator
michael@0 32 UBreakIterator *breakIter;
michael@0 33 // USEARCH_DONE is the default value of matchedIndex.
michael@0 34 // If matchedIndex is USEARCH_DONE and we are not at the start or the end of
michael@0 35 // the text (depending on the iteration direction), it means that we can't
michael@0 36 // find any more matches in that particular direction.
michael@0 37 int32_t matchedIndex;
michael@0 38 int32_t matchedLength;
michael@0 39 UBool isForwardSearching;
michael@0 40 UBool reset;
michael@0 41 };
michael@0 42
michael@0 43 struct UPattern { // pattern text together with its CE/PCE arrays, accent flags and shift tables
michael@0 44 const UChar *text;
michael@0 45 int32_t textLength; // exact length
michael@0 46 // length required for backwards ce comparison
michael@0 47 int32_t CELength;
michael@0 48 int32_t *CE; // NOTE(review): presumably points at CEBuffer unless more room is needed -- confirm in usearch.cpp
michael@0 49 int32_t CEBuffer[INITIAL_ARRAY_SIZE_]; // fixed-size in-struct storage of 32-bit elements
michael@0 50 int32_t PCELength;
michael@0 51 int64_t *PCE; // NOTE(review): presumably points at PCEBuffer unless more room is needed -- confirm in usearch.cpp
michael@0 52 int64_t PCEBuffer[INITIAL_ARRAY_SIZE_]; // fixed-size in-struct storage of 64-bit elements
michael@0 53 UBool hasPrefixAccents;
michael@0 54 UBool hasSuffixAccents;
michael@0 55 int16_t defaultShiftSize;
michael@0 56 int16_t shift[MAX_TABLE_SIZE_]; // MAX_TABLE_SIZE_ entries; paired with backShift below
michael@0 57 int16_t backShift[MAX_TABLE_SIZE_];
michael@0 58 };
michael@0 59
michael@0 60 struct UStringSearch { // top-level object passed to the usearch_handle* functions declared in this header
michael@0 61 struct USearch *search;
michael@0 62 struct UPattern pattern; // embedded by value, not a pointer
michael@0 63 const UCollator *collator;
michael@0 64 const icu::Normalizer2 *nfd;
michael@0 65 // positions within the collation element iterator are used to determine
michael@0 66 // if we are at the start of the text.
michael@0 67 UCollationElements *textIter;
michael@0 68 // utility collation element iterator, used throughout the program for
michael@0 69 // temporary iteration.
michael@0 70 UCollationElements *utilIter;
michael@0 71 UBool ownCollator; // NOTE(review): presumably TRUE when this object owns (and must close) collator -- confirm
michael@0 72 UCollationStrength strength;
michael@0 73 uint32_t ceMask;
michael@0 74 uint32_t variableTop;
michael@0 75 UBool toShift;
michael@0 76 UChar canonicalPrefixAccents[INITIAL_ARRAY_SIZE_];
michael@0 77 UChar canonicalSuffixAccents[INITIAL_ARRAY_SIZE_];
michael@0 78 };
michael@0 79
michael@0 80 /**
michael@0 81 * Exact matches, without checking the ends for extra accents.
michael@0 82 * The match after the position within the collation element iterator is to be
michael@0 83 * found.
michael@0 84 * After a match is found, the offset in the collation element iterator will be
michael@0 85 * shifted to the start of the match.
michael@0 86 * Implementation note:
michael@0 87 * For tertiary we can't use the collator->tertiaryMask; that is a
michael@0 88 * preprocessed mask that takes into account case options. Since we are only
michael@0 89 * concerned with exact matches, we don't need that.
michael@0 90 * Alternate handling - since only the 16 most significant digits are used,
michael@0 91 * we can safely do a compare without masking; if the ce is a variable, we mask
michael@0 92 * and get only the primary values. No shifting to quaternary is required since
michael@0 93 * all primary values less than variable top will need to be masked off anyway.
michael@0 94 * If the end character is composite and the pattern ce does not match the text
michael@0 95 * ce, we skip it until we find a match in the end composite character or when
michael@0 96 * it has passed the character. This is so that we can match pattern "a" with
michael@0 97 * the text "\u00e6".
michael@0 98 * @param strsrch string search data
michael@0 99 * @param status error status if any
michael@0 100 * @return TRUE if an exact match is found, FALSE otherwise
michael@0 101 */
michael@0 102 U_CFUNC
michael@0 103 UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status);
michael@0 104
michael@0 105 /**
michael@0 106 * Gets the next canonical match.
michael@0 107 * According to the definition, matches found here will include the whole span
michael@0 108 * of beginning and ending accents if the match overlaps that region.
michael@0 109 * @param strsrch string search data
michael@0 110 * @param status error status if any
michael@0 111 * @return TRUE if a canonical match is found, FALSE otherwise
michael@0 112 */
michael@0 113 U_CFUNC
michael@0 114 UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status);
michael@0 115
michael@0 116 /**
michael@0 117 * Gets the previous exact match.
michael@0 118 * The comments on usearch_handleNextExact apply here as well.
michael@0 119 * @param strsrch string search data
michael@0 120 * @param status error status if any
michael@0 121 * @return TRUE if an exact match is found, FALSE otherwise
michael@0 122 */
michael@0 123 U_CFUNC
michael@0 124 UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status);
michael@0 125
michael@0 126 /**
michael@0 127 * Gets the previous canonical match.
michael@0 128 * According to the definition, matches found here will include the whole span
michael@0 129 * of beginning and ending accents if the match overlaps that region.
michael@0 130 * @param strsrch string search data
michael@0 131 * @param status error status if any
michael@0 132 * @return TRUE if a canonical match is found, FALSE otherwise
michael@0 133 */
michael@0 134 U_CFUNC
michael@0 135 UBool usearch_handlePreviousCanonical(UStringSearch *strsrch,
michael@0 136 UErrorCode *status);
michael@0 137
michael@0 138 #endif /* #if !UCONFIG_NO_COLLATION */
michael@0 139
michael@0 140 #endif

mercurial