| |
1 /* |
| |
2 ********************************************************************** |
| |
3 * Copyright (C) 2001-2011 IBM and others. All rights reserved. |
| |
4 ********************************************************************** |
| |
5 * Date Name Description |
| |
6 * 08/13/2001 synwee Creation. |
| |
7 ********************************************************************** |
| |
8 */ |
| |
9 #ifndef USRCHIMP_H |
| |
10 #define USRCHIMP_H |
| |
11 |
| |
12 #include "unicode/utypes.h" |
| |
13 |
| |
14 #if !UCONFIG_NO_COLLATION |
| |
15 |
| |
16 #include "unicode/normalizer2.h" |
| |
17 #include "unicode/ucol.h" |
| |
18 #include "unicode/ucoleitr.h" |
| |
19 #include "unicode/ubrk.h" |
| |
20 |
| |
/* Element count of the fixed in-struct buffers in this header:
 * UPattern::CEBuffer, UPattern::PCEBuffer and the canonical prefix/suffix
 * accent buffers of UStringSearch. */
#define INITIAL_ARRAY_SIZE_ 256
/* Element count of the UPattern::shift and UPattern::backShift tables. */
#define MAX_TABLE_SIZE_ 257
| |
23 |
| |
24 struct USearch { |
| |
25 // required since collation element iterator does not have a getText API |
| |
26 const UChar *text; |
| |
27 int32_t textLength; // exact length |
| |
28 UBool isOverlap; |
| |
29 UBool isCanonicalMatch; |
| |
30 int16_t elementComparisonType; |
| |
31 UBreakIterator *internalBreakIter; //internal character breakiterator |
| |
32 UBreakIterator *breakIter; |
| |
33 // value USEARCH_DONE is the default value |
| |
34 // if we are not at the start of the text or the end of the text, |
| |
35 // depending on the iteration direction and matchedIndex is USEARCH_DONE |
| |
36 // it means that we can't find any more matches in that particular direction |
| |
37 int32_t matchedIndex; |
| |
38 int32_t matchedLength; |
| |
39 UBool isForwardSearching; |
| |
40 UBool reset; |
| |
41 }; |
| |
42 |
| |
43 struct UPattern { |
| |
44 const UChar *text; |
| |
45 int32_t textLength; // exact length |
| |
46 // length required for backwards ce comparison |
| |
47 int32_t CELength; |
| |
48 int32_t *CE; |
| |
49 int32_t CEBuffer[INITIAL_ARRAY_SIZE_]; |
| |
50 int32_t PCELength; |
| |
51 int64_t *PCE; |
| |
52 int64_t PCEBuffer[INITIAL_ARRAY_SIZE_]; |
| |
53 UBool hasPrefixAccents; |
| |
54 UBool hasSuffixAccents; |
| |
55 int16_t defaultShiftSize; |
| |
56 int16_t shift[MAX_TABLE_SIZE_]; |
| |
57 int16_t backShift[MAX_TABLE_SIZE_]; |
| |
58 }; |
| |
59 |
| |
60 struct UStringSearch { |
| |
61 struct USearch *search; |
| |
62 struct UPattern pattern; |
| |
63 const UCollator *collator; |
| |
64 const icu::Normalizer2 *nfd; |
| |
65 // positions within the collation element iterator is used to determine |
| |
66 // if we are at the start of the text. |
| |
67 UCollationElements *textIter; |
| |
68 // utility collation element, used throughout program for temporary |
| |
69 // iteration. |
| |
70 UCollationElements *utilIter; |
| |
71 UBool ownCollator; |
| |
72 UCollationStrength strength; |
| |
73 uint32_t ceMask; |
| |
74 uint32_t variableTop; |
| |
75 UBool toShift; |
| |
76 UChar canonicalPrefixAccents[INITIAL_ARRAY_SIZE_]; |
| |
77 UChar canonicalSuffixAccents[INITIAL_ARRAY_SIZE_]; |
| |
78 }; |
| |
79 |
| |
80 /** |
| |
81 * Exact matches without checking for the ends for extra accents. |
| |
82 * The match after the position within the collation element iterator is to be |
| |
83 * found. |
| |
84 * After a match is found the offset in the collation element iterator will be |
| |
85 * shifted to the start of the match. |
| |
86 * Implementation note: |
| |
87 * For tertiary we can't use the collator->tertiaryMask, that is a |
| |
88 * preprocessed mask that takes into account case options. since we are only |
| |
89 * concerned with exact matches, we don't need that. |
| |
90 * Alternate handling - since only the 16 most significant digits is only used, |
| |
91 * we can safely do a compare without masking if the ce is a variable, we mask |
| |
92 * and get only the primary values no shifting to quartenary is required since |
| |
93 * all primary values less than variabletop will need to be masked off anyway. |
| |
94 * If the end character is composite and the pattern ce does not match the text |
| |
95 * ce, we skip it until we find a match in the end composite character or when |
| |
96 * it has passed the character. This is so that we can match pattern "a" with |
| |
97 * the text "\u00e6" |
| |
98 * @param strsrch string search data |
| |
99 * @param status error status if any |
| |
100 * @return TRUE if an exact match is found, FALSE otherwise |
| |
101 */ |
| |
102 U_CFUNC |
| |
103 UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status); |
| |
104 |
| |
105 /** |
| |
106 * Canonical matches. |
| |
107 * According to the definition, matches found here will include the whole span |
| |
108 * of beginning and ending accents if it overlaps that region. |
| |
109 * @param strsrch string search data |
| |
110 * @param status error status if any |
| |
111 * @return TRUE if a canonical match is found, FALSE otherwise |
| |
112 */ |
| |
113 U_CFUNC |
| |
114 UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status); |
| |
115 |
| |
116 /** |
| |
117 * Gets the previous match. |
| |
118 * Comments follows from handleNextExact |
| |
119 * @param strsrch string search data |
| |
120 * @param status error status if any |
| |
121 * @return True if a exact math is found, FALSE otherwise. |
| |
122 */ |
| |
123 U_CFUNC |
| |
124 UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status); |
| |
125 |
| |
126 /** |
| |
127 * Canonical matches. |
| |
128 * According to the definition, matches found here will include the whole span |
| |
129 * of beginning and ending accents if it overlaps that region. |
| |
130 * @param strsrch string search data |
| |
131 * @param status error status if any |
| |
132 * @return TRUE if a canonical match is found, FALSE otherwise |
| |
133 */ |
| |
134 U_CFUNC |
| |
135 UBool usearch_handlePreviousCanonical(UStringSearch *strsrch, |
| |
136 UErrorCode *status); |
| |
137 |
| |
138 #endif /* #if !UCONFIG_NO_COLLATION */ |
| |
139 |
| |
140 #endif |