/*
 * Copyright (C) 2001-2011, International Business Machines Corporation
 * and others. All Rights Reserved.
 **********************************************************************
 *   Date        Name        Description
 *   07/23/01    aliu        Creation.
 **********************************************************************
 */

9 #ifndef STRMATCH_H |
|
10 #define STRMATCH_H |
|
11 |
|
12 #include "unicode/utypes.h" |
|
13 |
|
14 #if !UCONFIG_NO_TRANSLITERATION |
|
15 |
|
16 #include "unicode/unistr.h" |
|
17 #include "unicode/unifunct.h" |
|
18 #include "unicode/unimatch.h" |
|
19 #include "unicode/unirepl.h" |
|
20 |
|
21 U_NAMESPACE_BEGIN |
|
22 |
|
23 class TransliterationRuleData; |
|
24 |
|
25 /** |
|
26 * An object that matches a fixed input string, implementing the |
|
27 * UnicodeMatcher API. This object also implements the |
|
28 * UnicodeReplacer API, allowing it to emit the matched text as |
|
29 * output. Since the match text may contain flexible match elements, |
|
30 * such as UnicodeSets, the emitted text is not the match pattern, but |
|
31 * instead a substring of the actual matched text. Following |
|
32 * convention, the output text is the leftmost match seen up to this |
|
33 * point. |
|
34 * |
|
35 * A StringMatcher may represent a segment, in which case it has a |
|
36 * positive segment number. This affects how the matcher converts |
|
37 * itself to a pattern but does not otherwise affect its function. |
|
38 * |
|
39 * A StringMatcher that is not a segment should not be used as a |
|
40 * UnicodeReplacer. |
|
41 */ |
|
42 class StringMatcher : public UnicodeFunctor, public UnicodeMatcher, public UnicodeReplacer { |
|
43 |
|
44 public: |
|
45 |
|
46 /** |
|
47 * Construct a matcher that matches the given pattern string. |
|
48 * @param string the pattern to be matched, possibly containing |
|
49 * stand-ins that represent nested UnicodeMatcher objects. |
|
50 * @param start inclusive start index of text to be replaced |
|
51 * @param limit exclusive end index of text to be replaced; |
|
52 * must be greater than or equal to start |
|
53 * @param segmentNum the segment number from 1..n, or 0 if this is |
|
54 * not a segment. |
|
55 * @param data context object mapping stand-ins to |
|
56 * UnicodeMatcher objects. |
|
57 */ |
|
58 StringMatcher(const UnicodeString& string, |
|
59 int32_t start, |
|
60 int32_t limit, |
|
61 int32_t segmentNum, |
|
62 const TransliterationRuleData& data); |
|
63 |
|
64 /** |
|
65 * Copy constructor |
|
66 * @param o the object to be copied. |
|
67 */ |
|
68 StringMatcher(const StringMatcher& o); |
|
69 |
|
70 /** |
|
71 * Destructor |
|
72 */ |
|
73 virtual ~StringMatcher(); |
|
74 |
|
75 /** |
|
76 * Implement UnicodeFunctor |
|
77 * @return a copy of the object. |
|
78 */ |
|
79 virtual UnicodeFunctor* clone() const; |
|
80 |
|
81 /** |
|
82 * UnicodeFunctor API. Cast 'this' to a UnicodeMatcher* pointer |
|
83 * and return the pointer. |
|
84 * @return the UnicodeMatcher point. |
|
85 */ |
|
86 virtual UnicodeMatcher* toMatcher() const; |
|
87 |
|
88 /** |
|
89 * UnicodeFunctor API. Cast 'this' to a UnicodeReplacer* pointer |
|
90 * and return the pointer. |
|
91 * @return the UnicodeReplacer pointer. |
|
92 */ |
|
93 virtual UnicodeReplacer* toReplacer() const; |
|
94 |
|
95 /** |
|
96 * Implement UnicodeMatcher |
|
97 * @param text the text to be matched |
|
98 * @param offset on input, the index into text at which to begin |
|
99 * matching. On output, the limit of the matched text. The |
|
100 * number of matched characters is the output value of offset |
|
101 * minus the input value. Offset should always point to the |
|
102 * HIGH SURROGATE (leading code unit) of a pair of surrogates, |
|
103 * both on entry and upon return. |
|
104 * @param limit the limit index of text to be matched. Greater |
|
105 * than offset for a forward direction match, less than offset for |
|
106 * a backward direction match. The last character to be |
|
107 * considered for matching will be text.charAt(limit-1) in the |
|
108 * forward direction or text.charAt(limit+1) in the backward |
|
109 * direction. |
|
110 * @param incremental if TRUE, then assume further characters may |
|
111 * be inserted at limit and check for partial matching. Otherwise |
|
112 * assume the text as given is complete. |
|
113 * @return a match degree value indicating a full match, a partial |
|
114 * match, or a mismatch. If incremental is FALSE then |
|
115 * U_PARTIAL_MATCH should never be returned. |
|
116 */ |
|
117 virtual UMatchDegree matches(const Replaceable& text, |
|
118 int32_t& offset, |
|
119 int32_t limit, |
|
120 UBool incremental); |
|
121 |
|
122 /** |
|
123 * Implement UnicodeMatcher |
|
124 * @param result Output param to receive the pattern. |
|
125 * @param escapeUnprintable if True then escape the unprintable characters. |
|
126 * @return A reference to 'result'. |
|
127 */ |
|
128 virtual UnicodeString& toPattern(UnicodeString& result, |
|
129 UBool escapeUnprintable = FALSE) const; |
|
130 |
|
131 /** |
|
132 * Implement UnicodeMatcher |
|
133 * Returns TRUE if this matcher will match a character c, where c |
|
134 * & 0xFF == v, at offset, in the forward direction (with limit > |
|
135 * offset). This is used by <tt>RuleBasedTransliterator</tt> for |
|
136 * indexing. |
|
137 * @param v the given value |
|
138 * @return TRUE if this matcher will match a character c, |
|
139 * where c & 0xFF == v |
|
140 */ |
|
141 virtual UBool matchesIndexValue(uint8_t v) const; |
|
142 |
|
143 /** |
|
144 * Implement UnicodeMatcher |
|
145 */ |
|
146 virtual void addMatchSetTo(UnicodeSet& toUnionTo) const; |
|
147 |
|
148 /** |
|
149 * Implement UnicodeFunctor |
|
150 */ |
|
151 virtual void setData(const TransliterationRuleData*); |
|
152 |
|
153 /** |
|
154 * Replace characters in 'text' from 'start' to 'limit' with the |
|
155 * output text of this object. Update the 'cursor' parameter to |
|
156 * give the cursor position and return the length of the |
|
157 * replacement text. |
|
158 * |
|
159 * @param text the text to be matched |
|
160 * @param start inclusive start index of text to be replaced |
|
161 * @param limit exclusive end index of text to be replaced; |
|
162 * must be greater than or equal to start |
|
163 * @param cursor output parameter for the cursor position. |
|
164 * Not all replacer objects will update this, but in a complete |
|
165 * tree of replacer objects, representing the entire output side |
|
166 * of a transliteration rule, at least one must update it. |
|
167 * @return the number of 16-bit code units in the text replacing |
|
168 * the characters at offsets start..(limit-1) in text |
|
169 */ |
|
170 virtual int32_t replace(Replaceable& text, |
|
171 int32_t start, |
|
172 int32_t limit, |
|
173 int32_t& cursor); |
|
174 |
|
175 /** |
|
176 * Returns a string representation of this replacer. If the |
|
177 * result of calling this function is passed to the appropriate |
|
178 * parser, typically TransliteratorParser, it will produce another |
|
179 * replacer that is equal to this one. |
|
180 * @param result the string to receive the pattern. Previous |
|
181 * contents will be deleted. |
|
182 * @param escapeUnprintable if TRUE then convert unprintable |
|
183 * character to their hex escape representations, \\uxxxx or |
|
184 * \\Uxxxxxxxx. Unprintable characters are defined by |
|
185 * Utility.isUnprintable(). |
|
186 * @return a reference to 'result'. |
|
187 */ |
|
188 virtual UnicodeString& toReplacerPattern(UnicodeString& result, |
|
189 UBool escapeUnprintable) const; |
|
190 |
|
191 /** |
|
192 * Remove any match data. This must be called before performing a |
|
193 * set of matches with this segment. |
|
194 */ |
|
195 void resetMatch(); |
|
196 |
|
197 /** |
|
198 * ICU "poor man's RTTI", returns a UClassID for the actual class. |
|
199 */ |
|
200 virtual UClassID getDynamicClassID() const; |
|
201 |
|
202 /** |
|
203 * ICU "poor man's RTTI", returns a UClassID for this class. |
|
204 */ |
|
205 static UClassID U_EXPORT2 getStaticClassID(); |
|
206 |
|
207 /** |
|
208 * Union the set of all characters that may output by this object |
|
209 * into the given set. |
|
210 * @param toUnionTo the set into which to union the output characters |
|
211 */ |
|
212 virtual void addReplacementSetTo(UnicodeSet& toUnionTo) const; |
|
213 |
|
214 private: |
|
215 |
|
216 /** |
|
217 * The text to be matched. |
|
218 */ |
|
219 UnicodeString pattern; |
|
220 |
|
221 /** |
|
222 * Context object that maps stand-ins to matcher and replacer |
|
223 * objects. |
|
224 */ |
|
225 const TransliterationRuleData* data; |
|
226 |
|
227 /** |
|
228 * The segment number, 1-based, or 0 if not a segment. |
|
229 */ |
|
230 int32_t segmentNumber; |
|
231 |
|
232 /** |
|
233 * Start offset, in the match text, of the <em>rightmost</em> |
|
234 * match. |
|
235 */ |
|
236 int32_t matchStart; |
|
237 |
|
238 /** |
|
239 * Limit offset, in the match text, of the <em>rightmost</em> |
|
240 * match. |
|
241 */ |
|
242 int32_t matchLimit; |
|
243 |
|
244 }; |
|
245 |
|
246 U_NAMESPACE_END |
|
247 |
|
248 #endif /* #if !UCONFIG_NO_TRANSLITERATION */ |
|
249 |
|
250 #endif |