|
1 /* |
|
2 ********************************************************************** |
|
3 * Copyright (C) 1999-2011, International Business Machines Corporation |
|
4 * and others. All Rights Reserved. |
|
5 ********************************************************************** |
|
6 * Date Name Description |
|
7 * 11/17/99 aliu Creation. |
|
8 ********************************************************************** |
|
9 */ |
|
10 #ifndef RBT_PARS_H |
|
11 #define RBT_PARS_H |
|
12 |
|
13 #include "unicode/utypes.h" |
|
14 |
|
15 #if !UCONFIG_NO_TRANSLITERATION |
|
16 #ifdef __cplusplus |
|
17 |
|
18 #include "unicode/uobject.h" |
|
19 #include "unicode/parseerr.h" |
|
20 #include "unicode/unorm.h" |
|
21 #include "rbt.h" |
|
22 #include "hash.h" |
|
23 #include "uvector.h" |
|
24 |
|
25 U_NAMESPACE_BEGIN |
|
26 |
|
27 class TransliterationRuleData; |
|
28 class UnicodeFunctor; |
|
29 class ParseData; |
|
30 class RuleHalf; |
|
31 class ParsePosition; |
|
32 class StringMatcher; |
|
33 |
|
/* Rule parser for transliterators. After parse() returns, results are read
 * from the public data members declared below (dataVector, idBlockVector,
 * compoundFilter). Copying is disallowed (see the end of the class). */
34 class TransliteratorParser : public UMemory { |
|
35 |
|
36 public: |
|
37 |
|
38 /** |
|
39 * PUBLIC data member. |
|
 * A Vector of TransliterationRuleData objects, one for each discrete group |
|
40 * of rules in the rule set. |
|
41 */ |
|
42 UVector dataVector; |
|
43 |
|
44 /** |
|
45 * PUBLIC data member. |
|
46 * A Vector of UnicodeStrings containing all of the ID blocks in the rule set. |
|
47 */ |
|
48 UVector idBlockVector; |
|
49 |
|
50 /** |
|
51 * PUBLIC data member containing the parsed compound filter, if any. |
|
 * Use orphanCompoundFilter() to hand ownership of it to the caller. |
|
52 */ |
|
53 UnicodeSet* compoundFilter; |
|
54 |
|
55 private: |
|
56 |
|
57 /** |
|
58 * The current data object for which we are parsing rules. |
|
59 */ |
|
60 TransliterationRuleData* curData; |
|
61 |
|
62 UTransDirection direction; |
|
63 |
|
64 /** |
|
65 * Parse error information. |
|
66 */ |
|
67 UParseError parseError; |
|
68 |
|
69 /** |
|
70 * Temporary symbol table used during parsing. |
|
71 */ |
|
72 ParseData* parseData; |
|
73 |
|
74 /** |
|
75 * Temporary vector of matcher variables. When parsing is complete, this |
|
76 * is copied into the array data.variables. As with data.variables, |
|
77 * element 0 corresponds to character data.variablesBase. |
|
78 */ |
|
79 UVector variablesVector; |
|
80 |
|
81 /** |
|
82 * Temporary table of variable names. When parsing is complete, this is |
|
83 * copied into data.variableNames. |
|
84 */ |
|
85 Hashtable variableNames; |
|
86 |
|
87 /** |
|
88 * String of standins for segments. Used during the parsing of a single |
|
89 * rule. segmentStandins.charAt(0) is the standin for "$1" and corresponds |
|
90 * to StringMatcher object segmentObjects.elementAt(0), etc. |
|
91 */ |
|
92 UnicodeString segmentStandins; |
|
93 |
|
94 /** |
|
95 * Vector of StringMatcher objects for segments. Used during the |
|
96 * parsing of a single rule. |
|
97 * segmentStandins.charAt(0) is the standin for "$1" and corresponds |
|
98 * to StringMatcher object segmentObjects.elementAt(0), etc. |
|
99 */ |
|
100 UVector segmentObjects; |
|
101 |
|
102 /** |
|
103 * The next available stand-in for variables. This starts at some point in |
|
104 * the private use area (discovered dynamically) and increments up toward |
|
105 * <code>variableLimit</code>. At any point during parsing, available |
|
106 * variables are <code>variableNext..variableLimit-1</code>. |
|
107 */ |
|
108 UChar variableNext; |
|
109 |
|
110 /** |
|
111 * The last available stand-in for variables. This is discovered |
|
112 * dynamically. At any point during parsing, available variables are |
|
113 * <code>variableNext..variableLimit-1</code>. |
|
114 */ |
|
115 UChar variableLimit; |
|
116 |
|
117 /** |
|
118 * When we encounter an undefined variable, we do not immediately signal |
|
119 * an error, in case we are defining this variable, e.g., "$a = [a-z];". |
|
120 * Instead, we save the name of the undefined variable, and substitute |
|
121 * in the placeholder char variableLimit - 1, and decrement |
|
122 * variableLimit. |
|
123 */ |
|
124 UnicodeString undefinedVariableName; |
|
125 |
|
126 /** |
|
127 * The stand-in character for the 'dot' set, represented by '.' in |
|
128 * patterns. This is allocated the first time it is needed, and |
|
129 * reused thereafter. |
|
130 */ |
|
131 UChar dotStandIn; |
|
132 |
|
133 public: |
|
134 |
|
135 /** |
|
136 * Constructor. |
|
 * @param statusReturn error code; presumably set on failure -- TODO confirm. |
|
137 */ |
|
138 TransliteratorParser(UErrorCode &statusReturn); |
|
139 |
|
140 /** |
|
141 * Destructor. |
|
142 */ |
|
143 ~TransliteratorParser(); |
|
144 |
|
145 /** |
|
146 * Parse the given string as a sequence of rules, separated by newline |
|
147 * characters ('\n'), and cause this object to implement those rules. Any |
|
148 * previous rules are discarded. Typically this method is called exactly |
|
149 * once after construction. |
|
150 * |
|
151 * Parse the given rules, in the given direction. After this call |
|
152 * returns, query the public data members for results. The caller |
|
153 * owns the 'data' and 'compoundFilter' data members after this |
|
154 * call returns. |
|
 * NOTE(review): no member named 'data' is declared in this class; the |
|
 * public members are dataVector, idBlockVector and compoundFilter. |
|
 * Confirm which of them the ownership statement refers to. |
|
155 * @param rules rules, separated by ';' |
|
 * (NOTE(review): the text above says newline-separated; confirm.) |
|
156 * @param direction either FORWARD or REVERSE. |
|
157 * @param pe Struct to receive information on position |
|
158 * of error if an error is encountered |
|
159 * @param ec Output param set to success/failure code. |
|
160 */ |
|
161 void parse(const UnicodeString& rules, |
|
162 UTransDirection direction, |
|
163 UParseError& pe, |
|
164 UErrorCode& ec); |
|
165 |
|
166 /** |
|
167 * Return the compound filter parsed by parse(). Caller owns result. |
|
168 * @return the compound filter parsed by parse(). |
|
169 */ |
|
170 UnicodeSet* orphanCompoundFilter(); |
|
171 |
|
172 private: |
|
173 |
|
174 /** |
|
175 * Parse the given rules in the given direction. |
|
 * NOTE(review): presumably the worker called by parse() -- TODO confirm. |
|
176 * @param rules the rules to parse (input; declared const). |
|
177 * @param direction either FORWARD or REVERSE. |
|
 * @param status error code; presumably set on failure -- TODO confirm. |
|
178 */ |
|
179 void parseRules(const UnicodeString& rules, |
|
180 UTransDirection direction, |
|
181 UErrorCode& status); |
|
182 |
|
183 /** |
|
184 * MAIN PARSER. Parse the next rule in the given rule string, starting |
|
185 * at pos. Return the index after the last character parsed. Do not |
|
186 * parse characters at or after limit. |
|
187 * |
|
188 * Important: The character at pos must be a non-whitespace character |
|
189 * that is not the comment character. |
|
190 * |
|
191 * This method handles quoting, escaping, and whitespace removal. It |
|
192 * parses the end-of-rule character. It recognizes context and cursor |
|
193 * indicators. Once it does a lexical breakdown of the rule at pos, it |
|
194 * creates a rule object and adds it to our rule list. |
|
195 * @param rule the rule string to parse (input; declared const). |
|
196 * @param pos the starting position. |
|
197 * @param limit pointer past the last character of the rule. |
|
 * @param status error code; presumably set on failure -- TODO confirm. |
|
198 * @return the index after the last character parsed. |
|
199 */ |
|
200 int32_t parseRule(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status); |
|
201 |
|
202 /** |
|
203 * Set the variable range to [start, end] (inclusive). |
|
204 * @param start the start value of the range. |
|
205 * @param end the end value of the range. |
|
 * @param status error code; presumably set on failure -- TODO confirm. |
|
206 */ |
|
207 void setVariableRange(int32_t start, int32_t end, UErrorCode& status); |
|
208 |
|
209 /** |
|
210 * Assert that the given character is NOT within the variable range. |
|
211 * If it is, return FALSE. This is necessary to ensure that the |
|
212 * variable range does not overlap characters used in a rule. |
|
213 * @param ch the given character. |
|
214 * @return True, if the given character is NOT within the variable range. |
|
215 */ |
|
216 UBool checkVariableRange(UChar32 ch) const; |
|
217 |
|
218 /** |
|
219 * Set the maximum backup to 'backup', in response to a pragma |
|
220 * statement. |
|
221 * @param backup the new value to be set. |
|
222 */ |
|
223 void pragmaMaximumBackup(int32_t backup); |
|
224 |
|
225 /** |
|
226 * Begin normalizing all rules using the given mode, in response |
|
227 * to a pragma statement. |
|
228 * @param mode the given mode. |
|
229 */ |
|
230 void pragmaNormalizeRules(UNormalizationMode mode); |
|
231 |
|
232 /** |
|
233 * Return true if the given rule looks like a pragma. |
|
 * @param rule the rule string to examine. |
|
234 * @param pos offset to the first non-whitespace character |
|
235 * of the rule. |
|
236 * @param limit pointer past the last character of the rule. |
|
237 * @return true if the given rule looks like a pragma. |
|
238 */ |
|
239 static UBool resemblesPragma(const UnicodeString& rule, int32_t pos, int32_t limit); |
|
240 |
|
241 /** |
|
242 * Parse a pragma. This method assumes resemblesPragma() has |
|
243 * already returned true. |
|
 * @param rule the rule string containing the pragma. |
|
244 * @param pos offset to the first non-whitespace character |
|
245 * of the rule. |
|
246 * @param limit pointer past the last character of the rule. |
|
 * @param status error code; presumably set on failure -- TODO confirm. |
|
247 * @return the position index after the final ';' of the pragma, |
|
248 * or -1 on failure. |
|
249 */ |
|
250 int32_t parsePragma(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status); |
|
251 |
|
252 /** |
|
253 * Called by main parser upon syntax error. Search the rule string |
|
254 * for the probable end of the rule. Of course, if the error is that |
|
255 * the end of rule marker is missing, then the rule end will not be found. |
|
256 * In any case the rule start will be correctly reported. |
|
257 * @param parseErrorCode error code. |
|
258 * @param msg error description (second parameter; unnamed in this declaration). |
|
259 * @param start position of first character of current rule. |
|
 * @param status error code; presumably receives the failure -- TODO confirm. |
|
260 * @return start position of first character of current rule. |
|
261 */ |
|
262 int32_t syntaxError(UErrorCode parseErrorCode, const UnicodeString&, int32_t start, |
|
263 UErrorCode& status); |
|
264 |
|
265 /** |
|
266 * Parse a UnicodeSet out, store it, and return the stand-in character |
|
267 * used to represent it. |
|
268 * |
|
269 * @param rule the rule for UnicodeSet. |
|
270 * @param pos the position in pattern at which to start parsing. |
|
 * @param status error code; presumably set on failure -- TODO confirm. |
|
271 * @return the stand-in character used to represent it. |
|
272 */ |
|
273 UChar parseSet(const UnicodeString& rule, |
|
274 ParsePosition& pos, |
|
275 UErrorCode& status); |
|
276 |
|
277 /** |
|
278 * Generate and return a stand-in for a new UnicodeFunctor. Store |
|
279 * the matcher (adopt it). |
|
280 * @param adopted the UnicodeFunctor to be adopted. |
|
 * @param status error code; presumably set on failure -- TODO confirm. |
|
281 * @return a stand-in for a new UnicodeFunctor. |
|
282 */ |
|
283 UChar generateStandInFor(UnicodeFunctor* adopted, UErrorCode& status); |
|
284 |
|
285 /** |
|
286 * Return the standin for segment seg (1-based). |
|
287 * @param seg the given segment. |
|
 * @param status error code; presumably set on failure -- TODO confirm. |
|
288 * @return the standIn character for the given segment. |
|
289 */ |
|
290 UChar getSegmentStandin(int32_t seg, UErrorCode& status); |
|
291 |
|
292 /** |
|
293 * Set the object for segment seg (1-based). |
|
294 * @param seg the given segment. |
|
295 * @param adopted the StringMatcher to be adopted. |
|
 * @param status error code; presumably set on failure -- TODO confirm. |
|
296 */ |
|
297 void setSegmentObject(int32_t seg, StringMatcher* adopted, UErrorCode& status); |
|
298 |
|
299 /** |
|
300 * Return the stand-in for the dot set. It is allocated the first |
|
301 * time and reused thereafter. |
|
 * @param status error code; presumably set on failure -- TODO confirm. |
|
302 * @return the stand-in for the dot set. |
|
303 */ |
|
304 UChar getDotStandIn(UErrorCode& status); |
|
305 |
|
306 /** |
|
307 * Append the value of the given variable name to the given |
|
308 * UnicodeString. |
|
309 * @param name the variable name to be appended. |
|
310 * @param buf the given UnicodeString to append to. |
|
 * @param status error code; presumably set on failure -- TODO confirm. |
|
311 */ |
|
312 void appendVariableDef(const UnicodeString& name, |
|
313 UnicodeString& buf, |
|
314 UErrorCode& status); |
|
315 |
|
316 /** |
|
317 * Glue method to get around access restrictions in C++. |
|
 * NOTE(review): the declaration that follows is commented out. |
|
318 */ |
|
319 /*static Transliterator* createBasicInstance(const UnicodeString& id, |
|
320 const UnicodeString* canonID);*/ |
|
321 |
|
322 friend class RuleHalf; |
|
323 |
|
324 // Disallowed methods; no impl. |
|
325 /** |
|
326 * Copy constructor. Declared in the private section with no |
|
 * implementation, so instances cannot be copied. |
|
327 */ |
|
328 TransliteratorParser(const TransliteratorParser&); |
|
329 |
|
330 /** |
|
331 * Assignment operator. Declared in the private section with no |
|
 * implementation, so instances cannot be assigned. |
|
332 */ |
|
333 TransliteratorParser& operator=(const TransliteratorParser&); |
|
334 }; |
|
335 |
|
336 U_NAMESPACE_END |
|
337 |
|
338 #endif /* #ifdef __cplusplus */ |
|
339 |
|
340 /** |
|
341 * Strip/convert the following from the transliterator rules: |
|
342 * comments |
|
343 * newlines |
|
344 * white space at the beginning and end of a line |
|
345 * unescape \u notation |
|
346 * |
|
347 * The target buffer must be the same size as the source buffer. |
|
 * @param source the rule text to strip. |
|
 * @param sourceLen length of source, presumably in UChars -- TODO confirm. |
|
 * @param target buffer that receives the stripped rules. |
|
 * @param status error code; presumably set on failure -- TODO confirm. |
|
 * @return presumably the length of the text written to target -- TODO confirm. |
|
348 * @internal |
|
349 */ |
|
350 U_CAPI int32_t |
|
351 utrans_stripRules(const UChar *source, int32_t sourceLen, UChar *target, UErrorCode *status); |
|
352 |
|
353 #endif /* #if !UCONFIG_NO_TRANSLITERATION */ |
|
354 |
|
355 #endif |