|
1 /* |
|
2 ********************************************************************** |
|
3 * Copyright (C) 2002-2013, International Business Machines |
|
4 * Corporation and others. All Rights Reserved. |
|
5 ********************************************************************** |
|
6 * file name: regex.h |
|
7 * encoding: US-ASCII |
|
8 * indentation:4 |
|
9 * |
|
10 * created on: 2002oct22 |
|
11 * created by: Andy Heninger |
|
12 * |
|
13 * ICU Regular Expressions, API for C++ |
|
14 */ |
|
15 |
|
16 #ifndef REGEX_H |
|
17 #define REGEX_H |
|
18 |
|
19 //#define REGEX_DEBUG |
|
20 |
|
21 /** |
|
22 * \file |
|
23 * \brief C++ API: Regular Expressions |
|
24 * |
|
25 * <h2>Regular Expression API</h2> |
|
26 * |
|
27 * <p>The ICU API for processing regular expressions consists of two classes, |
|
28 * <code>RegexPattern</code> and <code>RegexMatcher</code>. |
|
29 * <code>RegexPattern</code> objects represent a pre-processed, or compiled |
|
30 * regular expression. They are created from a regular expression pattern string, |
|
31 * and can be used to create <code>RegexMatcher</code> objects for the pattern.</p> |
|
32 * |
|
33 * <p>Class <code>RegexMatcher</code> bundles together a regular expression |
|
34 * pattern and a target string to which the search pattern will be applied. |
|
35 * <code>RegexMatcher</code> includes API for doing plain find or search |
|
36 * operations, for search and replace operations, and for obtaining detailed |
|
37 * information about bounds of a match. </p> |
|
38 * |
|
39 * <p>Note that by constructing <code>RegexMatcher</code> objects directly from regular |
|
40 * expression pattern strings application code can be simplified and the explicit |
|
41 * need for <code>RegexPattern</code> objects can usually be eliminated. |
|
42 * </p> |
|
43 */ |
|
44 |
|
45 #include "unicode/utypes.h" |
|
46 |
|
47 #if !UCONFIG_NO_REGULAR_EXPRESSIONS |
|
48 |
|
49 #include "unicode/uobject.h" |
|
50 #include "unicode/unistr.h" |
|
51 #include "unicode/utext.h" |
|
52 #include "unicode/parseerr.h" |
|
53 |
|
54 #include "unicode/uregex.h" |
|
55 |
|
56 // Forward Declarations |
|
57 |
|
58 U_NAMESPACE_BEGIN |
|
59 |
|
60 struct Regex8BitSet; |
|
61 class RegexCImpl; |
|
62 class RegexMatcher; |
|
63 class RegexPattern; |
|
64 struct REStackFrame; |
|
65 class RuleBasedBreakIterator; |
|
66 class UnicodeSet; |
|
67 class UVector; |
|
68 class UVector32; |
|
69 class UVector64; |
|
70 |
|
71 #ifndef U_HIDE_INTERNAL_API |
|
72 /** |
|
73 * RegexPatternDump Debug function, displays the compiled form of a pattern. |
|
* The function is declared only when REGEX_DEBUG is defined; otherwise |
|
* RegexPatternDump(pat) is a macro that expands to nothing. |
|
74 * @internal |
|
75 */ |
|
76 #ifdef REGEX_DEBUG |
|
77 U_INTERNAL void U_EXPORT2 |
|
78 RegexPatternDump(const RegexPattern *pat); |
|
79 #else |
|
80 #undef RegexPatternDump |
|
81 #define RegexPatternDump(pat) |
|
82 #endif |
|
83 #endif /* U_HIDE_INTERNAL_API */ |
|
84 |
|
85 |
|
86 |
|
87 /** |
|
88 * Class <code>RegexPattern</code> represents a compiled regular expression. It includes |
|
89 * factory methods for creating a RegexPattern object from the source (string) form |
|
90 * of a regular expression, methods for creating RegexMatchers that allow the pattern |
|
91 * to be applied to input text, and a few convenience methods for simple common |
|
92 * uses of regular expressions. |
|
93 * |
|
94 * <p>Class RegexPattern is not intended to be subclassed.</p> |
|
95 * |
|
96 * @stable ICU 2.4 |
|
97 */ |
|
98 class U_I18N_API RegexPattern: public UObject { |
|
99 public: |
|
100 |
|
101 /** |
|
102 * default constructor. Create a RegexPattern object that refers to no actual |
|
103 * pattern. Not normally needed; RegexPattern objects are usually |
|
104 * created using the factory method <code>compile()</code>. |
|
105 * |
|
106 * @stable ICU 2.4 |
|
107 */ |
|
108 RegexPattern(); |
|
109 |
|
110 /** |
|
111 * Copy Constructor. Create a new RegexPattern object that is equivalent |
|
112 * to the source object. |
|
113 * @param source the pattern object to be copied. |
|
114 * @stable ICU 2.4 |
|
115 */ |
|
116 RegexPattern(const RegexPattern &source); |
|
117 |
|
118 /** |
|
119 * Destructor. Note that a RegexPattern object must persist so long as any |
|
120 * RegexMatcher objects that were created from the RegexPattern are active. |
|
121 * @stable ICU 2.4 |
|
122 */ |
|
123 virtual ~RegexPattern(); |
|
124 |
|
125 /** |
|
126 * Comparison operator. Two RegexPattern objects are considered equal if they |
|
127 * were constructed from identical source patterns using the same match flag |
|
128 * settings. |
|
129 * @param that a RegexPattern object to compare with "this". |
|
130 * @return TRUE if the objects are equivalent. |
|
131 * @stable ICU 2.4 |
|
132 */ |
|
133 UBool operator==(const RegexPattern& that) const; |
|
134 |
|
135 /** |
|
136 * Comparison operator. Two RegexPattern objects are considered equal if they |
|
137 * were constructed from identical source patterns using the same match flag |
|
138 * settings. |
|
139 * @param that a RegexPattern object to compare with "this". |
|
140 * @return TRUE if the objects are different. |
|
141 * @stable ICU 2.4 |
|
142 */ |
|
143 inline UBool operator!=(const RegexPattern& that) const {return ! operator ==(that);} |
|
144 |
|
145 /** |
|
146 * Assignment operator. After assignment, this RegexPattern will behave identically |
|
147 * to the source object. |
|
148 * @stable ICU 2.4 |
|
149 */ |
|
150 RegexPattern &operator =(const RegexPattern &source); |
|
151 |
|
152 /** |
|
153 * Create an exact copy of this RegexPattern object. Since RegexPattern is not |
|
154 * intended to be subclasses, <code>clone()</code> and the copy construction are |
|
155 * equivalent operations. |
|
156 * @return the copy of this RegexPattern |
|
157 * @stable ICU 2.4 |
|
158 */ |
|
159 virtual RegexPattern *clone() const; |
|
160 |
|
161 |
|
162 /** |
|
163 * Compiles the regular expression in string form into a RegexPattern |
|
164 * object. These compile methods, rather than the constructors, are the usual |
|
165 * way that RegexPattern objects are created. |
|
166 * |
|
167 * <p>Note that RegexPattern objects must not be deleted while RegexMatcher |
|
168 * objects created from the pattern are active. RegexMatchers keep a pointer |
|
169 * back to their pattern, so premature deletion of the pattern is a |
|
170 * catastrophic error.</p> |
|
171 * |
|
172 * <p>All pattern match mode flags are set to their default values.</p> |
|
173 * |
|
174 * <p>Note that it is often more convenient to construct a RegexMatcher directly |
|
175 * from a pattern string rather than separately compiling the pattern and |
|
176 * then creating a RegexMatcher object from the pattern.</p> |
|
177 * |
|
178 * @param regex The regular expression to be compiled. |
|
179 * @param pe Receives the position (line and column nubers) of any error |
|
180 * within the regular expression.) |
|
181 * @param status A reference to a UErrorCode to receive any errors. |
|
182 * @return A regexPattern object for the compiled pattern. |
|
183 * |
|
184 * @stable ICU 2.4 |
|
185 */ |
|
186 static RegexPattern * U_EXPORT2 compile( const UnicodeString ®ex, |
|
187 UParseError &pe, |
|
188 UErrorCode &status); |
|
189 |
|
190 /** |
|
191 * Compiles the regular expression in string form into a RegexPattern |
|
192 * object. These compile methods, rather than the constructors, are the usual |
|
193 * way that RegexPattern objects are created. |
|
194 * |
|
195 * <p>Note that RegexPattern objects must not be deleted while RegexMatcher |
|
196 * objects created from the pattern are active. RegexMatchers keep a pointer |
|
197 * back to their pattern, so premature deletion of the pattern is a |
|
198 * catastrophic error.</p> |
|
199 * |
|
200 * <p>All pattern match mode flags are set to their default values.</p> |
|
201 * |
|
202 * <p>Note that it is often more convenient to construct a RegexMatcher directly |
|
203 * from a pattern string rather than separately compiling the pattern and |
|
204 * then creating a RegexMatcher object from the pattern.</p> |
|
205 * |
|
206 * @param regex The regular expression to be compiled. Note, the text referred |
|
207 * to by this UText must not be deleted during the lifetime of the |
|
208 * RegexPattern object or any RegexMatcher object created from it. |
|
209 * @param pe Receives the position (line and column nubers) of any error |
|
210 * within the regular expression.) |
|
211 * @param status A reference to a UErrorCode to receive any errors. |
|
212 * @return A regexPattern object for the compiled pattern. |
|
213 * |
|
214 * @stable ICU 4.6 |
|
215 */ |
|
216 static RegexPattern * U_EXPORT2 compile( UText *regex, |
|
217 UParseError &pe, |
|
218 UErrorCode &status); |
|
219 |
|
220 /** |
|
221 * Compiles the regular expression in string form into a RegexPattern |
|
222 * object using the specified match mode flags. These compile methods, |
|
223 * rather than the constructors, are the usual way that RegexPattern objects |
|
224 * are created. |
|
225 * |
|
226 * <p>Note that RegexPattern objects must not be deleted while RegexMatcher |
|
227 * objects created from the pattern are active. RegexMatchers keep a pointer |
|
228 * back to their pattern, so premature deletion of the pattern is a |
|
229 * catastrophic error.</p> |
|
230 * |
|
231 * <p>Note that it is often more convenient to construct a RegexMatcher directly |
|
232 * from a pattern string instead of than separately compiling the pattern and |
|
233 * then creating a RegexMatcher object from the pattern.</p> |
|
234 * |
|
235 * @param regex The regular expression to be compiled. |
|
236 * @param flags The match mode flags to be used. |
|
237 * @param pe Receives the position (line and column numbers) of any error |
|
238 * within the regular expression.) |
|
239 * @param status A reference to a UErrorCode to receive any errors. |
|
240 * @return A regexPattern object for the compiled pattern. |
|
241 * |
|
242 * @stable ICU 2.4 |
|
243 */ |
|
244 static RegexPattern * U_EXPORT2 compile( const UnicodeString ®ex, |
|
245 uint32_t flags, |
|
246 UParseError &pe, |
|
247 UErrorCode &status); |
|
248 |
|
249 /** |
|
250 * Compiles the regular expression in string form into a RegexPattern |
|
251 * object using the specified match mode flags. These compile methods, |
|
252 * rather than the constructors, are the usual way that RegexPattern objects |
|
253 * are created. |
|
254 * |
|
255 * <p>Note that RegexPattern objects must not be deleted while RegexMatcher |
|
256 * objects created from the pattern are active. RegexMatchers keep a pointer |
|
257 * back to their pattern, so premature deletion of the pattern is a |
|
258 * catastrophic error.</p> |
|
259 * |
|
260 * <p>Note that it is often more convenient to construct a RegexMatcher directly |
|
261 * from a pattern string instead of than separately compiling the pattern and |
|
262 * then creating a RegexMatcher object from the pattern.</p> |
|
263 * |
|
264 * @param regex The regular expression to be compiled. Note, the text referred |
|
265 * to by this UText must not be deleted during the lifetime of the |
|
266 * RegexPattern object or any RegexMatcher object created from it. |
|
267 * @param flags The match mode flags to be used. |
|
268 * @param pe Receives the position (line and column numbers) of any error |
|
269 * within the regular expression.) |
|
270 * @param status A reference to a UErrorCode to receive any errors. |
|
271 * @return A regexPattern object for the compiled pattern. |
|
272 * |
|
273 * @stable ICU 4.6 |
|
274 */ |
|
275 static RegexPattern * U_EXPORT2 compile( UText *regex, |
|
276 uint32_t flags, |
|
277 UParseError &pe, |
|
278 UErrorCode &status); |
|
279 |
|
280 /** |
|
281 * Compiles the regular expression in string form into a RegexPattern |
|
282 * object using the specified match mode flags. These compile methods, |
|
283 * rather than the constructors, are the usual way that RegexPattern objects |
|
284 * are created. |
|
285 * |
|
286 * <p>Note that RegexPattern objects must not be deleted while RegexMatcher |
|
287 * objects created from the pattern are active. RegexMatchers keep a pointer |
|
288 * back to their pattern, so premature deletion of the pattern is a |
|
289 * catastrophic error.</p> |
|
290 * |
|
291 * <p>Note that it is often more convenient to construct a RegexMatcher directly |
|
292 * from a pattern string instead of than separately compiling the pattern and |
|
293 * then creating a RegexMatcher object from the pattern.</p> |
|
294 * |
|
295 * @param regex The regular expression to be compiled. |
|
296 * @param flags The match mode flags to be used. |
|
297 * @param status A reference to a UErrorCode to receive any errors. |
|
298 * @return A regexPattern object for the compiled pattern. |
|
299 * |
|
300 * @stable ICU 2.6 |
|
301 */ |
|
302 static RegexPattern * U_EXPORT2 compile( const UnicodeString ®ex, |
|
303 uint32_t flags, |
|
304 UErrorCode &status); |
|
305 |
|
306 /** |
|
307 * Compiles the regular expression in string form into a RegexPattern |
|
308 * object using the specified match mode flags. These compile methods, |
|
309 * rather than the constructors, are the usual way that RegexPattern objects |
|
310 * are created. |
|
311 * |
|
312 * <p>Note that RegexPattern objects must not be deleted while RegexMatcher |
|
313 * objects created from the pattern are active. RegexMatchers keep a pointer |
|
314 * back to their pattern, so premature deletion of the pattern is a |
|
315 * catastrophic error.</p> |
|
316 * |
|
317 * <p>Note that it is often more convenient to construct a RegexMatcher directly |
|
318 * from a pattern string instead of than separately compiling the pattern and |
|
319 * then creating a RegexMatcher object from the pattern.</p> |
|
320 * |
|
321 * @param regex The regular expression to be compiled. Note, the text referred |
|
322 * to by this UText must not be deleted during the lifetime of the |
|
323 * RegexPattern object or any RegexMatcher object created from it. |
|
324 * @param flags The match mode flags to be used. |
|
325 * @param status A reference to a UErrorCode to receive any errors. |
|
326 * @return A regexPattern object for the compiled pattern. |
|
327 * |
|
328 * @stable ICU 4.6 |
|
329 */ |
|
330 static RegexPattern * U_EXPORT2 compile( UText *regex, |
|
331 uint32_t flags, |
|
332 UErrorCode &status); |
|
333 |
|
334 /** |
|
335 * Get the match mode flags that were used when compiling this pattern. |
|
336 * @return the match mode flags |
|
337 * @stable ICU 2.4 |
|
338 */ |
|
339 virtual uint32_t flags() const; |
|
340 |
|
341 /** |
|
342 * Creates a RegexMatcher that will match the given input against this pattern. The |
|
343 * RegexMatcher can then be used to perform match, find or replace operations |
|
344 * on the input. Note that a RegexPattern object must not be deleted while |
|
345 * RegexMatchers created from it still exist and might possibly be used again. |
|
346 * <p> |
|
347 * The matcher will retain a reference to the supplied input string, and all regexp |
|
348 * pattern matching operations happen directly on this original string. It is |
|
349 * critical that the string not be altered or deleted before use by the regular |
|
350 * expression operations is complete. |
|
351 * |
|
352 * @param input The input string to which the regular expression will be applied. |
|
353 * @param status A reference to a UErrorCode to receive any errors. |
|
354 * @return A RegexMatcher object for this pattern and input. |
|
355 * |
|
356 * @stable ICU 2.4 |
|
357 */ |
|
358 virtual RegexMatcher *matcher(const UnicodeString &input, |
|
359 UErrorCode &status) const; |
|
360 |
|
361 private: |
|
362 /** |
|
363 * Cause a compilation error if an application accidentally attempts to |
|
364 * create a matcher with a (UChar *) string as input rather than |
|
365 * a UnicodeString. Avoids a dangling reference to a temporary string. |
|
366 * <p> |
|
367 * To efficiently work with UChar *strings, wrap the data in a UnicodeString |
|
368 * using one of the aliasing constructors, such as |
|
369 * <code>UnicodeString(UBool isTerminated, const UChar *text, int32_t textLength);</code> |
|
370 * or in a UText, using |
|
371 * <code>utext_openUChars(UText *ut, const UChar *text, int64_t textLength, UErrorCode *status);</code> |
|
372 * |
|
373 */ |
|
374 RegexMatcher *matcher(const UChar *input, |
|
375 UErrorCode &status) const; |
|
376 public: |
|
377 |
|
378 |
|
379 /** |
|
380 * Creates a RegexMatcher that will match against this pattern. The |
|
381 * RegexMatcher can be used to perform match, find or replace operations. |
|
382 * Note that a RegexPattern object must not be deleted while |
|
383 * RegexMatchers created from it still exist and might possibly be used again. |
|
384 * |
|
385 * @param status A reference to a UErrorCode to receive any errors. |
|
386 * @return A RegexMatcher object for this pattern and input. |
|
387 * |
|
388 * @stable ICU 2.6 |
|
389 */ |
|
390 virtual RegexMatcher *matcher(UErrorCode &status) const; |
|
391 |
|
392 |
|
393 /** |
|
394 * Test whether a string matches a regular expression. This convenience function |
|
395 * both compiles the regular expression and applies it in a single operation. |
|
396 * Note that if the same pattern needs to be applied repeatedly, this method will be |
|
397 * less efficient than creating and reusing a RegexMatcher object. |
|
398 * |
|
399 * @param regex The regular expression |
|
400 * @param input The string data to be matched |
|
401 * @param pe Receives the position of any syntax errors within the regular expression |
|
402 * @param status A reference to a UErrorCode to receive any errors. |
|
403 * @return True if the regular expression exactly matches the full input string. |
|
404 * |
|
405 * @stable ICU 2.4 |
|
406 */ |
|
407 static UBool U_EXPORT2 matches(const UnicodeString ®ex, |
|
408 const UnicodeString &input, |
|
409 UParseError &pe, |
|
410 UErrorCode &status); |
|
411 |
|
412 /** |
|
413 * Test whether a string matches a regular expression. This convenience function |
|
414 * both compiles the regular expression and applies it in a single operation. |
|
415 * Note that if the same pattern needs to be applied repeatedly, this method will be |
|
416 * less efficient than creating and reusing a RegexMatcher object. |
|
417 * |
|
418 * @param regex The regular expression |
|
419 * @param input The string data to be matched |
|
420 * @param pe Receives the position of any syntax errors within the regular expression |
|
421 * @param status A reference to a UErrorCode to receive any errors. |
|
422 * @return True if the regular expression exactly matches the full input string. |
|
423 * |
|
424 * @stable ICU 4.6 |
|
425 */ |
|
426 static UBool U_EXPORT2 matches(UText *regex, |
|
427 UText *input, |
|
428 UParseError &pe, |
|
429 UErrorCode &status); |
|
430 |
|
431 /** |
|
432 * Returns the regular expression from which this pattern was compiled. This method will work |
|
433 * even if the pattern was compiled from a UText. |
|
434 * |
|
435 * Note: If the pattern was originally compiled from a UText, and that UText was modified, |
|
436 * the returned string may no longer reflect the RegexPattern object. |
|
437 * @stable ICU 2.4 |
|
438 */ |
|
439 virtual UnicodeString pattern() const; |
|
440 |
|
441 |
|
442 /** |
|
443 * Returns the regular expression from which this pattern was compiled. This method will work |
|
444 * even if the pattern was compiled from a UnicodeString. |
|
445 * |
|
446 * Note: This is the original input, not a clone. If the pattern was originally compiled from a |
|
447 * UText, and that UText was modified, the returned UText may no longer reflect the RegexPattern |
|
448 * object. |
|
449 * |
|
450 * @stable ICU 4.6 |
|
451 */ |
|
452 virtual UText *patternText(UErrorCode &status) const; |
|
453 |
|
454 |
|
455 /** |
|
456 * Split a string into fields. Somewhat like split() from Perl or Java. |
|
457 * Pattern matches identify delimiters that separate the input |
|
458 * into fields. The input data between the delimiters becomes the |
|
459 * fields themselves. |
|
460 * |
|
461 * If the delimiter pattern includes capture groups, the captured text will |
|
462 * also appear in the destination array of output strings, interspersed |
|
463 * with the fields. This is similar to Perl, but differs from Java, |
|
464 * which ignores the presence of capture groups in the pattern. |
|
465 * |
|
466 * Trailing empty fields will always be returned, assuming sufficient |
|
467 * destination capacity. This differs from the default behavior for Java |
|
468 * and Perl where trailing empty fields are not returned. |
|
469 * |
|
470 * The number of strings produced by the split operation is returned. |
|
471 * This count includes the strings from capture groups in the delimiter pattern. |
|
472 * This behavior differs from Java, which ignores capture groups. |
|
473 * |
|
474 * For the best performance on split() operations, |
|
475 * <code>RegexMatcher::split</code> is preferable to this function |
|
476 * |
|
477 * @param input The string to be split into fields. The field delimiters |
|
478 * match the pattern (in the "this" object) |
|
479 * @param dest An array of UnicodeStrings to receive the results of the split. |
|
480 * This is an array of actual UnicodeString objects, not an |
|
481 * array of pointers to strings. Local (stack based) arrays can |
|
482 * work well here. |
|
483 * @param destCapacity The number of elements in the destination array. |
|
484 * If the number of fields found is less than destCapacity, the |
|
485 * extra strings in the destination array are not altered. |
|
486 * If the number of destination strings is less than the number |
|
487 * of fields, the trailing part of the input string, including any |
|
488 * field delimiters, is placed in the last destination string. |
|
489 * @param status A reference to a UErrorCode to receive any errors. |
|
490 * @return The number of fields into which the input string was split. |
|
491 * @stable ICU 2.4 |
|
492 */ |
|
493 virtual int32_t split(const UnicodeString &input, |
|
494 UnicodeString dest[], |
|
495 int32_t destCapacity, |
|
496 UErrorCode &status) const; |
|
497 |
|
498 |
|
499 /** |
|
500 * Split a string into fields. Somewhat like split() from Perl or Java. |
|
501 * Pattern matches identify delimiters that separate the input |
|
502 * into fields. The input data between the delimiters becomes the |
|
503 * fields themselves. |
|
504 * |
|
505 * If the delimiter pattern includes capture groups, the captured text will |
|
506 * also appear in the destination array of output strings, interspersed |
|
507 * with the fields. This is similar to Perl, but differs from Java, |
|
508 * which ignores the presence of capture groups in the pattern. |
|
509 * |
|
510 * Trailing empty fields will always be returned, assuming sufficient |
|
511 * destination capacity. This differs from the default behavior for Java |
|
512 * and Perl where trailing empty fields are not returned. |
|
513 * |
|
514 * The number of strings produced by the split operation is returned. |
|
515 * This count includes the strings from capture groups in the delimiter pattern. |
|
516 * This behavior differs from Java, which ignores capture groups. |
|
517 * |
|
518 * For the best performance on split() operations, |
|
519 * <code>RegexMatcher::split</code> is preferable to this function |
|
520 * |
|
521 * @param input The string to be split into fields. The field delimiters |
|
522 * match the pattern (in the "this" object) |
|
523 * @param dest An array of mutable UText structs to receive the results of the split. |
|
524 * If a field is NULL, a new UText is allocated to contain the results for |
|
525 * that field. This new UText is not guaranteed to be mutable. |
|
526 * @param destCapacity The number of elements in the destination array. |
|
527 * If the number of fields found is less than destCapacity, the |
|
528 * extra strings in the destination array are not altered. |
|
529 * If the number of destination strings is less than the number |
|
530 * of fields, the trailing part of the input string, including any |
|
531 * field delimiters, is placed in the last destination string. |
|
532 * @param status A reference to a UErrorCode to receive any errors. |
|
533 * @return The number of destination strings used. |
|
534 * |
|
535 * @stable ICU 4.6 |
|
536 */ |
|
537 virtual int32_t split(UText *input, |
|
538 UText *dest[], |
|
539 int32_t destCapacity, |
|
540 UErrorCode &status) const; |
|
541 |
|
542 |
|
543 /** |
|
544 * ICU "poor man's RTTI", returns a UClassID for the actual class. |
|
545 * |
|
546 * @stable ICU 2.4 |
|
547 */ |
|
548 virtual UClassID getDynamicClassID() const; |
|
549 |
|
550 /** |
|
551 * ICU "poor man's RTTI", returns a UClassID for this class. |
|
552 * |
|
553 * @stable ICU 2.4 |
|
554 */ |
|
555 static UClassID U_EXPORT2 getStaticClassID(); |
|
556 |
|
557 private: |
|
558 // |
|
559 // Implementation Data |
|
560 // |
|
561 UText *fPattern; // The original pattern string. |
|
562 UnicodeString *fPatternString; // The original pattern UncodeString if relevant |
|
563 uint32_t fFlags; // The flags used when compiling the pattern. |
|
564 // |
|
565 UVector64 *fCompiledPat; // The compiled pattern p-code. |
|
566 UnicodeString fLiteralText; // Any literal string data from the pattern, |
|
567 // after un-escaping, for use during the match. |
|
568 |
|
569 UVector *fSets; // Any UnicodeSets referenced from the pattern. |
|
570 Regex8BitSet *fSets8; // (and fast sets for latin-1 range.) |
|
571 |
|
572 |
|
573 UErrorCode fDeferredStatus; // status if some prior error has left this |
|
574 // RegexPattern in an unusable state. |
|
575 |
|
576 int32_t fMinMatchLen; // Minimum Match Length. All matches will have length |
|
577 // >= this value. For some patterns, this calculated |
|
578 // value may be less than the true shortest |
|
579 // possible match. |
|
580 |
|
581 int32_t fFrameSize; // Size of a state stack frame in the |
|
582 // execution engine. |
|
583 |
|
584 int32_t fDataSize; // The size of the data needed by the pattern that |
|
585 // does not go on the state stack, but has just |
|
586 // a single copy per matcher. |
|
587 |
|
588 UVector32 *fGroupMap; // Map from capture group number to position of |
|
589 // the group's variables in the matcher stack frame. |
|
590 |
|
591 int32_t fMaxCaptureDigits; |
|
592 |
|
593 UnicodeSet **fStaticSets; // Ptr to static (shared) sets for predefined |
|
594 // regex character classes, e.g. Word. |
|
595 |
|
596 Regex8BitSet *fStaticSets8; // Ptr to the static (shared) latin-1 only |
|
597 // sets for predefined regex classes. |
|
598 |
|
599 int32_t fStartType; // Info on how a match must start. |
|
600 int32_t fInitialStringIdx; // |
|
601 int32_t fInitialStringLen; |
|
602 UnicodeSet *fInitialChars; |
|
603 UChar32 fInitialChar; |
|
604 Regex8BitSet *fInitialChars8; |
|
605 UBool fNeedsAltInput; |
|
606 |
|
607 friend class RegexCompile; |
|
608 friend class RegexMatcher; |
|
609 friend class RegexCImpl; |
|
610 |
|
611 // |
|
612 // Implementation Methods |
|
613 // |
|
614 void init(); // Common initialization, for use by constructors. |
|
615 void zap(); // Common cleanup |
|
616 #ifdef REGEX_DEBUG |
|
617 void dumpOp(int32_t index) const; |
|
618 friend void U_EXPORT2 RegexPatternDump(const RegexPattern *); |
|
619 #endif |
|
620 |
|
621 }; |
|
622 |
|
623 |
|
624 |
|
625 /** |
|
626 * class RegexMatcher bundles together a regular expression pattern and |
|
627 * input text to which the expression can be applied. It includes methods |
|
628 * for testing for matches, and for find and replace operations. |
|
629 * |
|
630 * <p>Class RegexMatcher is not intended to be subclassed.</p> |
|
631 * |
|
632 * @stable ICU 2.4 |
|
633 */ |
|
634 class U_I18N_API RegexMatcher: public UObject { |
|
635 public: |
|
636 |
|
637 /** |
|
638 * Construct a RegexMatcher for a regular expression. |
|
639 * This is a convenience method that avoids the need to explicitly create |
|
640 * a RegexPattern object. Note that if several RegexMatchers need to be |
|
641 * created for the same expression, it will be more efficient to |
|
642 * separately create and cache a RegexPattern object, and use |
|
643 * its matcher() method to create the RegexMatcher objects. |
|
644 * |
|
645 * @param regexp The Regular Expression to be compiled. |
|
646 * @param flags Regular expression options, such as case insensitive matching. |
|
647 * @see UREGEX_CASE_INSENSITIVE |
|
648 * @param status Any errors are reported by setting this UErrorCode variable. |
|
649 * @stable ICU 2.6 |
|
650 */ |
|
651 RegexMatcher(const UnicodeString ®exp, uint32_t flags, UErrorCode &status); |
|
652 |
|
653 /** |
|
654 * Construct a RegexMatcher for a regular expression. |
|
655 * This is a convenience method that avoids the need to explicitly create |
|
656 * a RegexPattern object. Note that if several RegexMatchers need to be |
|
657 * created for the same expression, it will be more efficient to |
|
658 * separately create and cache a RegexPattern object, and use |
|
659 * its matcher() method to create the RegexMatcher objects. |
|
660 * |
|
661 * @param regexp The regular expression to be compiled. |
|
662 * @param flags Regular expression options, such as case insensitive matching. |
|
663 * @see UREGEX_CASE_INSENSITIVE |
|
664 * @param status Any errors are reported by setting this UErrorCode variable. |
|
665 * |
|
666 * @stable ICU 4.6 |
|
667 */ |
|
668 RegexMatcher(UText *regexp, uint32_t flags, UErrorCode &status); |
|
669 |
|
670 /** |
|
671 * Construct a RegexMatcher for a regular expression. |
|
672 * This is a convenience method that avoids the need to explicitly create |
|
673 * a RegexPattern object. Note that if several RegexMatchers need to be |
|
674 * created for the same expression, it will be more efficient to |
|
675 * separately create and cache a RegexPattern object, and use |
|
676 * its matcher() method to create the RegexMatcher objects. |
|
677 * <p> |
|
678 * The matcher will retain a reference to the supplied input string, and all regexp |
|
679 * pattern matching operations happen directly on the original string. It is |
|
680 * critical that the string not be altered or deleted before use by the regular |
|
681 * expression operations is complete. |
|
682 * |
|
683 * @param regexp The Regular Expression to be compiled. |
|
684 * @param input The string to match. The matcher retains a reference to the |
|
685 * caller's string; no copy is made. |
|
686 * @param flags Regular expression options, such as case insensitive matching. |
|
687 * @see UREGEX_CASE_INSENSITIVE |
|
688 * @param status Any errors are reported by setting this UErrorCode variable. |
|
689 * @stable ICU 2.6 |
|
690 */ |
|
691 RegexMatcher(const UnicodeString ®exp, const UnicodeString &input, |
|
692 uint32_t flags, UErrorCode &status); |
|
693 |
|
694 /** |
|
695 * Construct a RegexMatcher for a regular expression. |
|
696 * This is a convenience method that avoids the need to explicitly create |
|
697 * a RegexPattern object. Note that if several RegexMatchers need to be |
|
698 * created for the same expression, it will be more efficient to |
|
699 * separately create and cache a RegexPattern object, and use |
|
700 * its matcher() method to create the RegexMatcher objects. |
|
701 * <p> |
|
702 * The matcher will make a shallow clone of the supplied input text, and all regexp |
|
703 * pattern matching operations happen on this clone. While read-only operations on |
|
704 * the supplied text are permitted, it is critical that the underlying string not be |
|
705 * altered or deleted before use by the regular expression operations is complete. |
|
706 * |
|
707 * @param regexp The Regular Expression to be compiled. |
|
708 * @param input The string to match. The matcher retains a shallow clone of the text. |
|
709 * @param flags Regular expression options, such as case insensitive matching. |
|
710 * @see UREGEX_CASE_INSENSITIVE |
|
711 * @param status Any errors are reported by setting this UErrorCode variable. |
|
712 * |
|
713 * @stable ICU 4.6 |
|
714 */ |
|
715 RegexMatcher(UText *regexp, UText *input, |
|
716 uint32_t flags, UErrorCode &status); |
|
717 |
|
718 private: |
|
719 /** |
|
720 * Private overload that causes a compilation error if an application accidentally attempts to |
|
721 * create a matcher with a (UChar *) string as input rather than |
|
722 * a UnicodeString. Avoids a dangling reference to a temporary string. |
|
723 * <p> |
|
724 * To work efficiently with (UChar *) strings, wrap the data in a UnicodeString |
|
725 * using one of the aliasing constructors, such as |
|
726 * <code>UnicodeString(UBool isTerminated, const UChar *text, int32_t textLength);</code> |
|
727 * or in a UText, using |
|
728 * <code>utext_openUChars(UText *ut, const UChar *text, int64_t textLength, UErrorCode *status);</code> |
|
729 * |
|
730 */ |
|
731 RegexMatcher(const UnicodeString &regexp, const UChar *input, |
|
732 uint32_t flags, UErrorCode &status); |
|
733 public: |
|
734 |
|
735 |
|
736 /** |
|
737 * Destructor. |
|
738 * |
|
739 * @stable ICU 2.4 |
|
740 */ |
|
741 virtual ~RegexMatcher(); |
|
742 |
|
743 |
|
744 /** |
|
745 * Attempts to match the entire input region against the pattern. |
|
746 * @param status A reference to a UErrorCode to receive any errors. |
|
747 * @return TRUE if there is a match |
|
748 * @stable ICU 2.4 |
|
749 */ |
|
750 virtual UBool matches(UErrorCode &status); |
|
751 |
|
752 |
|
753 /** |
|
754 * Resets the matcher, then attempts to match the input beginning |
|
755 * at the specified startIndex, and extending to the end of the input. |
|
756 * The input region is reset to include the entire input string. |
|
757 * A successful match must extend to the end of the input. |
|
758 * @param startIndex The input string (native) index at which to begin matching. |
|
759 * @param status A reference to a UErrorCode to receive any errors. |
|
760 * @return TRUE if there is a match |
|
761 * @stable ICU 2.8 |
|
762 */ |
|
763 virtual UBool matches(int64_t startIndex, UErrorCode &status); |
|
764 |
|
765 |
|
766 /** |
|
767 * Attempts to match the input string, starting from the beginning of the region, |
|
768 * against the pattern. Like the matches() method, this function |
|
769 * always starts at the beginning of the input region; |
|
770 * unlike that function, it does not require that the entire region be matched. |
|
771 * |
|
772 * <p>If the match succeeds then more information can be obtained via the <code>start()</code>, |
|
773 * <code>end()</code>, and <code>group()</code> functions.</p> |
|
774 * |
|
775 * @param status A reference to a UErrorCode to receive any errors. |
|
776 * @return TRUE if there is a match at the start of the input region. |
|
777 * @stable ICU 2.4 |
|
778 */ |
|
779 virtual UBool lookingAt(UErrorCode &status); |
|
780 |
|
781 |
|
782 /** |
|
783 * Attempts to match the input string, starting from the specified index, against the pattern. |
|
784 * The match may be of any length, and is not required to extend to the end |
|
785 * of the input string. Contrast with matches(). |
|
786 * |
|
787 * <p>If the match succeeds then more information can be obtained via the <code>start()</code>, |
|
788 * <code>end()</code>, and <code>group()</code> functions.</p> |
|
789 * |
|
790 * @param startIndex The input string (native) index at which to begin matching. |
|
791 * @param status A reference to a UErrorCode to receive any errors. |
|
792 * @return TRUE if there is a match. |
|
793 * @stable ICU 2.8 |
|
794 */ |
|
795 virtual UBool lookingAt(int64_t startIndex, UErrorCode &status); |
|
796 |
|
797 |
|
798 /** |
|
799 * Find the next pattern match in the input string. |
|
800 * The find begins searching the input at the location following the end of |
|
801 * the previous match, or at the start of the string if there is no previous match. |
|
802 * If a match is found, <code>start(), end()</code> and <code>group()</code> |
|
803 * will provide more information regarding the match. |
|
804 * <p>Note that if the input string is changed by the application, |
|
805 * use find(startPos, status) instead of find(), because the saved starting |
|
806 * position may not be valid with the altered input string.</p> |
|
807 * @return TRUE if a match is found. |
|
808 * @stable ICU 2.4 |
|
809 */ |
|
810 virtual UBool find(); |
|
811 |
|
812 |
|
813 /** |
|
814 * Resets this RegexMatcher and then attempts to find the next substring of the |
|
815 * input string that matches the pattern, starting at the specified index. |
|
816 * |
|
817 * @param start The (native) index in the input string to begin the search. |
|
818 * @param status A reference to a UErrorCode to receive any errors. |
|
819 * @return TRUE if a match is found. |
|
820 * @stable ICU 2.4 |
|
821 */ |
|
822 virtual UBool find(int64_t start, UErrorCode &status); |
|
823 |
|
824 |
|
825 /** |
|
826 * Returns a string containing the text matched by the previous match. |
|
827 * If the pattern can match an empty string, an empty string may be returned. |
|
828 * @param status A reference to a UErrorCode to receive any errors. |
|
829 * Possible errors are U_REGEX_INVALID_STATE if no match |
|
830 * has been attempted or the last match failed. |
|
831 * @return a string containing the matched input text. |
|
832 * @stable ICU 2.4 |
|
833 */ |
|
834 virtual UnicodeString group(UErrorCode &status) const; |
|
835 |
|
836 |
|
837 /** |
|
838 * Returns a string containing the text captured by the given group |
|
839 * during the previous match operation. Group(0) is the entire match. |
|
840 * |
|
841 * @param groupNum the capture group number |
|
842 * @param status A reference to a UErrorCode to receive any errors. |
|
843 * Possible errors are U_REGEX_INVALID_STATE if no match |
|
844 * has been attempted or the last match failed and |
|
845 * U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number. |
|
846 * @return the captured text |
|
847 * @stable ICU 2.4 |
|
848 */ |
|
849 virtual UnicodeString group(int32_t groupNum, UErrorCode &status) const; |
|
850 |
|
851 |
|
852 /** |
|
853 * Returns the number of capturing groups in this matcher's pattern. |
|
854 * @return the number of capture groups |
|
855 * @stable ICU 2.4 |
|
856 */ |
|
857 virtual int32_t groupCount() const; |
|
858 |
|
859 |
|
860 /** |
|
861 * Returns a shallow clone of the entire live input string with the UText current native index |
|
862 * set to the beginning of the requested group. |
|
863 * |
|
864 * @param dest The UText into which the input should be cloned, or NULL to create a new UText |
|
865 * @param group_len A reference to receive the length of the desired capture group |
|
866 * @param status A reference to a UErrorCode to receive any errors. |
|
867 * Possible errors are U_REGEX_INVALID_STATE if no match |
|
868 * has been attempted or the last match failed and |
|
869 * U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number. |
|
870 * @return dest if non-NULL, a shallow copy of the input text otherwise |
|
871 * |
|
872 * @stable ICU 4.6 |
|
873 */ |
|
874 virtual UText *group(UText *dest, int64_t &group_len, UErrorCode &status) const; |
|
875 |
|
876 /** |
|
877 * Returns a shallow clone of the entire live input string with the UText current native index |
|
878 * set to the beginning of the requested group. |
|
879 * |
|
880 * @param groupNum The capture group number. |
|
881 * @param dest The UText into which the input should be cloned, or NULL to create a new UText. |
|
882 * @param group_len A reference to receive the length of the desired capture group |
|
883 * @param status A reference to a UErrorCode to receive any errors. |
|
884 * Possible errors are U_REGEX_INVALID_STATE if no match |
|
885 * has been attempted or the last match failed and |
|
886 * U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number. |
|
887 * @return dest if non-NULL, a shallow copy of the input text otherwise |
|
888 * |
|
889 * @stable ICU 4.6 |
|
890 */ |
|
891 virtual UText *group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const; |
|
892 |
|
893 /** |
|
894 * Returns a string containing the text captured by the given group |
|
895 * during the previous match operation. Group(0) is the entire match. |
|
896 * |
|
897 * @param groupNum the capture group number |
|
898 * @param dest A mutable UText in which the matching text is placed. |
|
899 * If NULL, a new UText will be created (which may not be mutable). |
|
900 * @param status A reference to a UErrorCode to receive any errors. |
|
901 * Possible errors are U_REGEX_INVALID_STATE if no match |
|
902 * has been attempted or the last match failed. |
|
903 * @return A string containing the matched input text. If a pre-allocated UText |
|
904 * was provided, it will always be used and returned. |
|
905 * |
|
906 * @internal ICU 4.4 technology preview |
|
907 */ |
|
908 virtual UText *group(int32_t groupNum, UText *dest, UErrorCode &status) const; |
|
909 |
|
910 |
|
911 /** |
|
912 * Returns the index in the input string of the start of the text matched |
|
913 * during the previous match operation. |
|
914 * @param status a reference to a UErrorCode to receive any errors. |
|
915 * @return The (native) position in the input string of the start of the last match. |
|
916 * @stable ICU 2.4 |
|
917 */ |
|
918 virtual int32_t start(UErrorCode &status) const; |
|
919 |
|
920 /** |
|
921 * Returns the index in the input string of the start of the text matched |
|
922 * during the previous match operation. |
|
923 * @param status a reference to a UErrorCode to receive any errors. |
|
924 * @return The (native) position in the input string of the start of the last match. |
|
925 * @stable ICU 4.6 |
|
926 */ |
|
927 virtual int64_t start64(UErrorCode &status) const; |
|
928 |
|
929 |
|
930 /** |
|
931 * Returns the index in the input string of the start of the text matched by the |
|
932 * specified capture group during the previous match operation. Return -1 if |
|
933 * the capture group exists in the pattern, but was not part of the last match. |
|
934 * |
|
935 * @param group the capture group number |
|
936 * @param status A reference to a UErrorCode to receive any errors. Possible |
|
937 * errors are U_REGEX_INVALID_STATE if no match has been |
|
938 * attempted or the last match failed, and |
|
939 * U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number |
|
940 * @return the (native) start position of substring matched by the specified group. |
|
941 * @stable ICU 2.4 |
|
942 */ |
|
943 virtual int32_t start(int32_t group, UErrorCode &status) const; |
|
944 |
|
945 /** |
|
946 * Returns the index in the input string of the start of the text matched by the |
|
947 * specified capture group during the previous match operation. Return -1 if |
|
948 * the capture group exists in the pattern, but was not part of the last match. |
|
949 * |
|
950 * @param group the capture group number. |
|
951 * @param status A reference to a UErrorCode to receive any errors. Possible |
|
952 * errors are U_REGEX_INVALID_STATE if no match has been |
|
953 * attempted or the last match failed, and |
|
954 * U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number. |
|
955 * @return the (native) start position of substring matched by the specified group. |
|
956 * @stable ICU 4.6 |
|
957 */ |
|
958 virtual int64_t start64(int32_t group, UErrorCode &status) const; |
|
959 |
|
960 |
|
961 /** |
|
962 * Returns the index in the input string of the first character following the |
|
963 * text matched during the previous match operation. |
|
964 * |
|
965 * @param status A reference to a UErrorCode to receive any errors. Possible |
|
966 * errors are U_REGEX_INVALID_STATE if no match has been |
|
967 * attempted or the last match failed. |
|
968 * @return the index of the last character matched, plus one. |
|
969 * The index value returned is a native index, corresponding to |
|
970 * code units for the underlying encoding type, for example, |
|
971 * a byte index for UTF-8. |
|
972 * @stable ICU 2.4 |
|
973 */ |
|
974 virtual int32_t end(UErrorCode &status) const; |
|
975 |
|
976 /** |
|
977 * Returns the index in the input string of the first character following the |
|
978 * text matched during the previous match operation. |
|
979 * |
|
980 * @param status A reference to a UErrorCode to receive any errors. Possible |
|
981 * errors are U_REGEX_INVALID_STATE if no match has been |
|
982 * attempted or the last match failed. |
|
983 * @return the index of the last character matched, plus one. |
|
984 * The index value returned is a native index, corresponding to |
|
985 * code units for the underlying encoding type, for example, |
|
986 * a byte index for UTF-8. |
|
987 * @stable ICU 4.6 |
|
988 */ |
|
989 virtual int64_t end64(UErrorCode &status) const; |
|
990 |
|
991 |
|
992 /** |
|
993 * Returns the index in the input string of the character following the |
|
994 * text matched by the specified capture group during the previous match operation. |
|
995 * |
|
996 * @param group the capture group number |
|
997 * @param status A reference to a UErrorCode to receive any errors. Possible |
|
998 * errors are U_REGEX_INVALID_STATE if no match has been |
|
999 * attempted or the last match failed and |
|
1000 * U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number |
|
1001 * @return the index of the first character following the text |
|
1002 * captured by the specified group during the previous match operation. |
|
1003 * Return -1 if the capture group exists in the pattern but was not part of the match. |
|
1004 * The index value returned is a native index, corresponding to |
|
1005 * code units for the underlying encoding type, for example, |
|
1006 * a byte index for UTF-8. |
|
1007 * @stable ICU 2.4 |
|
1008 */ |
|
1009 virtual int32_t end(int32_t group, UErrorCode &status) const; |
|
1010 |
|
1011 /** |
|
1012 * Returns the index in the input string of the character following the |
|
1013 * text matched by the specified capture group during the previous match operation. |
|
1014 * |
|
1015 * @param group the capture group number |
|
1016 * @param status A reference to a UErrorCode to receive any errors. Possible |
|
1017 * errors are U_REGEX_INVALID_STATE if no match has been |
|
1018 * attempted or the last match failed and |
|
1019 * U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number |
|
1020 * @return the index of the first character following the text |
|
1021 * captured by the specified group during the previous match operation. |
|
1022 * Return -1 if the capture group exists in the pattern but was not part of the match. |
|
1023 * The index value returned is a native index, corresponding to |
|
1024 * code units for the underlying encoding type, for example, |
|
1025 * a byte index for UTF-8. |
|
1026 * @stable ICU 4.6 |
|
1027 */ |
|
1028 virtual int64_t end64(int32_t group, UErrorCode &status) const; |
|
1029 |
|
1030 |
|
1031 /** |
|
1032 * Resets this matcher. The effect is to remove any memory of previous matches, |
|
1033 * and to cause subsequent find() operations to begin at the beginning of |
|
1034 * the input string. |
|
1035 * |
|
1036 * @return this RegexMatcher. |
|
1037 * @stable ICU 2.4 |
|
1038 */ |
|
1039 virtual RegexMatcher &reset(); |
|
1040 |
|
1041 |
|
1042 /** |
|
1043 * Resets this matcher and sets the current input position. |
|
1044 * The effect is to remove any memory of previous matches, |
|
1045 * and to cause subsequent find() operations to begin at |
|
1046 * the specified (native) position in the input string. |
|
1047 * <p> |
|
1048 * The matcher's region is reset to its default, which is the entire |
|
1049 * input string. |
|
1050 * <p> |
|
1051 * An alternative to this function is to set a match region |
|
1052 * beginning at the desired index. |
|
1053 * |
|
1054 * @return this RegexMatcher. |
|
1055 * @stable ICU 2.8 |
|
1056 */ |
|
1057 virtual RegexMatcher &reset(int64_t index, UErrorCode &status); |
|
1058 |
|
1059 |
|
1060 /** |
|
1061 * Resets this matcher with a new input string. This allows instances of RegexMatcher |
|
1062 * to be reused, which is more efficient than creating a new RegexMatcher for |
|
1063 * each input string to be processed. |
|
1064 * @param input The new string on which subsequent pattern matches will operate. |
|
1065 * The matcher retains a reference to the caller's string, and operates |
|
1066 * directly on that. Ownership of the string remains with the caller. |
|
1067 * Because no copy of the string is made, it is essential that the |
|
1068 * caller not delete the string until after regexp operations on it |
|
1069 * are done. |
|
1070 * Note that while a reset on the matcher with an input string that is then |
|
1071 * modified across/during matcher operations may be supported currently for UnicodeString, |
|
1072 * this was not originally intended behavior, and support for this is not guaranteed |
|
1073 * in upcoming versions of ICU. |
|
1074 * @return this RegexMatcher. |
|
1075 * @stable ICU 2.4 |
|
1076 */ |
|
1077 virtual RegexMatcher &reset(const UnicodeString &input); |
|
1078 |
|
1079 |
|
1080 /** |
|
1081 * Resets this matcher with a new input string. This allows instances of RegexMatcher |
|
1082 * to be reused, which is more efficient than creating a new RegexMatcher for |
|
1083 * each input string to be processed. |
|
1084 * @param input The new string on which subsequent pattern matches will operate. |
|
1085 * The matcher makes a shallow clone of the given text; ownership of the |
|
1086 * original string remains with the caller. Because no deep copy of the |
|
1087 * text is made, it is essential that the caller not modify the string |
|
1088 * until after regexp operations on it are done. |
|
1089 * @return this RegexMatcher. |
|
1090 * |
|
1091 * @stable ICU 4.6 |
|
1092 */ |
|
1093 virtual RegexMatcher &reset(UText *input); |
|
1094 |
|
1095 |
|
1096 /** |
|
1097 * Set the subject text string upon which the regular expression is looking for matches |
|
1098 * without changing any other aspect of the matching state. |
|
1099 * The new and previous text strings must have the same content. |
|
1100 * |
|
1101 * This function is intended for use in environments where ICU is operating on |
|
1102 * strings that may move around in memory. It provides a mechanism for notifying |
|
1103 * ICU that the string has been relocated, and providing a new UText to access the |
|
1104 * string in its new position. |
|
1105 * |
|
1106 * Note that the regular expression implementation never copies the underlying text |
|
1107 * of a string being matched, but always operates directly on the original text |
|
1108 * provided by the user. Refreshing simply drops the references to the old text |
|
1109 * and replaces them with references to the new. |
|
1110 * |
|
1111 * Caution: this function is normally used only by very specialized, |
|
1112 * system-level code. One example use case is with garbage collection that moves |
|
1113 * the text in memory. |
|
1114 * |
|
1115 * @param input The new (moved) text string. |
|
1116 * @param status Receives errors detected by this function. |
|
1117 * |
|
1118 * @stable ICU 4.8 |
|
1119 */ |
|
1120 virtual RegexMatcher &refreshInputText(UText *input, UErrorCode &status); |
|
1121 |
|
1122 private: |
|
1123 /** |
|
1124 * Cause a compilation error if an application accidentally attempts to |
|
1125 * reset a matcher with a (UChar *) string as input rather than |
|
1126 * a UnicodeString. Avoids a dangling reference to a temporary string. |
|
1127 * <p> |
|
1128 * To efficiently work with UChar *strings, wrap the data in a UnicodeString |
|
1129 * using one of the aliasing constructors, such as |
|
1130 * <code>UnicodeString(UBool isTerminated, const UChar *text, int32_t textLength);</code> |
|
1131 * or in a UText, using |
|
1132 * <code>utext_openUChars(UText *ut, const UChar *text, int64_t textLength, UErrorCode *status);</code> |
|
1133 * |
|
1134 */ |
|
1135 RegexMatcher &reset(const UChar *input); |
|
1136 public: |
|
1137 |
|
1138 /** |
|
1139 * Returns the input string being matched. Ownership of the string belongs to |
|
1140 * the matcher; it should not be altered or deleted. This method will work even if the input |
|
1141 * was originally supplied as a UText. |
|
1142 * @return the input string |
|
1143 * @stable ICU 2.4 |
|
1144 */ |
|
1145 virtual const UnicodeString &input() const; |
|
1146 |
|
1147 /** |
|
1148 * Returns the input string being matched. This is the live input text; it should not be |
|
1149 * altered or deleted. This method will work even if the input was originally supplied as |
|
1150 * a UnicodeString. |
|
1151 * @return the input text |
|
1152 * |
|
1153 * @stable ICU 4.6 |
|
1154 */ |
|
1155 virtual UText *inputText() const; |
|
1156 |
|
1157 /** |
|
1158 * Returns the input string being matched, either by copying it into the provided |
|
1159 * UText parameter or by returning a shallow clone of the live input. Note that copying |
|
1160 * the entire input may cause significant performance and memory issues. |
|
1161 * @param dest The UText into which the input should be copied, or NULL to create a new UText |
|
1162 * @param status error code |
|
1163 * @return dest if non-NULL, a shallow copy of the input text otherwise |
|
1164 * |
|
1165 * @stable ICU 4.6 |
|
1166 */ |
|
1167 virtual UText *getInput(UText *dest, UErrorCode &status) const; |
|
1168 |
|
1169 |
|
1170 /** Sets the limits of this matcher's region. |
|
1171 * The region is the part of the input string that will be searched to find a match. |
|
1172 * Invoking this method resets the matcher, and then sets the region to start |
|
1173 * at the index specified by the start parameter and end at the index specified |
|
1174 * by the limit parameter. |
|
1175 * |
|
1176 * Depending on the transparency and anchoring being used (see useTransparentBounds |
|
1177 * and useAnchoringBounds), certain constructs such as anchors may behave differently |
|
1178 * at or around the boundaries of the region. |
|
1179 * |
|
1180 * The function will fail if start is greater than limit, or if either index |
|
1181 * is less than zero or greater than the length of the string being matched. |
|
1182 * |
|
1183 * @param start The (native) index to begin searches at. |
|
1184 * @param limit The (native) index to end searches at (exclusive). |
|
1185 * @param status A reference to a UErrorCode to receive any errors. |
|
1186 * @stable ICU 4.0 |
|
1187 */ |
|
1188 virtual RegexMatcher &region(int64_t start, int64_t limit, UErrorCode &status); |
|
1189 |
|
1190 /** |
|
1191 * Identical to region(start, limit, status) but also allows a start position without |
|
1192 * resetting the region state. |
|
1193 * @param regionStart The (native) index at which the region starts (inclusive). |
|
1194 * @param regionLimit The (native) index at which the region ends (exclusive). |
|
1195 * @param startIndex The (native) index within the region bounds at which to begin searches. |
|
1196 * @param status A reference to a UErrorCode to receive any errors. |
|
1197 * If startIndex is not within the specified region bounds, |
|
1198 * U_INDEX_OUTOFBOUNDS_ERROR is returned. |
|
1199 * @stable ICU 4.6 |
|
1200 */ |
|
1201 virtual RegexMatcher &region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status); |
|
1202 |
|
1203 /** |
|
1204 * Reports the start index of this matcher's region. The searches this matcher |
|
1205 * conducts are limited to finding matches within regionStart (inclusive) and |
|
1206 * regionEnd (exclusive). |
|
1207 * |
|
1208 * @return The starting (native) index of this matcher's region. |
|
1209 * @stable ICU 4.0 |
|
1210 */ |
|
1211 virtual int32_t regionStart() const; |
|
1212 |
|
1213 /** |
|
1214 * Reports the start index of this matcher's region. The searches this matcher |
|
1215 * conducts are limited to finding matches within regionStart (inclusive) and |
|
1216 * regionEnd (exclusive). |
|
1217 * |
|
1218 * @return The starting (native) index of this matcher's region. |
|
1219 * @stable ICU 4.6 |
|
1220 */ |
|
1221 virtual int64_t regionStart64() const; |
|
1222 |
|
1223 |
|
1224 /** |
|
1225 * Reports the end (limit) index (exclusive) of this matcher's region. The searches |
|
1226 * this matcher conducts are limited to finding matches within regionStart |
|
1227 * (inclusive) and regionEnd (exclusive). |
|
1228 * |
|
1229 * @return The ending point (native) of this matcher's region. |
|
1230 * @stable ICU 4.0 |
|
1231 */ |
|
1232 virtual int32_t regionEnd() const; |
|
1233 |
|
1234 /** |
|
1235 * Reports the end (limit) index (exclusive) of this matcher's region. The searches |
|
1236 * this matcher conducts are limited to finding matches within regionStart |
|
1237 * (inclusive) and regionEnd (exclusive). |
|
1238 * |
|
1239 * @return The ending point (native) of this matcher's region. |
|
1240 * @stable ICU 4.6 |
|
1241 */ |
|
1242 virtual int64_t regionEnd64() const; |
|
1243 |
|
1244 /** |
|
1245 * Queries the transparency of region bounds for this matcher. |
|
1246 * See useTransparentBounds for a description of transparent and opaque bounds. |
|
1247 * By default, a matcher uses opaque region boundaries. |
|
1248 * |
|
1249 * @return TRUE if this matcher is using transparent bounds, FALSE if it is using opaque bounds. |
|
1250 * @stable ICU 4.0 |
|
1251 */ |
|
1252 virtual UBool hasTransparentBounds() const; |
|
1253 |
|
1254 /** |
|
1255 * Sets the transparency of region bounds for this matcher. |
|
1256 * Invoking this function with an argument of true will set this matcher to use transparent bounds. |
|
1257 * If the boolean argument is false, then opaque bounds will be used. |
|
1258 * |
|
1259 * Using transparent bounds, the boundaries of this matcher's region are transparent |
|
1260 * to lookahead, lookbehind, and boundary matching constructs. Those constructs can |
|
1261 * see text beyond the boundaries of the region while checking for a match. |
|
1262 * |
|
1263 * With opaque bounds, no text outside of the matcher's region is visible to lookahead, |
|
1264 * lookbehind, and boundary matching constructs. |
|
1265 * |
|
1266 * By default, a matcher uses opaque bounds. |
|
1267 * |
|
1268 * @param b TRUE for transparent bounds; FALSE for opaque bounds |
|
1269 * @return This Matcher. |
|
1270 * @stable ICU 4.0 |
|
1271 **/ |
|
1272 virtual RegexMatcher &useTransparentBounds(UBool b); |
|
1273 |
|
1274 |
|
1275 /** |
|
1276 * Return true if this matcher is using anchoring bounds. |
|
1277 * By default, matchers use anchoring region bounds. |
|
1278 * |
|
1279 * @return TRUE if this matcher is using anchoring bounds. |
|
1280 * @stable ICU 4.0 |
|
1281 */ |
|
1282 virtual UBool hasAnchoringBounds() const; |
|
1283 |
|
1284 |
|
1285 /** |
|
1286 * Set whether this matcher is using Anchoring Bounds for its region. |
|
1287 * With anchoring bounds, pattern anchors such as ^ and $ will match at the start |
|
1288 * and end of the region. Without Anchoring Bounds, anchors will only match at |
|
1289 * the positions they would in the complete text. |
|
1290 * |
|
1291 * Anchoring Bounds are the default for regions. |
|
1292 * |
|
1293 * @param b TRUE to enable anchoring bounds; FALSE to disable them. |
|
1294 * @return This Matcher |
|
1295 * @stable ICU 4.0 |
|
1296 */ |
|
1297 virtual RegexMatcher &useAnchoringBounds(UBool b); |
|
1298 |
|
1299 |
|
1300 /** |
|
1301 * Return TRUE if the most recent matching operation attempted to access |
|
1302 * additional input beyond the available input text. |
|
1303 * In this case, additional input text could change the results of the match. |
|
1304 * |
|
1305 * hitEnd() is defined for both successful and unsuccessful matches. |
|
1306 * In either case hitEnd() will return TRUE if the end of the text was |
|
1307 * reached at any point during the matching process. |
|
1308 * |
|
1309 * @return TRUE if the most recent match hit the end of input |
|
1310 * @stable ICU 4.0 |
|
1311 */ |
|
1312 virtual UBool hitEnd() const; |
|
1313 |
|
1314 /** |
|
1315 * Return TRUE if the most recent match succeeded and additional input could cause |
|
1316 * it to fail. If this method returns false and a match was found, then more input |
|
1317 * might change the match but the match won't be lost. If a match was not found, |
|
1318 * then requireEnd has no meaning. |
|
1319 * |
|
1320 * @return TRUE if more input could cause the most recent match to no longer match. |
|
1321 * @stable ICU 4.0 |
|
1322 */ |
|
1323 virtual UBool requireEnd() const; |
|
1324 |
|
1325 |
|
1326 /** |
|
1327 * Returns the pattern that is interpreted by this matcher. |
|
1328 * @return the RegexPattern for this RegexMatcher |
|
1329 * @stable ICU 2.4 |
|
1330 */ |
|
1331 virtual const RegexPattern &pattern() const; |
|
1332 |
|
1333 |
|
1334 /** |
|
1335 * Replaces every substring of the input that matches the pattern |
|
1336 * with the given replacement string. This is a convenience function that |
|
1337 * provides a complete find-and-replace-all operation. |
|
1338 * |
|
1339 * This method first resets this matcher. It then scans the input string |
|
1340 * looking for matches of the pattern. Input that is not part of any |
|
1341 * match is left unchanged; each match is replaced in the result by the |
|
1342 * replacement string. The replacement string may contain references to |
|
1343 * capture groups. |
|
1344 * |
|
1345 * @param replacement a string containing the replacement text. |
|
1346 * @param status a reference to a UErrorCode to receive any errors. |
|
1347 * @return a string containing the results of the find and replace. |
|
1348 * @stable ICU 2.4 |
|
1349 */ |
|
1350 virtual UnicodeString replaceAll(const UnicodeString &replacement, UErrorCode &status); |
|
1351 |
|
1352 |
|
1353 /** |
|
1354 * Replaces every substring of the input that matches the pattern |
|
1355 * with the given replacement string. This is a convenience function that |
|
1356 * provides a complete find-and-replace-all operation. |
|
1357 * |
|
1358 * This method first resets this matcher. It then scans the input string |
|
1359 * looking for matches of the pattern. Input that is not part of any |
|
1360 * match is left unchanged; each match is replaced in the result by the |
|
1361 * replacement string. The replacement string may contain references to |
|
1362 * capture groups. |
|
1363 * |
|
1364 * @param replacement a string containing the replacement text. |
|
1365 * @param dest a mutable UText in which the results are placed. |
|
1366 * If NULL, a new UText will be created (which may not be mutable). |
|
1367 * @param status a reference to a UErrorCode to receive any errors. |
|
1368 * @return a string containing the results of the find and replace. |
|
1369 * If a pre-allocated UText was provided, it will always be used and returned. |
|
1370 * |
|
1371 * @stable ICU 4.6 |
|
1372 */ |
|
1373 virtual UText *replaceAll(UText *replacement, UText *dest, UErrorCode &status); |
|
1374 |
|
1375 |
|
1376 /** |
|
1377 * Replaces the first substring of the input that matches |
|
1378 * the pattern with the replacement string. This is a convenience |
|
1379 * function that provides a complete find-and-replace operation. |
|
1380 * |
|
1381 * <p>This function first resets this RegexMatcher. It then scans the input string |
|
1382 * looking for a match of the pattern. Input that is not part |
|
1383 * of the match is appended directly to the result string; the match is replaced |
|
1384 * in the result by the replacement string. The replacement string may contain |
|
1385 * references to captured groups.</p> |
|
1386 * |
|
1387 * <p>The state of the matcher (the position at which a subsequent find() |
|
1388 * would begin) after completing a replaceFirst() is not specified. The |
|
1389 * RegexMatcher should be reset before doing additional find() operations.</p> |
|
1390 * |
|
1391 * @param replacement a string containing the replacement text. |
|
1392 * @param status a reference to a UErrorCode to receive any errors. |
|
1393 * @return a string containing the results of the find and replace. |
|
1394 * @stable ICU 2.4 |
|
1395 */ |
|
1396 virtual UnicodeString replaceFirst(const UnicodeString &replacement, UErrorCode &status); |
|
1397 |
|
1398 |
|
1399 /** |

1400 * Replaces the first substring of the input that matches |

1401 * the pattern with the replacement text. This is a convenience |

1402 * function that provides a complete find-and-replace operation. |

1403 * |

1404 * <p>This function first resets this RegexMatcher. It then scans the input |

1405 * looking for a match of the pattern. Input that is not part |

1406 * of the match is appended directly to the result text; the match is replaced |

1407 * in the result by the replacement text. The replacement text may contain |

1408 * references to captured groups.</p> |

1409 * |

1410 * <p>The state of the matcher (the position at which a subsequent find() |

1411 * would begin) after completing a replaceFirst() is not specified. The |

1412 * RegexMatcher should be reset before doing additional find() operations.</p> |

1413 * |

1414 * @param replacement a UText containing the replacement text. |

1415 * @param dest a mutable UText in which the results are placed. |

1416 * If NULL, a new UText will be created (which may not be mutable). |

1417 * @param status a reference to a UErrorCode to receive any errors. |

1418 * @return a UText containing the results of the find and replace. |

1419 * If a pre-allocated UText was provided, it will always be used and returned. |

1420 * |

1421 * @stable ICU 4.6 |

1422 */ |

1423 virtual UText *replaceFirst(UText *replacement, UText *dest, UErrorCode &status); |
|
1424 |
|
1425 |
|
1426 /** |

1427 * Implements a replace operation intended to be used as part of an |

1428 * incremental find-and-replace. |

1429 * |

1430 * <p>The input string, starting from the end of the previous replacement and ending at |

1431 * the start of the current match, is appended to the destination string. Then the |

1432 * replacement string is appended to the destination string, |

1433 * including handling any substitutions of captured text.</p> |

1434 * |

1435 * <p>For simple, prepackaged, non-incremental find-and-replace |

1436 * operations, see replaceFirst() or replaceAll().</p> |

1437 * |

1438 * @param dest A UnicodeString to which the results of the find-and-replace are appended. |

1439 * @param replacement A UnicodeString that provides the text to be substituted for |

1440 * the input text that matched the regexp pattern. The replacement |

1441 * text may contain references to captured text from the |

1442 * input. |

1443 * @param status A reference to a UErrorCode to receive any errors. Possible |

1444 * errors are U_REGEX_INVALID_STATE if no match has been |

1445 * attempted or the last match failed, and U_INDEX_OUTOFBOUNDS_ERROR |

1446 * if the replacement text specifies a capture group that |

1447 * does not exist in the pattern. |

1448 * |

1449 * @return this RegexMatcher |

1450 * @stable ICU 2.4 |

1451 * |

1452 */ |

1453 virtual RegexMatcher &appendReplacement(UnicodeString &dest, |

1454 const UnicodeString &replacement, UErrorCode &status); |
|
1455 |
|
1456 |
|
1457 /** |

1458 * Implements a replace operation intended to be used as part of an |

1459 * incremental find-and-replace. |

1460 * |

1461 * <p>The input text, starting from the end of the previous replacement and ending at |

1462 * the start of the current match, is appended to the destination UText. Then the |

1463 * replacement text is appended to the destination UText, |

1464 * including handling any substitutions of captured text.</p> |

1465 * |

1466 * <p>For simple, prepackaged, non-incremental find-and-replace |

1467 * operations, see replaceFirst() or replaceAll().</p> |

1468 * |

1469 * @param dest A mutable UText to which the results of the find-and-replace are appended. |

1470 * Must not be NULL. |

1471 * @param replacement A UText that provides the text to be substituted for |

1472 * the input text that matched the regexp pattern. The replacement |

1473 * text may contain references to captured text from the input. |

1474 * @param status A reference to a UErrorCode to receive any errors. Possible |

1475 * errors are U_REGEX_INVALID_STATE if no match has been |

1476 * attempted or the last match failed, and U_INDEX_OUTOFBOUNDS_ERROR |

1477 * if the replacement text specifies a capture group that |

1478 * does not exist in the pattern. |

1479 * |

1480 * @return this RegexMatcher |

1481 * |

1482 * @stable ICU 4.6 |

1483 */ |

1484 virtual RegexMatcher &appendReplacement(UText *dest, |

1485 UText *replacement, UErrorCode &status); |
|
1486 |
|
1487 |
|
1488 /** |
|
1489 * As the final step in a find-and-replace operation, append the remainder |
|
1490 * of the input string, starting at the position following the last appendReplacement(), |
|
1491 * to the destination string. <code>appendTail()</code> is intended to be invoked after one |
|
1492 * or more invocations of the <code>RegexMatcher::appendReplacement()</code>. |
|
1493 * |
|
1494 * @param dest A UnicodeString to which the results of the find-and-replace are appended. |
|
1495 * @return the destination string. |
|
1496 * @stable ICU 2.4 |
|
1497 */ |
|
1498 virtual UnicodeString &appendTail(UnicodeString &dest); |
|
1499 |
|
1500 |
|
1501 /** |

1502 * As the final step in a find-and-replace operation, append the remainder |

1503 * of the input text, starting at the position following the last appendReplacement(), |

1504 * to the destination UText. <code>appendTail()</code> is intended to be invoked after one |

1505 * or more invocations of <code>RegexMatcher::appendReplacement()</code>. |

1506 * |

1507 * @param dest A mutable UText to which the results of the find-and-replace are appended. |

1508 * Must not be NULL. |

1509 * @param status A reference to a UErrorCode to receive any errors. |

1510 * @return the destination UText. |

1511 * |

1512 * @stable ICU 4.6 |

1513 */ |

1514 virtual UText *appendTail(UText *dest, UErrorCode &status); |
|
1515 |
|
1516 |
|
1517 /** |
|
1518 * Split a string into fields. Somewhat like split() from Perl. |
|
1519 * The pattern matches identify delimiters that separate the input |
|
1520 * into fields. The input data between the matches becomes the |
|
1521 * fields themselves. |
|
1522 * |
|
1523 * @param input The string to be split into fields. The field delimiters |
|
1524 * match the pattern (in the "this" object). This matcher |
|
1525 * will be reset to this input string. |
|
1526 * @param dest An array of UnicodeStrings to receive the results of the split. |
|
1527 * This is an array of actual UnicodeString objects, not an |
|
1528 * array of pointers to strings. Local (stack based) arrays can |
|
1529 * work well here. |
|
1530 * @param destCapacity The number of elements in the destination array. |
|
1531 * If the number of fields found is less than destCapacity, the |
|
1532 * extra strings in the destination array are not altered. |
|
1533 * If the number of destination strings is less than the number |
|
1534 * of fields, the trailing part of the input string, including any |
|
1535 * field delimiters, is placed in the last destination string. |
|
1536 * @param status A reference to a UErrorCode to receive any errors. |
|
1537 * @return The number of fields into which the input string was split. |
|
1538 * @stable ICU 2.6 |
|
1539 */ |
|
1540 virtual int32_t split(const UnicodeString &input, |
|
1541 UnicodeString dest[], |
|
1542 int32_t destCapacity, |
|
1543 UErrorCode &status); |
|
1544 |
|
1545 |
|
1546 /** |

1547 * Split a text into fields. Somewhat like split() from Perl. |

1548 * The pattern matches identify delimiters that separate the input |

1549 * into fields. The input data between the matches becomes the |

1550 * fields themselves. |

1551 * |

1552 * @param input The text to be split into fields. The field delimiters |

1553 * match the pattern (in the "this" object). This matcher |

1554 * will be reset to this input text. |

1555 * @param dest An array of mutable UText structs to receive the results of the split. |

1556 * If a field is NULL, a new UText is allocated to contain the results for |

1557 * that field. This new UText is not guaranteed to be mutable. |

1558 * @param destCapacity The number of elements in the destination array. |

1559 * If the number of fields found is less than destCapacity, the |

1560 * extra elements in the destination array are not altered. |

1561 * If the number of destination elements is less than the number |

1562 * of fields, the trailing part of the input text, including any |

1563 * field delimiters, is placed in the last destination element. |

1564 * @param status A reference to a UErrorCode to receive any errors. |

1565 * @return The number of fields into which the input text was split. |

1566 * |

1567 * @stable ICU 4.6 |

1568 */ |

1569 virtual int32_t split(UText *input, |

1570 UText *dest[], |

1571 int32_t destCapacity, |

1572 UErrorCode &status); |
|
1573 |
|
1574 /** |
|
1575 * Set a processing time limit for match operations with this Matcher. |
|
1576 * |
|
1577 * Some patterns, when matching certain strings, can run in exponential time. |
|
1578 * For practical purposes, the match operation may appear to be in an |
|
1579 * infinite loop. |
|
1580 * When a limit is set a match operation will fail with an error if the |
|
1581 * limit is exceeded. |
|
1582 * <p> |
|
1583 * The units of the limit are steps of the match engine. |
|
1584 * Correspondence with actual processor time will depend on the speed |
|
1585 * of the processor and the details of the specific pattern, but will |
|
1586 * typically be on the order of milliseconds. |
|
1587 * <p> |
|
1588 * By default, the matching time is not limited. |
|
1589 * <p> |
|
1590 * |
|
1591 * @param limit The limit value, or 0 for no limit. |
|
1592 * @param status A reference to a UErrorCode to receive any errors. |
|
1593 * @stable ICU 4.0 |
|
1594 */ |
|
1595 virtual void setTimeLimit(int32_t limit, UErrorCode &status); |
|
1596 |
|
1597 /** |
|
1598 * Get the time limit, if any, for match operations made with this Matcher. |
|
1599 * |
|
1600 * @return the maximum allowed time for a match, in units of processing steps. |
|
1601 * @stable ICU 4.0 |
|
1602 */ |
|
1603 virtual int32_t getTimeLimit() const; |
|
1604 |
|
1605 /** |

1606 * Set the amount of heap storage available for use by the match backtracking stack. |

1607 * The matcher is also reset, discarding any results from previous matches. |

1608 * <p> |

1609 * ICU uses a backtracking regular expression engine, with the backtrack stack |

1610 * maintained on the heap. This function sets the limit to the amount of memory |

1611 * that can be used for this purpose. A backtracking stack overflow will |

1612 * result in an error from the match operation that caused it. |

1613 * <p> |

1614 * A limit is desirable because a malicious or poorly designed pattern can use |

1615 * excessive memory, potentially crashing the process. A limit is enabled |

1616 * by default. |

1617 * <p> |

1618 * @param limit The maximum size, in bytes, of the matching backtrack stack. |

1619 * A value of zero means no limit. |

1620 * The limit must be greater than or equal to zero. |

1621 * |

1622 * @param status A reference to a UErrorCode to receive any errors. |

1623 * |

1624 * @stable ICU 4.0 |

1625 */ |

1626 virtual void setStackLimit(int32_t limit, UErrorCode &status); |
|
1627 |
|
1628 /** |

1629 * Get the size of the heap storage available for use by the backtracking stack. |

1630 * |

1631 * @return the maximum backtracking stack size, in bytes, or zero if the |

1632 * stack size is unlimited. |

1633 * @stable ICU 4.0 |

1634 */ |

1635 virtual int32_t getStackLimit() const; |
|
1636 |
|
1637 |
|
1638 /** |
|
1639 * Set a callback function for use with this Matcher. |
|
1640 * During matching operations the function will be called periodically, |
|
1641 * giving the application the opportunity to terminate a long-running |
|
1642 * match. |
|
1643 * |
|
1644 * @param callback A pointer to the user-supplied callback function. |
|
1645 * @param context User context pointer. The value supplied at the |
|
1646 * time the callback function is set will be saved |
|
1647 * and passed to the callback each time that it is called. |
|
1648 * @param status A reference to a UErrorCode to receive any errors. |
|
1649 * @stable ICU 4.0 |
|
1650 */ |
|
1651 virtual void setMatchCallback(URegexMatchCallback *callback, |
|
1652 const void *context, |
|
1653 UErrorCode &status); |
|
1654 |
|
1655 |
|
1656 /** |

1657 * Get the match callback function for this RegexMatcher. |

1658 * |

1659 * @param callback Out parameter, receives a pointer to the user-supplied |

1660 * callback function. |

1661 * @param context Out parameter, receives the user context pointer that |

1662 * was set when setMatchCallback() was called. |

1663 * @param status A reference to a UErrorCode to receive any errors. |

1664 * @stable ICU 4.0 |

1665 */ |

1666 virtual void getMatchCallback(URegexMatchCallback *&callback, |

1667 const void *&context, |

1668 UErrorCode &status); |
|
1669 |
|
1670 |
|
1671 /** |
|
1672 * Set a progress callback function for use with find operations on this Matcher. |
|
1673 * During find operations, the callback will be invoked after each return from a |
|
1674 * match attempt, giving the application the opportunity to terminate a long-running |
|
1675 * find operation. |
|
1676 * |
|
1677 * @param callback A pointer to the user-supplied callback function. |
|
1678 * @param context User context pointer. The value supplied at the |
|
1679 * time the callback function is set will be saved |
|
1680 * and passed to the callback each time that it is called. |
|
1681 * @param status A reference to a UErrorCode to receive any errors. |
|
1682 * @stable ICU 4.6 |
|
1683 */ |
|
1684 virtual void setFindProgressCallback(URegexFindProgressCallback *callback, |
|
1685 const void *context, |
|
1686 UErrorCode &status); |
|
1687 |
|
1688 |
|
1689 /** |

1690 * Get the find progress callback function for this RegexMatcher. |

1691 * |

1692 * @param callback Out parameter, receives a pointer to the user-supplied |

1693 * callback function. |

1694 * @param context Out parameter, receives the user context pointer that |

1695 * was set when setFindProgressCallback() was called. |

1696 * @param status A reference to a UErrorCode to receive any errors. |

1697 * @stable ICU 4.6 |

1698 */ |

1699 virtual void getFindProgressCallback(URegexFindProgressCallback *&callback, |

1700 const void *&context, |

1701 UErrorCode &status); |
|
1702 |
|
1703 #ifndef U_HIDE_INTERNAL_API |

1704 /** |

1705 * setTrace Debug function, enable/disable tracing of the matching engine. |

1706 * For internal ICU development use only. DO NOT USE! |

1707 * @internal |

1708 */ |

1709 void setTrace(UBool state); |

1710 #endif /* U_HIDE_INTERNAL_API */ |
|
1711 |
|
1712 /** |
|
1713 * ICU "poor man's RTTI", returns a UClassID for this class. |
|
1714 * |
|
1715 * @stable ICU 2.2 |
|
1716 */ |
|
1717 static UClassID U_EXPORT2 getStaticClassID(); |
|
1718 |
|
1719 /** |
|
1720 * ICU "poor man's RTTI", returns a UClassID for the actual class. |
|
1721 * |
|
1722 * @stable ICU 2.2 |
|
1723 */ |
|
1724 virtual UClassID getDynamicClassID() const; |
|
1725 |
|
1726 private: |
|
1727 // Constructors and other object boilerplate are private. |
|
1728 // Instances of RegexMatcher can not be assigned, copied, cloned, etc. |
|
1729 RegexMatcher(); // default constructor not implemented |
|
1730 RegexMatcher(const RegexPattern *pat); |
|
1731 RegexMatcher(const RegexMatcher &other); |
|
1732 RegexMatcher &operator =(const RegexMatcher &rhs); |
|
1733 void init(UErrorCode &status); // Common initialization |
|
1734 void init2(UText *t, UErrorCode &e); // Common initialization, part 2. |
|
1735 |
|
1736 friend class RegexPattern; |
|
1737 friend class RegexCImpl; |
|
1738 public: |
|
1739 #ifndef U_HIDE_INTERNAL_API |
|
1740 /** @internal */ |
|
1741 void resetPreserveRegion(); // Reset matcher state, but preserve any region. |
|
1742 #endif /* U_HIDE_INTERNAL_API */ |
|
1743 private: |
|
1744 |
|
1745 // |
|
1746 // MatchAt This is the internal interface to the match engine itself. |
|
1747 // Match status comes back in matcher member variables. |
|
1748 // |
|
1749 void MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status); |
|
1750 inline void backTrack(int64_t &inputIdx, int32_t &patIdx); |
|
1751 UBool isWordBoundary(int64_t pos); // perform Perl-like \b test |
|
1752 UBool isUWordBoundary(int64_t pos); // perform RBBI based \b test |
|
1753 REStackFrame *resetStack(); |
|
1754 inline REStackFrame *StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status); |
|
1755 void IncrementTime(UErrorCode &status); |
|
1756 UBool ReportFindProgress(int64_t matchIndex, UErrorCode &status); |
|
1757 |
|
1758 int64_t appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const; |
|
1759 |
|
1760 UBool findUsingChunk(); |
|
1761 void MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status); |
|
1762 UBool isChunkWordBoundary(int32_t pos); |
|
1763 |
|
1764 const RegexPattern *fPattern; |
|
1765 RegexPattern *fPatternOwned; // Non-NULL if this matcher owns the pattern, and |
|
1766 // should delete it when through. |
|
1767 |
|
1768 const UnicodeString *fInput; // The string being matched. Only used for input() |
|
1769 UText *fInputText; // The text being matched. Is never NULL. |
|
1770 UText *fAltInputText; // A shallow copy of the text being matched. |
|
1771 // Only created if the pattern contains backreferences. |
|
1772 int64_t fInputLength; // Full length of the input text. |
|
1773 int32_t fFrameSize; // The size of a frame in the backtrack stack. |
|
1774 |
|
1775 int64_t fRegionStart; // Start of the input region, default = 0. |
|
1776 int64_t fRegionLimit; // End of input region, default to input.length. |
|
1777 |
|
1778 int64_t fAnchorStart; // Region bounds for anchoring operations (^ or $). |
|
1779 int64_t fAnchorLimit; // See useAnchoringBounds |
|
1780 |
|
1781 int64_t fLookStart; // Region bounds for look-ahead/behind and |

1782 int64_t fLookLimit; // other boundary tests. See |

1783 // useTransparentBounds |
|
1784 |
|
1785 int64_t fActiveStart; // Currently active bounds for matching. |
|
1786 int64_t fActiveLimit; // Usually is the same as region, but |
|
1787 // is changed to fLookStart/Limit when |
|
1788 // entering look around regions. |
|
1789 |
|
1790 UBool fTransparentBounds; // True if using transparent bounds. |
|
1791 UBool fAnchoringBounds; // True if using anchoring bounds. |
|
1792 |
|
1793 UBool fMatch; // True if the last attempted match was successful. |
|
1794 int64_t fMatchStart; // Position of the start of the most recent match |
|
1795 int64_t fMatchEnd; // First position after the end of the most recent match |
|
1796 // Zero if no previous match, even when a region |
|
1797 // is active. |
|
1798 int64_t fLastMatchEnd; // First position after the end of the previous match, |
|
1799 // or -1 if there was no previous match. |
|
1800 int64_t fAppendPosition; // First position after the end of the previous |
|
1801 // appendReplacement(). As described by the |
|
1802 // JavaDoc for Java Matcher, where it is called |
|
1803 // "append position" |
|
1804 UBool fHitEnd; // True if the last match touched the end of input. |
|
1805 UBool fRequireEnd; // True if the last match required end-of-input |
|
1806 // (matched $ or Z) |
|
1807 |
|
1808 UVector64 *fStack; |
|
1809 REStackFrame *fFrame; // After finding a match, the last active stack frame, |
|
1810 // which will contain the capture group results. |
|
1811 // NOT valid while match engine is running. |
|
1812 |
|
1813 int64_t *fData; // Data area for use by the compiled pattern. |
|
1814 int64_t fSmallData[8]; // Use this for data if it's enough. |
|
1815 |
|
1816 int32_t fTimeLimit; // Max time (in arbitrary steps) to let the |
|
1817 // match engine run. Zero for unlimited. |
|
1818 |
|
1819 int32_t fTime; // Match time, accumulates while matching. |
|
1820 int32_t fTickCounter; // Low bits counter for time. Counts down StateSaves. |
|
1821 // Kept separately from fTime to keep as much |
|
1822 // code as possible out of the inline |
|
1823 // StateSave function. |
|
1824 |
|
1825 int32_t fStackLimit; // Maximum memory size to use for the backtrack |
|
1826 // stack, in bytes. Zero for unlimited. |
|
1827 |
|
1828 URegexMatchCallback *fCallbackFn; // Pointer to match callback function. |

1829 // NULL if there is no callback. |

1830 const void *fCallbackContext; // User context ptr for match callback function. |

1831 |

1832 URegexFindProgressCallback *fFindProgressCallbackFn; // Pointer to find progress callback function. |

1833 // NULL if there is no callback. |

1834 const void *fFindProgressCallbackContext; // User context ptr for find progress callback function. |
|
1835 |
|
1836 |
|
1837 UBool fInputUniStrMaybeMutable; // Set when fInputText wraps a UnicodeString that may be mutable - compatibility. |
|
1838 |
|
1839 UBool fTraceDebug; // Set true for debug tracing of match engine. |
|
1840 |
|
1841 UErrorCode fDeferredStatus; // Save error state that cannot be immediately |
|
1842 // reported, or that permanently disables this matcher. |
|
1843 |
|
1844 RuleBasedBreakIterator *fWordBreakItr; |
|
1845 }; |
|
1846 |
|
1847 U_NAMESPACE_END |
|
1848 #endif // UCONFIG_NO_REGULAR_EXPRESSIONS |
|
1849 #endif |