1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/i18n/unicode/uregex.h Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,1591 @@ 1.4 +/* 1.5 +********************************************************************** 1.6 +* Copyright (C) 2004-2013, International Business Machines 1.7 +* Corporation and others. All Rights Reserved. 1.8 +********************************************************************** 1.9 +* file name: uregex.h 1.10 +* encoding: US-ASCII 1.11 +* indentation:4 1.12 +* 1.13 +* created on: 2004mar09 1.14 +* created by: Andy Heninger 1.15 +* 1.16 +* ICU Regular Expressions, API for C 1.17 +*/ 1.18 + 1.19 +/** 1.20 + * \file 1.21 + * \brief C API: Regular Expressions 1.22 + * 1.23 + * <p>This is a C wrapper around the C++ RegexPattern and RegexMatcher classes.</p> 1.24 + */ 1.25 + 1.26 +#ifndef UREGEX_H 1.27 +#define UREGEX_H 1.28 + 1.29 +#include "unicode/utext.h" 1.30 +#include "unicode/utypes.h" 1.31 + 1.32 +#if !UCONFIG_NO_REGULAR_EXPRESSIONS 1.33 + 1.34 +#include "unicode/localpointer.h" 1.35 +#include "unicode/parseerr.h" 1.36 + 1.37 +struct URegularExpression; 1.38 +/** 1.39 + * Structure representing a compiled regular expression, plus the results 1.40 + * of a match operation. 1.41 + * @stable ICU 3.0 1.42 + */ 1.43 +typedef struct URegularExpression URegularExpression; 1.44 + 1.45 + 1.46 +/** 1.47 + * Constants for Regular Expression Match Modes. 1.48 + * @stable ICU 2.4 1.49 + */ 1.50 +typedef enum URegexpFlag{ 1.51 + 1.52 +#ifndef U_HIDE_DRAFT_API 1.53 + /** Forces normalization of pattern and strings. 1.54 + Not implemented yet, just a placeholder, hence draft. 1.55 + @draft ICU 2.4 */ 1.56 + UREGEX_CANON_EQ = 128, 1.57 +#endif /* U_HIDE_DRAFT_API */ 1.58 + /** Enable case insensitive matching. @stable ICU 2.4 */ 1.59 + UREGEX_CASE_INSENSITIVE = 2, 1.60 + 1.61 + /** Allow white space and comments within patterns @stable ICU 2.4 */ 1.62 + UREGEX_COMMENTS = 4, 1.63 + 1.64 + /** If set, '.' 
matches line terminators, otherwise '.' matching stops at line end. 1.65 + * @stable ICU 2.4 */ 1.66 + UREGEX_DOTALL = 32, 1.67 + 1.68 + /** If set, treat the entire pattern as a literal string. 1.69 + * Metacharacters or escape sequences in the input sequence will be given 1.70 + * no special meaning. 1.71 + * 1.72 + * The flag UREGEX_CASE_INSENSITIVE retains its impact 1.73 + * on matching when used in conjunction with this flag. 1.74 + * The other flags become superfluous. 1.75 + * 1.76 + * @stable ICU 4.0 1.77 + */ 1.78 + UREGEX_LITERAL = 16, 1.79 + 1.80 + /** Control behavior of "$" and "^" 1.81 + * If set, recognize line terminators within string, 1.82 + * otherwise, match only at start and end of input string. 1.83 + * @stable ICU 2.4 */ 1.84 + UREGEX_MULTILINE = 8, 1.85 + 1.86 + /** Unix-only line endings. 1.87 + * When this mode is enabled, only \\u000a is recognized as a line ending 1.88 + * in the behavior of ., ^, and $. 1.89 + * @stable ICU 4.0 1.90 + */ 1.91 + UREGEX_UNIX_LINES = 1, 1.92 + 1.93 + /** Unicode word boundaries. 1.94 + * If set, \b uses the Unicode TR 29 definition of word boundaries. 1.95 + * Warning: Unicode word boundaries are quite different from 1.96 + * traditional regular expression word boundaries. See 1.97 + * http://unicode.org/reports/tr29/#Word_Boundaries 1.98 + * @stable ICU 2.8 1.99 + */ 1.100 + UREGEX_UWORD = 256, 1.101 + 1.102 + /** Error on Unrecognized backslash escapes. 1.103 + * If set, fail with an error on patterns that contain 1.104 + * backslash-escaped ASCII letters without a known special 1.105 + * meaning. If this flag is not set, these 1.106 + * escaped letters represent themselves. 1.107 + * @stable ICU 4.0 1.108 + */ 1.109 + UREGEX_ERROR_ON_UNKNOWN_ESCAPES = 512 1.110 + 1.111 +} URegexpFlag; 1.112 + 1.113 +/** 1.114 + * Open (compile) an ICU regular expression. Compiles the regular expression in 1.115 + * string form into an internal representation using the specified match mode flags. 
1.116 + * The resulting regular expression handle can then be used to perform various 1.117 + * matching operations. 1.118 + * 1.119 + * 1.120 + * @param pattern The Regular Expression pattern to be compiled. 1.121 + * @param patternLength The length of the pattern, or -1 if the pattern is 1.122 + * NUL terminated. 1.123 + * @param flags Flags that alter the default matching behavior for 1.124 + * the regular expression, UREGEX_CASE_INSENSITIVE, for 1.125 + * example. For default behavior, set this parameter to zero. 1.126 + * See <code>enum URegexpFlag</code>. All desired flags 1.127 + * are bitwise-ORed together. 1.128 + * @param pe Receives the position (line and column numbers) of any syntax 1.129 + * error within the source regular expression string. If this 1.130 + * information is not wanted, pass NULL for this parameter. 1.131 + * @param status Receives error detected by this function. 1.132 + * @stable ICU 3.0 1.133 + * 1.134 + */ 1.135 +U_STABLE URegularExpression * U_EXPORT2 1.136 +uregex_open( const UChar *pattern, 1.137 + int32_t patternLength, 1.138 + uint32_t flags, 1.139 + UParseError *pe, 1.140 + UErrorCode *status); 1.141 + 1.142 +/** 1.143 + * Open (compile) an ICU regular expression. Compiles the regular expression in 1.144 + * string form into an internal representation using the specified match mode flags. 1.145 + * The resulting regular expression handle can then be used to perform various 1.146 + * matching operations. 1.147 + * <p> 1.148 + * The contents of the pattern UText will be extracted and saved. Ownership of the 1.149 + * UText struct itself remains with the caller. This is to match the behavior of 1.150 + * uregex_open(). 1.151 + * 1.152 + * @param pattern The Regular Expression pattern to be compiled. 1.153 + * @param flags Flags that alter the default matching behavior for 1.154 + * the regular expression, UREGEX_CASE_INSENSITIVE, for 1.155 + * example. For default behavior, set this parameter to zero. 
1.156 + * See <code>enum URegexpFlag</code>. All desired flags 1.157 + * are bitwise-ORed together. 1.158 + * @param pe Receives the position (line and column numbers) of any syntax 1.159 + * error within the source regular expression string. If this 1.160 + * information is not wanted, pass NULL for this parameter. 1.161 + * @param status Receives error detected by this function. 1.162 + * 1.163 + * @stable ICU 4.6 1.164 + */ 1.165 +U_STABLE URegularExpression * U_EXPORT2 1.166 +uregex_openUText(UText *pattern, 1.167 + uint32_t flags, 1.168 + UParseError *pe, 1.169 + UErrorCode *status); 1.170 + 1.171 +/** 1.172 + * Open (compile) an ICU regular expression. The resulting regular expression 1.173 + * handle can then be used to perform various matching operations. 1.174 + * <p> 1.175 + * This function is the same as uregex_open, except that the pattern 1.176 + * is supplied as an 8 bit char * string in the default code page. 1.177 + * 1.178 + * @param pattern The Regular Expression pattern to be compiled, 1.179 + * NUL terminated. 1.180 + * @param flags Flags that alter the default matching behavior for 1.181 + * the regular expression, UREGEX_CASE_INSENSITIVE, for 1.182 + * example. For default behavior, set this parameter to zero. 1.183 + * See <code>enum URegexpFlag</code>. All desired flags 1.184 + * are bitwise-ORed together. 1.185 + * @param pe Receives the position (line and column numbers) of any syntax 1.186 + * error within the source regular expression string. If this 1.187 + * information is not wanted, pass NULL for this parameter. 1.188 + * @param status Receives errors detected by this function. 1.189 + * @return The URegularExpression object representing the compiled 1.190 + * pattern. 
1.191 + * 1.192 + * @stable ICU 3.0 1.193 + */ 1.194 +#if !UCONFIG_NO_CONVERSION 1.195 +U_STABLE URegularExpression * U_EXPORT2 1.196 +uregex_openC( const char *pattern, 1.197 + uint32_t flags, 1.198 + UParseError *pe, 1.199 + UErrorCode *status); 1.200 +#endif 1.201 + 1.202 + 1.203 + 1.204 +/** 1.205 + * Close the regular expression, recovering all resources (memory) it 1.206 + * was holding. 1.207 + * 1.208 + * @param regexp The regular expression to be closed. 1.209 + * @stable ICU 3.0 1.210 + */ 1.211 +U_STABLE void U_EXPORT2 1.212 +uregex_close(URegularExpression *regexp); 1.213 + 1.214 +#if U_SHOW_CPLUSPLUS_API 1.215 + 1.216 +U_NAMESPACE_BEGIN 1.217 + 1.218 +/** 1.219 + * \class LocalURegularExpressionPointer 1.220 + * "Smart pointer" class, closes a URegularExpression via uregex_close(). 1.221 + * For most methods see the LocalPointerBase base class. 1.222 + * 1.223 + * @see LocalPointerBase 1.224 + * @see LocalPointer 1.225 + * @stable ICU 4.4 1.226 + */ 1.227 +U_DEFINE_LOCAL_OPEN_POINTER(LocalURegularExpressionPointer, URegularExpression, uregex_close); 1.228 + 1.229 +U_NAMESPACE_END 1.230 + 1.231 +#endif 1.232 + 1.233 +/** 1.234 + * Make a copy of a compiled regular expression. Cloning a regular 1.235 + * expression is faster than opening a second instance from the source 1.236 + * form of the expression, and requires less memory. 1.237 + * <p> 1.238 + * Note that the current input string and the position of any matched text 1.239 + * within it are not cloned; only the pattern itself and the 1.240 + * match mode flags are copied. 1.241 + * <p> 1.242 + * Cloning can be particularly useful to threaded applications that perform 1.243 + * multiple match operations in parallel. Each concurrent RE 1.244 + * operation requires its own instance of a URegularExpression. 1.245 + * 1.246 + * @param regexp The compiled regular expression to be cloned. 
1.247 + * @param status Receives indication of any errors encountered 1.248 + * @return the cloned copy of the compiled regular expression. 1.249 + * @stable ICU 3.0 1.250 + */ 1.251 +U_STABLE URegularExpression * U_EXPORT2 1.252 +uregex_clone(const URegularExpression *regexp, UErrorCode *status); 1.253 + 1.254 +/** 1.255 + * Returns a pointer to the source form of the pattern for this regular expression. 1.256 + * This function will work even if the pattern was originally specified as a UText. 1.257 + * 1.258 + * @param regexp The compiled regular expression. 1.259 + * @param patLength This output parameter will be set to the length of the 1.260 + * pattern string. A NULL pointer may be used here if the 1.261 + * pattern length is not needed, as would be the case if 1.262 + * the pattern is known in advance to be a NUL terminated 1.263 + * string. 1.264 + * @param status Receives errors detected by this function. 1.265 + * @return a pointer to the pattern string. The storage for the string is 1.266 + * owned by the regular expression object, and must not be 1.267 + * altered or deleted by the application. The returned string 1.268 + * will remain valid until the regular expression is closed. 1.269 + * @stable ICU 3.0 1.270 + */ 1.271 +U_STABLE const UChar * U_EXPORT2 1.272 +uregex_pattern(const URegularExpression *regexp, 1.273 + int32_t *patLength, 1.274 + UErrorCode *status); 1.275 + 1.276 +/** 1.277 + * Returns the source text of the pattern for this regular expression. 1.278 + * This function will work even if the pattern was originally specified as a UChar string. 1.279 + * 1.280 + * @param regexp The compiled regular expression. 1.281 + * @param status Receives errors detected by this function. 1.282 + * @return the pattern text. The storage for the text is owned by the regular expression 1.283 + * object, and must not be altered or deleted. 
1.284 + * 1.285 + * @stable ICU 4.6 1.286 + */ 1.287 +U_STABLE UText * U_EXPORT2 1.288 +uregex_patternUText(const URegularExpression *regexp, 1.289 + UErrorCode *status); 1.290 + 1.291 +/** 1.292 + * Get the match mode flags that were specified when compiling this regular expression. 1.293 + * @param status Receives errors detected by this function. 1.294 + * @param regexp The compiled regular expression. 1.295 + * @return The match mode flags 1.296 + * @see URegexpFlag 1.297 + * @stable ICU 3.0 1.298 + */ 1.299 +U_STABLE int32_t U_EXPORT2 1.300 +uregex_flags(const URegularExpression *regexp, 1.301 + UErrorCode *status); 1.302 + 1.303 + 1.304 +/** 1.305 + * Set the subject text string upon which the regular expression will look for matches. 1.306 + * This function may be called any number of times, allowing the regular 1.307 + * expression pattern to be applied to different strings. 1.308 + * <p> 1.309 + * Regular expression matching operations work directly on the application's 1.310 + * string data. No copy is made. The subject string data must not be 1.311 + * altered after calling this function until after all regular expression 1.312 + * operations involving this string data are completed. 1.313 + * <p> 1.314 + * Zero length strings are permitted. In this case, no subsequent match 1.315 + * operation will dereference the text string pointer. 1.316 + * 1.317 + * @param regexp The compiled regular expression. 1.318 + * @param text The subject text string. 1.319 + * @param textLength The length of the subject text, or -1 if the string 1.320 + * is NUL terminated. 1.321 + * @param status Receives errors detected by this function. 1.322 + * @stable ICU 3.0 1.323 + */ 1.324 +U_STABLE void U_EXPORT2 1.325 +uregex_setText(URegularExpression *regexp, 1.326 + const UChar *text, 1.327 + int32_t textLength, 1.328 + UErrorCode *status); 1.329 + 1.330 + 1.331 +/** 1.332 + * Set the subject text string upon which the regular expression will look for matches. 
1.333 + * This function may be called any number of times, allowing the regular 1.334 + * expression pattern to be applied to different strings. 1.335 + * <p> 1.336 + * Regular expression matching operations work directly on the application's 1.337 + * string data; only a shallow clone is made. The subject string data must not be 1.338 + * altered after calling this function until after all regular expression 1.339 + * operations involving this string data are completed. 1.340 + * 1.341 + * @param regexp The compiled regular expression. 1.342 + * @param text The subject text string. 1.343 + * @param status Receives errors detected by this function. 1.344 + * 1.345 + * @stable ICU 4.6 1.346 + */ 1.347 +U_STABLE void U_EXPORT2 1.348 +uregex_setUText(URegularExpression *regexp, 1.349 + UText *text, 1.350 + UErrorCode *status); 1.351 + 1.352 +/** 1.353 + * Get the subject text that is currently associated with this 1.354 + * regular expression object. If the input was supplied using uregex_setText(), 1.355 + * that pointer will be returned. Otherwise, the characters in the input will 1.356 + * be extracted to a buffer and returned. In either case, ownership remains 1.357 + * with the regular expression object. 1.358 + * 1.359 + * This function will work even if the input was originally specified as a UText. 1.360 + * 1.361 + * @param regexp The compiled regular expression. 1.362 + * @param textLength The length of the string is returned in this output parameter. 1.363 + * A NULL pointer may be used here if the 1.364 + * text length is not needed, as would be the case if 1.365 + * the text is known in advance to be a NUL terminated 1.366 + * string. 1.367 + * @param status Receives errors detected by this function. 1.368 + * @return Pointer to the subject text string currently associated with 1.369 + * this regular expression. 
1.370 + * @stable ICU 3.0 1.371 + */ 1.372 +U_STABLE const UChar * U_EXPORT2 1.373 +uregex_getText(URegularExpression *regexp, 1.374 + int32_t *textLength, 1.375 + UErrorCode *status); 1.376 + 1.377 +/** 1.378 + * Get the subject text that is currently associated with this 1.379 + * regular expression object. 1.380 + * 1.381 + * This function will work even if the input was originally specified as a UChar string. 1.382 + * 1.383 + * @param regexp The compiled regular expression. 1.384 + * @param dest A mutable UText in which to store the current input. 1.385 + * If NULL, a new UText will be created as an immutable shallow clone 1.386 + * of the actual input string. 1.387 + * @param status Receives errors detected by this function. 1.388 + * @return The subject text currently associated with this regular expression. 1.389 + * If a pre-allocated UText was provided, it will always be used and returned. 1.390 + * 1.391 + * @stable ICU 4.6 1.392 + */ 1.393 +U_STABLE UText * U_EXPORT2 1.394 +uregex_getUText(URegularExpression *regexp, 1.395 + UText *dest, 1.396 + UErrorCode *status); 1.397 + 1.398 +/** 1.399 + * Set the subject text string upon which the regular expression is looking for matches 1.400 + * without changing any other aspect of the matching state. 1.401 + * The new and previous text strings must have the same content. 1.402 + * 1.403 + * This function is intended for use in environments where ICU is operating on 1.404 + * strings that may move around in memory. It provides a mechanism for notifying 1.405 + * ICU that the string has been relocated, and providing a new UText to access the 1.406 + * string in its new position. 1.407 + * 1.408 + * Note that the regular expression implementation never copies the underlying text 1.409 + * of a string being matched, but always operates directly on the original text 1.410 + * provided by the user. Refreshing simply drops the references to the old text 1.411 + * and replaces them with references to the new. 
1.412 + * 1.413 + * Caution: this function is normally used only by very specialized 1.414 + * system-level code. One example use case is with garbage collection 1.415 + * that moves the text in memory. 1.416 + * 1.417 + * @param regexp The compiled regular expression. 1.418 + * @param text The new (moved) text string. 1.419 + * @param status Receives errors detected by this function. 1.420 + * 1.421 + * @stable ICU 4.8 1.422 + */ 1.423 +U_STABLE void U_EXPORT2 1.424 +uregex_refreshUText(URegularExpression *regexp, 1.425 + UText *text, 1.426 + UErrorCode *status); 1.427 + 1.428 +/** 1.429 + * Attempts to match the input string against the pattern. 1.430 + * To succeed, the match must extend to the end of the string, 1.431 + * or cover the complete match region. 1.432 + * 1.433 + * If startIndex >= zero the match operation starts at the specified 1.434 + * index and must extend to the end of the input string. Any region 1.435 + * that has been specified is reset. 1.436 + * 1.437 + * If startIndex == -1 the match must cover the input region, or the entire 1.438 + * input string if no region has been set. This directly corresponds to 1.439 + * Matcher.matches() in Java 1.440 + * 1.441 + * @param regexp The compiled regular expression. 1.442 + * @param startIndex The input string (native) index at which to begin matching, or -1 1.443 + * to match the input Region. 1.444 + * @param status Receives errors detected by this function. 1.445 + * @return TRUE if there is a match 1.446 + * @stable ICU 3.0 1.447 + */ 1.448 +U_STABLE UBool U_EXPORT2 1.449 +uregex_matches(URegularExpression *regexp, 1.450 + int32_t startIndex, 1.451 + UErrorCode *status); 1.452 + 1.453 +/** 1.454 + * 64bit version of uregex_matches. 1.455 + * Attempts to match the input string against the pattern. 1.456 + * To succeed, the match must extend to the end of the string, 1.457 + * or cover the complete match region. 
1.458 + * 1.459 + * If startIndex >= zero the match operation starts at the specified 1.460 + * index and must extend to the end of the input string. Any region 1.461 + * that has been specified is reset. 1.462 + * 1.463 + * If startIndex == -1 the match must cover the input region, or the entire 1.464 + * input string if no region has been set. This directly corresponds to 1.465 + * Matcher.matches() in Java 1.466 + * 1.467 + * @param regexp The compiled regular expression. 1.468 + * @param startIndex The input string (native) index at which to begin matching, or -1 1.469 + * to match the input Region. 1.470 + * @param status Receives errors detected by this function. 1.471 + * @return TRUE if there is a match 1.472 + * @stable ICU 4.6 1.473 + */ 1.474 +U_STABLE UBool U_EXPORT2 1.475 +uregex_matches64(URegularExpression *regexp, 1.476 + int64_t startIndex, 1.477 + UErrorCode *status); 1.478 + 1.479 +/** 1.480 + * Attempts to match the input string, starting from the specified index, against the pattern. 1.481 + * The match may be of any length, and is not required to extend to the end 1.482 + * of the input string. Contrast with uregex_matches(). 1.483 + * 1.484 + * <p>If startIndex is >= 0 any input region that was set for this 1.485 + * URegularExpression is reset before the operation begins. 1.486 + * 1.487 + * <p>If the specified starting index == -1 the match begins at the start of the input 1.488 + * region, or at the start of the full string if no region has been specified. 1.489 + * This corresponds directly with Matcher.lookingAt() in Java. 1.490 + * 1.491 + * <p>If the match succeeds then more information can be obtained via the 1.492 + * <code>uregex_start()</code>, <code>uregex_end()</code>, 1.493 + * and <code>uregex_group()</code> functions.</p> 1.494 + * 1.495 + * @param regexp The compiled regular expression. 
1.496 + * @param startIndex The input string (native) index at which to begin matching, or 1.497 + * -1 to match the Input Region 1.498 + * @param status A reference to a UErrorCode to receive any errors. 1.499 + * @return TRUE if there is a match. 1.500 + * @stable ICU 3.0 1.501 + */ 1.502 +U_STABLE UBool U_EXPORT2 1.503 +uregex_lookingAt(URegularExpression *regexp, 1.504 + int32_t startIndex, 1.505 + UErrorCode *status); 1.506 + 1.507 +/** 1.508 + * 64bit version of uregex_lookingAt. 1.509 + * Attempts to match the input string, starting from the specified index, against the pattern. 1.510 + * The match may be of any length, and is not required to extend to the end 1.511 + * of the input string. Contrast with uregex_matches(). 1.512 + * 1.513 + * <p>If startIndex is >= 0 any input region that was set for this 1.514 + * URegularExpression is reset before the operation begins. 1.515 + * 1.516 + * <p>If the specified starting index == -1 the match begins at the start of the input 1.517 + * region, or at the start of the full string if no region has been specified. 1.518 + * This corresponds directly with Matcher.lookingAt() in Java. 1.519 + * 1.520 + * <p>If the match succeeds then more information can be obtained via the 1.521 + * <code>uregex_start()</code>, <code>uregex_end()</code>, 1.522 + * and <code>uregex_group()</code> functions.</p> 1.523 + * 1.524 + * @param regexp The compiled regular expression. 1.525 + * @param startIndex The input string (native) index at which to begin matching, or 1.526 + * -1 to match the Input Region 1.527 + * @param status A reference to a UErrorCode to receive any errors. 1.528 + * @return TRUE if there is a match. 1.529 + * @stable ICU 4.6 1.530 + */ 1.531 +U_STABLE UBool U_EXPORT2 1.532 +uregex_lookingAt64(URegularExpression *regexp, 1.533 + int64_t startIndex, 1.534 + UErrorCode *status); 1.535 + 1.536 +/** 1.537 + * Find the first matching substring of the input string that matches the pattern. 
1.538 + * If startIndex is >= zero the search for a match begins at the specified index, 1.539 + * and any match region is reset. This corresponds directly with 1.540 + * Matcher.find(startIndex) in Java. 1.541 + * 1.542 + * If startIndex == -1 the search begins at the start of the input region, 1.543 + * or at the start of the full string if no region has been specified. 1.544 + * 1.545 + * If a match is found, <code>uregex_start(), uregex_end()</code>, and 1.546 + * <code>uregex_group()</code> will provide more information regarding the match. 1.547 + * 1.548 + * @param regexp The compiled regular expression. 1.549 + * @param startIndex The position (native) in the input string to begin the search, or 1.550 + * -1 to search within the Input Region. 1.551 + * @param status A reference to a UErrorCode to receive any errors. 1.552 + * @return TRUE if a match is found. 1.553 + * @stable ICU 3.0 1.554 + */ 1.555 +U_STABLE UBool U_EXPORT2 1.556 +uregex_find(URegularExpression *regexp, 1.557 + int32_t startIndex, 1.558 + UErrorCode *status); 1.559 + 1.560 +/** 1.561 + * 64bit version of uregex_find. 1.562 + * Find the first matching substring of the input string that matches the pattern. 1.563 + * If startIndex is >= zero the search for a match begins at the specified index, 1.564 + * and any match region is reset. This corresponds directly with 1.565 + * Matcher.find(startIndex) in Java. 1.566 + * 1.567 + * If startIndex == -1 the search begins at the start of the input region, 1.568 + * or at the start of the full string if no region has been specified. 1.569 + * 1.570 + * If a match is found, <code>uregex_start(), uregex_end()</code>, and 1.571 + * <code>uregex_group()</code> will provide more information regarding the match. 1.572 + * 1.573 + * @param regexp The compiled regular expression. 1.574 + * @param startIndex The position (native) in the input string to begin the search, or 1.575 + * -1 to search within the Input Region. 
1.576 + * @param status A reference to a UErrorCode to receive any errors. 1.577 + * @return TRUE if a match is found. 1.578 + * @stable ICU 4.6 1.579 + */ 1.580 +U_STABLE UBool U_EXPORT2 1.581 +uregex_find64(URegularExpression *regexp, 1.582 + int64_t startIndex, 1.583 + UErrorCode *status); 1.584 + 1.585 +/** 1.586 + * Find the next pattern match in the input string. Begin searching 1.587 + * the input at the location following the end of the previous match, 1.588 + * or at the start of the string (or region) if there is no 1.589 + * previous match. If a match is found, <code>uregex_start(), uregex_end()</code>, and 1.590 + * <code>uregex_group()</code> will provide more information regarding the match. 1.591 + * 1.592 + * @param regexp The compiled regular expression. 1.593 + * @param status A reference to a UErrorCode to receive any errors. 1.594 + * @return TRUE if a match is found. 1.595 + * @see uregex_reset 1.596 + * @stable ICU 3.0 1.597 + */ 1.598 +U_STABLE UBool U_EXPORT2 1.599 +uregex_findNext(URegularExpression *regexp, 1.600 + UErrorCode *status); 1.601 + 1.602 +/** 1.603 + * Get the number of capturing groups in this regular expression's pattern. 1.604 + * @param regexp The compiled regular expression. 1.605 + * @param status A reference to a UErrorCode to receive any errors. 1.606 + * @return the number of capture groups 1.607 + * @stable ICU 3.0 1.608 + */ 1.609 +U_STABLE int32_t U_EXPORT2 1.610 +uregex_groupCount(URegularExpression *regexp, 1.611 + UErrorCode *status); 1.612 + 1.613 +/** Extract the string for the specified matching expression or subexpression. 1.614 + * Group #0 is the complete string of matched text. 1.615 + * Group #1 is the text matched by the first set of capturing parentheses. 1.616 + * 1.617 + * @param regexp The compiled regular expression. 1.618 + * @param groupNum The capture group to extract. Group 0 is the complete 1.619 + * match. 
The value of this parameter must be 1.620 + * less than or equal to the number of capture groups in 1.621 + * the pattern. 1.622 + * @param dest Buffer to receive the matching string data 1.623 + * @param destCapacity Capacity of the dest buffer. 1.624 + * @param status A reference to a UErrorCode to receive any errors. 1.625 + * @return Length of matching data, 1.626 + * or -1 if no applicable match. 1.627 + * @stable ICU 3.0 1.628 + */ 1.629 +U_STABLE int32_t U_EXPORT2 1.630 +uregex_group(URegularExpression *regexp, 1.631 + int32_t groupNum, 1.632 + UChar *dest, 1.633 + int32_t destCapacity, 1.634 + UErrorCode *status); 1.635 + 1.636 +/** Returns a shallow immutable clone of the entire input string. The returned UText current native index 1.637 + * is set to the beginning of the requested capture group. The capture group length is also 1.638 + * returned via groupLength. 1.639 + * Group #0 is the complete string of matched text. 1.640 + * Group #1 is the text matched by the first set of capturing parentheses. 1.641 + * 1.642 + * @param regexp The compiled regular expression. 1.643 + * @param groupNum The capture group to extract. Group 0 is the complete 1.644 + * match. The value of this parameter must be 1.645 + * less than or equal to the number of capture groups in 1.646 + * the pattern. 1.647 + * @param dest A mutable UText in which to store the current input. 1.648 + * If NULL, a new UText will be created as an immutable shallow clone 1.649 + * of the entire input string. 1.650 + * @param groupLength The group length of the desired capture group. 1.651 + * @param status A reference to a UErrorCode to receive any errors. 1.652 + * @return The subject text currently associated with this regular expression. 1.653 + * If a pre-allocated UText was provided, it will always be used and returned. 
1.654 + 1.655 + * 1.656 + * @stable ICU 4.6 1.657 + */ 1.658 +U_STABLE UText * U_EXPORT2 1.659 +uregex_groupUText(URegularExpression *regexp, 1.660 + int32_t groupNum, 1.661 + UText *dest, 1.662 + int64_t *groupLength, 1.663 + UErrorCode *status); 1.664 + 1.665 +#ifndef U_HIDE_INTERNAL_API 1.666 +/** Extract the string for the specified matching expression or subexpression. 1.667 + * Group #0 is the complete string of matched text. 1.668 + * Group #1 is the text matched by the first set of capturing parentheses. 1.669 + * 1.670 + * @param regexp The compiled regular expression. 1.671 + * @param groupNum The capture group to extract. Group 0 is the complete 1.672 + * match. The value of this parameter must be 1.673 + * less than or equal to the number of capture groups in 1.674 + * the pattern. 1.675 + * @param dest Mutable UText to receive the matching string data. 1.676 + * If NULL, a new UText will be created (which may not be mutable). 1.677 + * @param status A reference to a UErrorCode to receive any errors. 1.678 + * @return The matching string data. If a pre-allocated UText was provided, 1.679 + * it will always be used and returned. 1.680 + * 1.681 + * @internal ICU 4.4 technology preview 1.682 + */ 1.683 +U_INTERNAL UText * U_EXPORT2 1.684 +uregex_groupUTextDeep(URegularExpression *regexp, 1.685 + int32_t groupNum, 1.686 + UText *dest, 1.687 + UErrorCode *status); 1.688 +#endif /* U_HIDE_INTERNAL_API */ 1.689 + 1.690 +/** 1.691 + * Returns the index in the input string of the start of the text matched by the 1.692 + * specified capture group during the previous match operation. Return -1 if 1.693 + * the capture group was not part of the last match. 1.694 + * Group #0 refers to the complete range of matched text. 1.695 + * Group #1 refers to the text matched by the first set of capturing parentheses. 1.696 + * 1.697 + * @param regexp The compiled regular expression. 
1.698 + * @param groupNum The capture group number 1.699 + * @param status A reference to a UErrorCode to receive any errors. 1.700 + * @return the starting (native) position in the input of the text matched 1.701 + * by the specified group. 1.702 + * @stable ICU 3.0 1.703 + */ 1.704 +U_STABLE int32_t U_EXPORT2 1.705 +uregex_start(URegularExpression *regexp, 1.706 + int32_t groupNum, 1.707 + UErrorCode *status); 1.708 + 1.709 +/** 1.710 + * 64bit version of uregex_start. 1.711 + * Returns the index in the input string of the start of the text matched by the 1.712 + * specified capture group during the previous match operation. Return -1 if 1.713 + * the capture group was not part of the last match. 1.714 + * Group #0 refers to the complete range of matched text. 1.715 + * Group #1 refers to the text matched by the first set of capturing parentheses. 1.716 + * 1.717 + * @param regexp The compiled regular expression. 1.718 + * @param groupNum The capture group number 1.719 + * @param status A reference to a UErrorCode to receive any errors. 1.720 + * @return the starting (native) position in the input of the text matched 1.721 + * by the specified group. 1.722 + * @stable ICU 4.6 1.723 + */ 1.724 +U_STABLE int64_t U_EXPORT2 1.725 +uregex_start64(URegularExpression *regexp, 1.726 + int32_t groupNum, 1.727 + UErrorCode *status); 1.728 + 1.729 +/** 1.730 + * Returns the index in the input string of the position following the end 1.731 + * of the text matched by the specified capture group. 1.732 + * Return -1 if the capture group was not part of the last match. 1.733 + * Group #0 refers to the complete range of matched text. 1.734 + * Group #1 refers to the text matched by the first set of capturing parentheses. 1.735 + * 1.736 + * @param regexp The compiled regular expression. 1.737 + * @param groupNum The capture group number 1.738 + * @param status A reference to a UErrorCode to receive any errors. 
1.739 + * @return the (native) index of the position following the last matched character. 1.740 + * @stable ICU 3.0 1.741 + */ 1.742 +U_STABLE int32_t U_EXPORT2 1.743 +uregex_end(URegularExpression *regexp, 1.744 + int32_t groupNum, 1.745 + UErrorCode *status); 1.746 + 1.747 +/** 1.748 + * 64bit version of uregex_end. 1.749 + * Returns the index in the input string of the position following the end 1.750 + * of the text matched by the specified capture group. 1.751 + * Return -1 if the capture group was not part of the last match. 1.752 + * Group #0 refers to the complete range of matched text. 1.753 + * Group #1 refers to the text matched by the first set of capturing parentheses. 1.754 + * 1.755 + * @param regexp The compiled regular expression. 1.756 + * @param groupNum The capture group number 1.757 + * @param status A reference to a UErrorCode to receive any errors. 1.758 + * @return the (native) index of the position following the last matched character. 1.759 + * @stable ICU 4.6 1.760 + */ 1.761 +U_STABLE int64_t U_EXPORT2 1.762 +uregex_end64(URegularExpression *regexp, 1.763 + int32_t groupNum, 1.764 + UErrorCode *status); 1.765 + 1.766 +/** 1.767 + * Reset any saved state from the previous match. Has the effect of 1.768 + * causing uregex_findNext to begin at the specified index, and causing 1.769 + * uregex_start(), uregex_end() and uregex_group() to return an error 1.770 + * indicating that there is no match information available. Clears any 1.771 + * match region that may have been set. 1.772 + * 1.773 + * @param regexp The compiled regular expression. 1.774 + * @param index The position (native) in the text at which a 1.775 + * uregex_findNext() should begin searching. 1.776 + * @param status A reference to a UErrorCode to receive any errors. 
1.777 + * @stable ICU 3.0 1.778 + */ 1.779 +U_STABLE void U_EXPORT2 1.780 +uregex_reset(URegularExpression *regexp, 1.781 + int32_t index, 1.782 + UErrorCode *status); 1.783 + 1.784 +/** 1.785 + * 64bit version of uregex_reset. 1.786 + * Reset any saved state from the previous match. Has the effect of 1.787 + * causing uregex_findNext to begin at the specified index, and causing 1.788 + * uregex_start(), uregex_end() and uregex_group() to return an error 1.789 + * indicating that there is no match information available. Clears any 1.790 + * match region that may have been set. 1.791 + * 1.792 + * @param regexp The compiled regular expression. 1.793 + * @param index The position (native) in the text at which a 1.794 + * uregex_findNext() should begin searching. 1.795 + * @param status A reference to a UErrorCode to receive any errors. 1.796 + * @stable ICU 4.6 1.797 + */ 1.798 +U_STABLE void U_EXPORT2 1.799 +uregex_reset64(URegularExpression *regexp, 1.800 + int64_t index, 1.801 + UErrorCode *status); 1.802 + 1.803 +/** 1.804 + * Sets the limits of the matching region for this URegularExpression. 1.805 + * The region is the part of the input string that will be considered when matching. 1.806 + * Invoking this method resets any saved state from the previous match, 1.807 + * then sets the region to start at the index specified by the start parameter 1.808 + * and end at the index specified by the end parameter. 1.809 + * 1.810 + * Depending on the transparency and anchoring being used (see useTransparentBounds 1.811 + * and useAnchoringBounds), certain constructs such as anchors may behave differently 1.812 + * at or around the boundaries of the region 1.813 + * 1.814 + * The function will fail if start is greater than limit, or if either index 1.815 + * is less than zero or greater than the length of the string being matched. 1.816 + * 1.817 + * @param regexp The compiled regular expression. 1.818 + * @param regionStart The (native) index to begin searches at. 
1.819 + * @param regionLimit The (native) index to end searches at (exclusive). 1.820 + * @param status A pointer to a UErrorCode to receive any errors. 1.821 + * @stable ICU 4.0 1.822 + */ 1.823 +U_STABLE void U_EXPORT2 1.824 +uregex_setRegion(URegularExpression *regexp, 1.825 + int32_t regionStart, 1.826 + int32_t regionLimit, 1.827 + UErrorCode *status); 1.828 + 1.829 +/** 1.830 + * 64bit version of uregex_setRegion. 1.831 + * Sets the limits of the matching region for this URegularExpression. 1.832 + * The region is the part of the input string that will be considered when matching. 1.833 + * Invoking this method resets any saved state from the previous match, 1.834 + * then sets the region to start at the index specified by the start parameter 1.835 + * and end at the index specified by the end parameter. 1.836 + * 1.837 + * Depending on the transparency and anchoring being used (see useTransparentBounds 1.838 + * and useAnchoringBounds), certain constructs such as anchors may behave differently 1.839 + * at or around the boundaries of the region 1.840 + * 1.841 + * The function will fail if start is greater than limit, or if either index 1.842 + * is less than zero or greater than the length of the string being matched. 1.843 + * 1.844 + * @param regexp The compiled regular expression. 1.845 + * @param regionStart The (native) index to begin searches at. 1.846 + * @param regionLimit The (native) index to end searches at (exclusive). 1.847 + * @param status A pointer to a UErrorCode to receive any errors. 1.848 + * @stable ICU 4.6 1.849 + */ 1.850 +U_STABLE void U_EXPORT2 1.851 +uregex_setRegion64(URegularExpression *regexp, 1.852 + int64_t regionStart, 1.853 + int64_t regionLimit, 1.854 + UErrorCode *status); 1.855 + 1.856 +/** 1.857 + * Set the matching region and the starting index for subsequent matches 1.858 + * in a single operation. 
1.859 + * This is useful because the usual function for setting the starting 1.860 + * index, uregex_reset(), also resets any region limits. 1.861 + * 1.862 + * @param regexp The compiled regular expression. 1.863 + * @param regionStart The (native) index to begin searches at. 1.864 + * @param regionLimit The (native) index to end searches at (exclusive). 1.865 + * @param startIndex The index in the input text at which the next 1.866 + * match operation should begin. 1.867 + * @param status A pointer to a UErrorCode to receive any errors. 1.868 + * @stable ICU 4.6 1.869 + */ 1.870 +U_STABLE void U_EXPORT2 1.871 +uregex_setRegionAndStart(URegularExpression *regexp, 1.872 + int64_t regionStart, 1.873 + int64_t regionLimit, 1.874 + int64_t startIndex, 1.875 + UErrorCode *status); 1.876 + 1.877 +/** 1.878 + * Reports the start index of the matching region. Any matches found are limited to 1.879 + * the region bounded by regionStart (inclusive) and regionEnd (exclusive). 1.880 + * 1.881 + * @param regexp The compiled regular expression. 1.882 + * @param status A pointer to a UErrorCode to receive any errors. 1.883 + * @return The starting (native) index of this matcher's region. 1.884 + * @stable ICU 4.0 1.885 + */ 1.886 +U_STABLE int32_t U_EXPORT2 1.887 +uregex_regionStart(const URegularExpression *regexp, 1.888 + UErrorCode *status); 1.889 + 1.890 +/** 1.891 + * 64bit version of uregex_regionStart. 1.892 + * Reports the start index of the matching region. Any matches found are limited to 1.893 + * the region bounded by regionStart (inclusive) and regionEnd (exclusive). 1.894 + * 1.895 + * @param regexp The compiled regular expression. 1.896 + * @param status A pointer to a UErrorCode to receive any errors. 1.897 + * @return The starting (native) index of this matcher's region.
1.898 + * @stable ICU 4.6 1.899 + */ 1.900 +U_STABLE int64_t U_EXPORT2 1.901 +uregex_regionStart64(const URegularExpression *regexp, 1.902 + UErrorCode *status); 1.903 + 1.904 +/** 1.905 + * Reports the end index (exclusive) of the matching region for this URegularExpression. 1.906 + * Any matches found are limited to the region bounded by regionStart (inclusive) 1.907 + * and regionEnd (exclusive). 1.908 + * 1.909 + * @param regexp The compiled regular expression. 1.910 + * @param status A pointer to a UErrorCode to receive any errors. 1.911 + * @return The ending point (native) of this matcher's region. 1.912 + * @stable ICU 4.0 1.913 + */ 1.914 +U_STABLE int32_t U_EXPORT2 1.915 +uregex_regionEnd(const URegularExpression *regexp, 1.916 + UErrorCode *status); 1.917 + 1.918 +/** 1.919 + * 64bit version of uregex_regionEnd. 1.920 + * Reports the end index (exclusive) of the matching region for this URegularExpression. 1.921 + * Any matches found are limited to the region bounded by regionStart (inclusive) 1.922 + * and regionEnd (exclusive). 1.923 + * 1.924 + * @param regexp The compiled regular expression. 1.925 + * @param status A pointer to a UErrorCode to receive any errors. 1.926 + * @return The ending point (native) of this matcher's region. 1.927 + * @stable ICU 4.6 1.928 + */ 1.929 +U_STABLE int64_t U_EXPORT2 1.930 +uregex_regionEnd64(const URegularExpression *regexp, 1.931 + UErrorCode *status); 1.932 + 1.933 +/** 1.934 + * Queries the transparency of region bounds for this URegularExpression. 1.935 + * See useTransparentBounds for a description of transparent and opaque bounds. 1.936 + * By default, matching boundaries are opaque. 1.937 + * 1.938 + * @param regexp The compiled regular expression. 1.939 + * @param status A pointer to a UErrorCode to receive any errors. 1.940 + * @return TRUE if this matcher is using transparent bounds, FALSE if it is using opaque bounds.
1.941 + * @stable ICU 4.0 1.942 + */ 1.943 +U_STABLE UBool U_EXPORT2 1.944 +uregex_hasTransparentBounds(const URegularExpression *regexp, 1.945 + UErrorCode *status); 1.946 + 1.947 + 1.948 +/** 1.949 + * Sets the transparency of region bounds for this URegularExpression. 1.950 + * Invoking this function with an argument of TRUE will set matches to use transparent bounds. 1.951 + * If the boolean argument is FALSE, then opaque bounds will be used. 1.952 + * 1.953 + * Using transparent bounds, the boundaries of the matching region are transparent 1.954 + * to lookahead, lookbehind, and boundary matching constructs. Those constructs can 1.955 + * see text beyond the boundaries of the region while checking for a match. 1.956 + * 1.957 + * With opaque bounds, no text outside of the matching region is visible to lookahead, 1.958 + * lookbehind, and boundary matching constructs. 1.959 + * 1.960 + * By default, opaque bounds are used. 1.961 + * 1.962 + * @param regexp The compiled regular expression. 1.963 + * @param b TRUE for transparent bounds; FALSE for opaque bounds 1.964 + * @param status A pointer to a UErrorCode to receive any errors. 1.965 + * @stable ICU 4.0 1.966 + **/ 1.967 +U_STABLE void U_EXPORT2 1.968 +uregex_useTransparentBounds(URegularExpression *regexp, 1.969 + UBool b, 1.970 + UErrorCode *status); 1.971 + 1.972 + 1.973 +/** 1.974 + * Return true if this URegularExpression is using anchoring bounds. 1.975 + * By default, anchoring region bounds are used. 1.976 + * 1.977 + * @param regexp The compiled regular expression. 1.978 + * @param status A pointer to a UErrorCode to receive any errors. 1.979 + * @return TRUE if this matcher is using anchoring bounds. 1.980 + * @stable ICU 4.0 1.981 + */ 1.982 +U_STABLE UBool U_EXPORT2 1.983 +uregex_hasAnchoringBounds(const URegularExpression *regexp, 1.984 + UErrorCode *status); 1.985 + 1.986 + 1.987 +/** 1.988 + * Set whether this URegularExpression is using Anchoring Bounds for its region. 
1.989 + * With anchoring bounds, pattern anchors such as ^ and $ will match at the start 1.990 + * and end of the region. Without Anchoring Bounds, anchors will only match at 1.991 + * the positions they would in the complete text. 1.992 + * 1.993 + * Anchoring Bounds are the default for regions. 1.994 + * 1.995 + * @param regexp The compiled regular expression. 1.996 + * @param b TRUE to enable anchoring bounds; FALSE to disable them. 1.997 + * @param status A pointer to a UErrorCode to receive any errors. 1.998 + * @stable ICU 4.0 1.999 + */ 1.1000 +U_STABLE void U_EXPORT2 1.1001 +uregex_useAnchoringBounds(URegularExpression *regexp, 1.1002 + UBool b, 1.1003 + UErrorCode *status); 1.1004 + 1.1005 +/** 1.1006 + * Return TRUE if the most recent matching operation touched the 1.1007 + * end of the text being processed. In this case, additional input text could 1.1008 + * change the results of that match. 1.1009 + * 1.1010 + * @param regexp The compiled regular expression. 1.1011 + * @param status A pointer to a UErrorCode to receive any errors. 1.1012 + * @return TRUE if the most recent match hit the end of input 1.1013 + * @stable ICU 4.0 1.1014 + */ 1.1015 +U_STABLE UBool U_EXPORT2 1.1016 +uregex_hitEnd(const URegularExpression *regexp, 1.1017 + UErrorCode *status); 1.1018 + 1.1019 +/** 1.1020 + * Return TRUE if the most recent match succeeded and additional input could cause 1.1021 + * it to fail. If this function returns false and a match was found, then more input 1.1022 + * might change the match but the match won't be lost. If a match was not found, 1.1023 + * then requireEnd has no meaning. 1.1024 + * 1.1025 + * @param regexp The compiled regular expression. 1.1026 + * @param status A pointer to a UErrorCode to receive any errors. 1.1027 + * @return TRUE if more input could cause the most recent match to no longer match.
1.1028 + * @stable ICU 4.0 1.1029 + */ 1.1030 +U_STABLE UBool U_EXPORT2 1.1031 +uregex_requireEnd(const URegularExpression *regexp, 1.1032 + UErrorCode *status); 1.1033 + 1.1034 + 1.1035 + 1.1036 + 1.1037 + 1.1038 +/** 1.1039 + * Replaces every substring of the input that matches the pattern 1.1040 + * with the given replacement string. This is a convenience function that 1.1041 + * provides a complete find-and-replace-all operation. 1.1042 + * 1.1043 + * This method scans the input string looking for matches of the pattern. 1.1044 + * Input that is not part of any match is copied unchanged to the 1.1045 + * destination buffer. Matched regions are replaced in the output 1.1046 + * buffer by the replacement string. The replacement string may contain 1.1047 + * references to capture groups; these take the form of $1, $2, etc. 1.1048 + * 1.1049 + * @param regexp The compiled regular expression. 1.1050 + * @param replacementText A string containing the replacement text. 1.1051 + * @param replacementLength The length of the replacement string, or 1.1052 + * -1 if it is NUL terminated. 1.1053 + * @param destBuf A (UChar *) buffer that will receive the result. 1.1054 + * @param destCapacity The capacity of the destination buffer. 1.1055 + * @param status A reference to a UErrorCode to receive any errors. 1.1056 + * @return The length of the string resulting from the find 1.1057 + * and replace operation. In the event that the 1.1058 + * destination capacity is inadequate, the return value 1.1059 + * is still the full length of the untruncated string. 1.1060 + * @stable ICU 3.0 1.1061 + */ 1.1062 +U_STABLE int32_t U_EXPORT2 1.1063 +uregex_replaceAll(URegularExpression *regexp, 1.1064 + const UChar *replacementText, 1.1065 + int32_t replacementLength, 1.1066 + UChar *destBuf, 1.1067 + int32_t destCapacity, 1.1068 + UErrorCode *status); 1.1069 + 1.1070 +/** 1.1071 + * Replaces every substring of the input that matches the pattern 1.1072 + * with the given replacement string. 
This is a convenience function that 1.1073 + * provides a complete find-and-replace-all operation. 1.1074 + * 1.1075 + * This method scans the input string looking for matches of the pattern. 1.1076 + * Input that is not part of any match is copied unchanged to the 1.1077 + * destination buffer. Matched regions are replaced in the output 1.1078 + * buffer by the replacement string. The replacement string may contain 1.1079 + * references to capture groups; these take the form of $1, $2, etc. 1.1080 + * 1.1081 + * @param regexp The compiled regular expression. 1.1082 + * @param replacement A string containing the replacement text. 1.1083 + * @param dest A mutable UText that will receive the result. 1.1084 + * If NULL, a new UText will be created (which may not be mutable). 1.1085 + * @param status A reference to a UErrorCode to receive any errors. 1.1086 + * @return A UText containing the results of the find and replace. 1.1087 + * If a pre-allocated UText was provided, it will always be used and returned. 1.1088 + * 1.1089 + * @stable ICU 4.6 1.1090 + */ 1.1091 +U_STABLE UText * U_EXPORT2 1.1092 +uregex_replaceAllUText(URegularExpression *regexp, 1.1093 + UText *replacement, 1.1094 + UText *dest, 1.1095 + UErrorCode *status); 1.1096 + 1.1097 +/** 1.1098 + * Replaces the first substring of the input that matches the pattern 1.1099 + * with the given replacement string. This is a convenience function that 1.1100 + * provides a complete find-and-replace operation. 1.1101 + * 1.1102 + * This method scans the input string looking for a match of the pattern. 1.1103 + * All input that is not part of the match is copied unchanged to the 1.1104 + * destination buffer. The matched region is replaced in the output 1.1105 + * buffer by the replacement string. The replacement string may contain 1.1106 + * references to capture groups; these take the form of $1, $2, etc. 1.1107 + * 1.1108 + * @param regexp The compiled regular expression. 
1.1109 + * @param replacementText A string containing the replacement text. 1.1110 + * @param replacementLength The length of the replacement string, or 1.1111 + * -1 if it is NUL terminated. 1.1112 + * @param destBuf A (UChar *) buffer that will receive the result. 1.1113 + * @param destCapacity The capacity of the destination buffer. 1.1114 + * @param status a reference to a UErrorCode to receive any errors. 1.1115 + * @return The length of the string resulting from the find 1.1116 + * and replace operation. In the event that the 1.1117 + * destination capacity is inadequate, the return value 1.1118 + * is still the full length of the untruncated string. 1.1119 + * @stable ICU 3.0 1.1120 + */ 1.1121 +U_STABLE int32_t U_EXPORT2 1.1122 +uregex_replaceFirst(URegularExpression *regexp, 1.1123 + const UChar *replacementText, 1.1124 + int32_t replacementLength, 1.1125 + UChar *destBuf, 1.1126 + int32_t destCapacity, 1.1127 + UErrorCode *status); 1.1128 + 1.1129 +/** 1.1130 + * Replaces the first substring of the input that matches the pattern 1.1131 + * with the given replacement string. This is a convenience function that 1.1132 + * provides a complete find-and-replace operation. 1.1133 + * 1.1134 + * This method scans the input string looking for a match of the pattern. 1.1135 + * All input that is not part of the match is copied unchanged to the 1.1136 + * destination buffer. The matched region is replaced in the output 1.1137 + * buffer by the replacement string. The replacement string may contain 1.1138 + * references to capture groups; these take the form of $1, $2, etc. 1.1139 + * 1.1140 + * @param regexp The compiled regular expression. 1.1141 + * @param replacement A string containing the replacement text. 1.1142 + * @param dest A mutable UText that will receive the result. 1.1143 + * If NULL, a new UText will be created (which may not be mutable). 1.1144 + * @param status A reference to a UErrorCode to receive any errors. 
1.1145 + * @return A UText containing the results of the find and replace. 1.1146 + * If a pre-allocated UText was provided, it will always be used and returned. 1.1147 + * 1.1148 + * @stable ICU 4.6 1.1149 + */ 1.1150 +U_STABLE UText * U_EXPORT2 1.1151 +uregex_replaceFirstUText(URegularExpression *regexp, 1.1152 + UText *replacement, 1.1153 + UText *dest, 1.1154 + UErrorCode *status); 1.1155 + 1.1156 +/** 1.1157 + * Implements a replace operation intended to be used as part of an 1.1158 + * incremental find-and-replace. 1.1159 + * 1.1160 + * <p>The input string, starting from the end of the previous match and ending at 1.1161 + * the start of the current match, is appended to the destination string. Then the 1.1162 + * replacement string is appended to the output string, 1.1163 + * including handling any substitutions of captured text.</p> 1.1164 + * 1.1165 + * <p>A note on preflight computation of buffer size and error handling: 1.1166 + * Calls to uregex_appendReplacement() and uregex_appendTail() are 1.1167 + * designed to be chained, one after another, with the destination 1.1168 + * buffer pointer and buffer capacity updated after each in preparation 1.1169 + * for the next. If the destination buffer is exhausted partway through such a 1.1170 + * sequence, a U_BUFFER_OVERFLOW_ERROR status will be returned. Normal 1.1171 + * ICU conventions are for a function to perform no action if it is 1.1172 + * called with an error status, but for this one case, uregex_appendReplacement() 1.1173 + * will operate normally so that buffer size computations will complete 1.1174 + * correctly.</p> 1.1175 + * 1.1176 + * <p>For simple, prepackaged, non-incremental find-and-replace 1.1177 + * operations, see replaceFirst() or replaceAll().</p> 1.1178 + * 1.1179 + * @param regexp The regular expression object. 1.1180 + * @param replacementText The string that will replace the matched portion of the 1.1181 + * input string as it is copied to the destination buffer.
1.1182 + * The replacement text may contain references ($1, for 1.1183 + * example) to capture groups from the match. 1.1184 + * @param replacementLength The length of the replacement text string, 1.1185 + * or -1 if the string is NUL terminated. 1.1186 + * @param destBuf The buffer into which the results of the 1.1187 + * find-and-replace are placed. On return, this pointer 1.1188 + * will be updated to refer to the beginning of the 1.1189 + * unused portion of buffer, leaving it in position for 1.1190 + * a subsequent call to this function. 1.1191 + * @param destCapacity The size of the output buffer, On return, this 1.1192 + * parameter will be updated to reflect the space remaining 1.1193 + * unused in the output buffer. 1.1194 + * @param status A reference to a UErrorCode to receive any errors. 1.1195 + * @return The length of the result string. In the event that 1.1196 + * destCapacity is inadequate, the full length of the 1.1197 + * untruncated output string is returned. 1.1198 + * 1.1199 + * @stable ICU 3.0 1.1200 + * 1.1201 + */ 1.1202 +U_STABLE int32_t U_EXPORT2 1.1203 +uregex_appendReplacement(URegularExpression *regexp, 1.1204 + const UChar *replacementText, 1.1205 + int32_t replacementLength, 1.1206 + UChar **destBuf, 1.1207 + int32_t *destCapacity, 1.1208 + UErrorCode *status); 1.1209 + 1.1210 +/** 1.1211 + * Implements a replace operation intended to be used as part of an 1.1212 + * incremental find-and-replace. 1.1213 + * 1.1214 + * <p>The input string, starting from the end of the previous match and ending at 1.1215 + * the start of the current match, is appended to the destination string. Then the 1.1216 + * replacement string is appended to the output string, 1.1217 + * including handling any substitutions of captured text.</p> 1.1218 + * 1.1219 + * <p>For simple, prepackaged, non-incremental find-and-replace 1.1220 + * operations, see replaceFirst() or replaceAll().</p> 1.1221 + * 1.1222 + * @param regexp The regular expression object. 
1.1223 + * @param replacementText The string that will replace the matched portion of the 1.1224 + * input string as it is copied to the destination buffer. 1.1225 + * The replacement text may contain references ($1, for 1.1226 + * example) to capture groups from the match. 1.1227 + * @param dest A mutable UText that will receive the result. Must not be NULL. 1.1228 + * @param status A reference to a UErrorCode to receive any errors. 1.1229 + * 1.1230 + * @stable ICU 4.6 1.1231 + */ 1.1232 +U_STABLE void U_EXPORT2 1.1233 +uregex_appendReplacementUText(URegularExpression *regexp, 1.1234 + UText *replacementText, 1.1235 + UText *dest, 1.1236 + UErrorCode *status); 1.1237 + 1.1238 +/** 1.1239 + * As the final step in a find-and-replace operation, append the remainder 1.1240 + * of the input string, starting at the position following the last match, 1.1241 + * to the destination string. <code>uregex_appendTail()</code> is intended 1.1242 + * to be invoked after one or more invocations of the 1.1243 + * <code>uregex_appendReplacement()</code> function. 1.1244 + * 1.1245 + * @param regexp The regular expression object. This is needed to 1.1246 + * obtain the input string and with the position 1.1247 + * of the last match within it. 1.1248 + * @param destBuf The buffer in which the results of the 1.1249 + * find-and-replace are placed. On return, the pointer 1.1250 + * will be updated to refer to the beginning of the 1.1251 + * unused portion of buffer. 1.1252 + * @param destCapacity The size of the output buffer, On return, this 1.1253 + * value will be updated to reflect the space remaining 1.1254 + * unused in the output buffer. 1.1255 + * @param status A reference to a UErrorCode to receive any errors. 1.1256 + * @return The length of the result string. In the event that 1.1257 + * destCapacity is inadequate, the full length of the 1.1258 + * untruncated output string is returned. 
1.1259 + * 1.1260 + * @stable ICU 3.0 1.1261 + */ 1.1262 +U_STABLE int32_t U_EXPORT2 1.1263 +uregex_appendTail(URegularExpression *regexp, 1.1264 + UChar **destBuf, 1.1265 + int32_t *destCapacity, 1.1266 + UErrorCode *status); 1.1267 + 1.1268 +/** 1.1269 + * As the final step in a find-and-replace operation, append the remainder 1.1270 + * of the input string, starting at the position following the last match, 1.1271 + * to the destination string. <code>uregex_appendTailUText()</code> is intended 1.1272 + * to be invoked after one or more invocations of the 1.1273 + * <code>uregex_appendReplacementUText()</code> function. 1.1274 + * 1.1275 + * @param regexp The regular expression object. This is needed to 1.1276 + * obtain the input string and with the position 1.1277 + * of the last match within it. 1.1278 + * @param dest A mutable UText that will receive the result. Must not be NULL. 1.1279 + * 1.1280 + * @param status Error code 1.1281 + * 1.1282 + * @return The destination UText. 1.1283 + * 1.1284 + * @stable ICU 4.6 1.1285 + */ 1.1286 +U_STABLE UText * U_EXPORT2 1.1287 +uregex_appendTailUText(URegularExpression *regexp, 1.1288 + UText *dest, 1.1289 + UErrorCode *status); 1.1290 + 1.1291 + /** 1.1292 + * Split a string into fields. Somewhat like split() from Perl. 1.1293 + * The pattern matches identify delimiters that separate the input 1.1294 + * into fields. The input data between the matches becomes the 1.1295 + * fields themselves. 1.1296 + * 1.1297 + * Each of the fields is copied from the input string to the destination 1.1298 + * buffer, and NUL terminated. The position of each field within 1.1299 + * the destination buffer is returned in the destFields array. 1.1300 + * 1.1301 + * If the delimiter pattern includes capture groups, the captured text will 1.1302 + * also appear in the destination array of output strings, interspersed 1.1303 + * with the fields. 
This is similar to Perl, but differs from Java, 1.1304 + * which ignores the presence of capture groups in the pattern. 1.1305 + * 1.1306 + * Trailing empty fields will always be returned, assuming sufficient 1.1307 + * destination capacity. This differs from the default behavior for Java 1.1308 + * and Perl where trailing empty fields are not returned. 1.1309 + * 1.1310 + * The number of strings produced by the split operation is returned. 1.1311 + * This count includes the strings from capture groups in the delimiter pattern. 1.1312 + * This behavior differs from Java, which ignores capture groups. 1.1313 + * 1.1314 + * @param regexp The compiled regular expression. 1.1315 + * @param destBuf A (UChar *) buffer to receive the fields that 1.1316 + * are extracted from the input string. These 1.1317 + * field pointers will refer to positions within the 1.1318 + * destination buffer supplied by the caller. Any 1.1319 + * extra positions within the destFields array will be 1.1320 + * set to NULL. 1.1321 + * @param destCapacity The capacity of the destBuf. 1.1322 + * @param requiredCapacity The actual capacity required of the destBuf. 1.1323 + * If destCapacity is too small, requiredCapacity will return 1.1324 + * the total capacity required to hold all of the output, and 1.1325 + * a U_BUFFER_OVERFLOW_ERROR will be returned. 1.1326 + * @param destFields An array to be filled with the position of each 1.1327 + * of the extracted fields within destBuf. 1.1328 + * @param destFieldsCapacity The number of elements in the destFields array. 1.1329 + * If the number of fields found is less than destFieldsCapacity, 1.1330 + * the extra destFields elements are set to zero. 1.1331 + * If destFieldsCapacity is too small, the trailing part of the 1.1332 + * input, including any field delimiters, is treated as if it 1.1333 + * were the last field - it is copied to the destBuf, and 1.1334 + * its position in the destBuf is stored in the last element 1.1335 + * of destFields.
This behavior mimics that of Perl. It is not 1.1336 + * an error condition, and no error status is returned when all destFields 1.1337 + * positions are used. 1.1338 + * @param status A reference to a UErrorCode to receive any errors. 1.1339 + * @return The number of fields into which the input string was split. 1.1340 + * @stable ICU 3.0 1.1341 + */ 1.1342 +U_STABLE int32_t U_EXPORT2 1.1343 +uregex_split( URegularExpression *regexp, 1.1344 + UChar *destBuf, 1.1345 + int32_t destCapacity, 1.1346 + int32_t *requiredCapacity, 1.1347 + UChar *destFields[], 1.1348 + int32_t destFieldsCapacity, 1.1349 + UErrorCode *status); 1.1350 + 1.1351 + /** 1.1352 + * Split a string into fields. Somewhat like split() from Perl. 1.1353 + * The pattern matches identify delimiters that separate the input 1.1354 + * into fields. The input data between the matches becomes the 1.1355 + * fields themselves. 1.1356 + * <p> 1.1357 + * The behavior of this function is not very closely aligned with uregex_split(); 1.1358 + * instead, it is based on (and implemented directly on top of) the C++ split method. 1.1359 + * 1.1360 + * @param regexp The compiled regular expression. 1.1361 + * @param destFields An array of mutable UText structs to receive the results of the split. 1.1362 + * If a field is NULL, a new UText is allocated to contain the results for 1.1363 + * that field. This new UText is not guaranteed to be mutable. 1.1364 + * @param destFieldsCapacity The number of elements in the destination array. 1.1365 + * If the number of fields found is less than destFieldsCapacity, the 1.1366 + * extra strings in the destination array are not altered. 1.1367 + * If the number of destination strings is less than the number 1.1368 + * of fields, the trailing part of the input string, including any 1.1369 + * field delimiters, is placed in the last destination string. 1.1370 + * This behavior mimics that of Perl.
It is not an error condition, and no 1.1371 + * error status is returned when all destFields positions are used. 1.1372 + * @param status A reference to a UErrorCode to receive any errors. 1.1373 + * @return The number of fields into which the input string was split. 1.1374 + * 1.1375 + * @stable ICU 4.6 1.1376 + */ 1.1377 +U_STABLE int32_t U_EXPORT2 1.1378 +uregex_splitUText(URegularExpression *regexp, 1.1379 + UText *destFields[], 1.1380 + int32_t destFieldsCapacity, 1.1381 + UErrorCode *status); 1.1382 + 1.1383 +/** 1.1384 + * Set a processing time limit for match operations with this URegularExpression. 1.1385 + * 1.1386 + * Some patterns, when matching certain strings, can run in exponential time. 1.1387 + * For practical purposes, the match operation may appear to be in an 1.1388 + * infinite loop. 1.1389 + * When a limit is set, a match operation will fail with an error if the 1.1390 + * limit is exceeded. 1.1391 + * <p> 1.1392 + * The units of the limit are steps of the match engine. 1.1393 + * Correspondence with actual processor time will depend on the speed 1.1394 + * of the processor and the details of the specific pattern, but will 1.1395 + * typically be on the order of milliseconds. 1.1396 + * <p> 1.1397 + * By default, the matching time is not limited. 1.1398 + * <p> 1.1399 + * 1.1400 + * @param regexp The compiled regular expression. 1.1401 + * @param limit The limit value, or 0 for no limit. 1.1402 + * @param status A reference to a UErrorCode to receive any errors. 1.1403 + * @stable ICU 4.0 1.1404 + */ 1.1405 +U_STABLE void U_EXPORT2 1.1406 +uregex_setTimeLimit(URegularExpression *regexp, 1.1407 + int32_t limit, 1.1408 + UErrorCode *status); 1.1409 + 1.1410 +/** 1.1411 + * Get the time limit for matches with this URegularExpression. 1.1412 + * A return value of zero indicates that there is no limit. 1.1413 + * 1.1414 + * @param regexp The compiled regular expression. 1.1415 + * @param status A reference to a UErrorCode to receive any errors.
1.1416 + * @return the maximum allowed time for a match, in units of processing steps. 1.1417 + * @stable ICU 4.0 1.1418 + */ 1.1419 +U_STABLE int32_t U_EXPORT2 1.1420 +uregex_getTimeLimit(const URegularExpression *regexp, 1.1421 + UErrorCode *status); 1.1422 + 1.1423 +/** 1.1424 + * Set the amount of heap storage available for use by the match backtracking stack. 1.1425 + * <p> 1.1426 + * ICU uses a backtracking regular expression engine, with the backtrack stack 1.1427 + * maintained on the heap. This function sets the limit to the amount of memory 1.1428 + * that can be used for this purpose. A backtracking stack overflow will 1.1429 + * result in an error from the match operation that caused it. 1.1430 + * <p> 1.1431 + * A limit is desirable because a malicious or poorly designed pattern can use 1.1432 + * excessive memory, potentially crashing the process. A limit is enabled 1.1433 + * by default. 1.1434 + * <p> 1.1435 + * @param regexp The compiled regular expression. 1.1436 + * @param limit The maximum size, in bytes, of the matching backtrack stack. 1.1437 + * A value of zero means no limit. 1.1438 + * The limit must be greater than or equal to zero. 1.1439 + * @param status A reference to a UErrorCode to receive any errors. 1.1440 + * 1.1441 + * @stable ICU 4.0 1.1442 + */ 1.1443 +U_STABLE void U_EXPORT2 1.1444 +uregex_setStackLimit(URegularExpression *regexp, 1.1445 + int32_t limit, 1.1446 + UErrorCode *status); 1.1447 + 1.1448 +/** 1.1449 + * Get the size of the heap storage available for use by the backtracking stack. 1.1450 + * 1.1451 + * @return the maximum backtracking stack size, in bytes, or zero if the 1.1452 + * stack size is unlimited. 1.1453 + * @stable ICU 4.0 1.1454 + */ 1.1455 +U_STABLE int32_t U_EXPORT2 1.1456 +uregex_getStackLimit(const URegularExpression *regexp, 1.1457 + UErrorCode *status); 1.1458 + 1.1459 + 1.1460 +/** 1.1461 + * Function pointer for a regular expression matching callback function.
1.1462 + * When set, a callback function will be called periodically during matching 1.1463 + * operations. If the callback function returns FALSE, the matching 1.1464 + * operation will be terminated early. 1.1465 + * 1.1466 + * Note: the callback function must not call other functions on this 1.1467 + * URegularExpression. 1.1468 + * 1.1469 + * @param context context pointer. The callback function will be invoked 1.1470 + * with the context specified at the time that 1.1471 + * uregex_setMatchCallback() is called. 1.1472 + * @param steps the accumulated processing time, in match steps, 1.1473 + * for this matching operation. 1.1474 + * @return TRUE to continue the matching operation. 1.1475 + * FALSE to terminate the matching operation. 1.1476 + * @stable ICU 4.0 1.1477 + */ 1.1478 +U_CDECL_BEGIN 1.1479 +typedef UBool U_CALLCONV URegexMatchCallback ( 1.1480 + const void *context, 1.1481 + int32_t steps); 1.1482 +U_CDECL_END 1.1483 + 1.1484 +/** 1.1485 + * Set a callback function for this URegularExpression. 1.1486 + * During matching operations the function will be called periodically, 1.1487 + * giving the application the opportunity to terminate a long-running 1.1488 + * match. 1.1489 + * 1.1490 + * @param regexp The compiled regular expression. 1.1491 + * @param callback A pointer to the user-supplied callback function. 1.1492 + * @param context User context pointer. The value supplied at the 1.1493 + * time the callback function is set will be saved 1.1494 + * and passed to the callback each time that it is called. 1.1495 + * @param status A reference to a UErrorCode to receive any errors. 1.1496 + * @stable ICU 4.0 1.1497 + */ 1.1498 +U_STABLE void U_EXPORT2 1.1499 +uregex_setMatchCallback(URegularExpression *regexp, 1.1500 + URegexMatchCallback *callback, 1.1501 + const void *context, 1.1502 + UErrorCode *status); 1.1503 + 1.1504 + 1.1505 +/** 1.1506 + * Get the callback function for this URegularExpression.
1.1507 + * 1.1508 + * @param regexp The compiled regular expression. 1.1509 + * @param callback Out parameter, receives a pointer to the user-supplied 1.1510 + * callback function. 1.1511 + * @param context Out parameter, receives the user context pointer that 1.1512 + * was set when uregex_setMatchCallback() was called. 1.1513 + * @param status A reference to a UErrorCode to receive any errors. 1.1514 + * @stable ICU 4.0 1.1515 + */ 1.1516 +U_STABLE void U_EXPORT2 1.1517 +uregex_getMatchCallback(const URegularExpression *regexp, 1.1518 + URegexMatchCallback **callback, 1.1519 + const void **context, 1.1520 + UErrorCode *status); 1.1521 + 1.1522 +/** 1.1523 + * Function pointer for a regular expression find callback function. 1.1524 + * 1.1525 + * When set, a callback function will be called during a find operation 1.1526 + * and for operations that depend on find, such as findNext, split and some replace 1.1527 + * operations like replaceFirst. 1.1528 + * The callback will usually be called after each attempt at a match, but this is not a 1.1529 + * guarantee that the callback will be invoked at each character. For finds where the 1.1530 + * match engine is invoked at each character, this may be close to true, but less likely 1.1531 + * for more optimized loops where the pattern is known to only start, and the match 1.1532 + * engine invoked, at certain characters. 1.1533 + * When invoked, this callback will specify the index at which a match operation is about 1.1534 + * to be attempted, giving the application the opportunity to terminate a long-running 1.1535 + * find operation. 1.1536 + * 1.1537 + * If the callback function returns FALSE, the find operation will be terminated early. 1.1538 + * 1.1539 + * Note: the callback function must not call other functions on this 1.1540 + * URegularExpression. 1.1541 + * 1.1542 + * @param context context pointer.
The callback function will be invoked 1.1543 + * with the context specified at the time that 1.1544 + * uregex_setFindProgressCallback() is called. 1.1545 + * @param matchIndex the next index at which a match will be attempted for this 1.1546 + * find operation. If this callback interrupts the search, this is the 1.1547 + * index at which a find/findNext operation may be re-initiated. 1.1548 + * @return TRUE to continue the matching operation. 1.1549 + * FALSE to terminate the matching operation. 1.1550 + * @stable ICU 4.6 1.1551 + */ 1.1552 +U_CDECL_BEGIN 1.1553 +typedef UBool U_CALLCONV URegexFindProgressCallback ( 1.1554 + const void *context, 1.1555 + int64_t matchIndex); 1.1556 +U_CDECL_END 1.1557 + 1.1558 + 1.1559 +/** 1.1560 + * Set the find progress callback function for this URegularExpression. 1.1561 + * 1.1562 + * @param regexp The compiled regular expression. 1.1563 + * @param callback A pointer to the user-supplied callback function. 1.1564 + * @param context User context pointer. The value supplied at the 1.1565 + * time the callback function is set will be saved 1.1566 + * and passed to the callback each time that it is called. 1.1567 + * @param status A reference to a UErrorCode to receive any errors. 1.1568 + * @stable ICU 4.6 1.1569 + */ 1.1570 +U_STABLE void U_EXPORT2 1.1571 +uregex_setFindProgressCallback(URegularExpression *regexp, 1.1572 + URegexFindProgressCallback *callback, 1.1573 + const void *context, 1.1574 + UErrorCode *status); 1.1575 + 1.1576 +/** 1.1577 + * Get the find progress callback function for this URegularExpression. 1.1578 + * 1.1579 + * @param regexp The compiled regular expression. 1.1580 + * @param callback Out parameter, receives a pointer to the user-supplied 1.1581 + * callback function. 1.1582 + * @param context Out parameter, receives the user context pointer that 1.1583 + * was set when uregex_setFindProgressCallback() was called. 1.1584 + * @param status A reference to a UErrorCode to receive any errors.
1.1585 + * @stable ICU 4.6 1.1586 + */ 1.1587 +U_STABLE void U_EXPORT2 1.1588 +uregex_getFindProgressCallback(const URegularExpression *regexp, 1.1589 + URegexFindProgressCallback **callback, 1.1590 + const void **context, 1.1591 + UErrorCode *status); 1.1592 + 1.1593 +#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */ 1.1594 +#endif /* UREGEX_H */