Thu, 22 Jan 2015 13:21:57 +0100
Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6
michael@0 | 1 | /* |
michael@0 | 2 | ********************************************************************** |
michael@0 | 3 | * Copyright (C) 2004-2013, International Business Machines |
michael@0 | 4 | * Corporation and others. All Rights Reserved. |
michael@0 | 5 | ********************************************************************** |
michael@0 | 6 | * file name: uregex.h |
michael@0 | 7 | * encoding: US-ASCII |
michael@0 | 8 | * indentation:4 |
michael@0 | 9 | * |
michael@0 | 10 | * created on: 2004mar09 |
michael@0 | 11 | * created by: Andy Heninger |
michael@0 | 12 | * |
michael@0 | 13 | * ICU Regular Expressions, API for C |
michael@0 | 14 | */ |
michael@0 | 15 | |
michael@0 | 16 | /** |
michael@0 | 17 | * \file |
michael@0 | 18 | * \brief C API: Regular Expressions |
michael@0 | 19 | * |
michael@0 | 20 | * <p>This is a C wrapper around the C++ RegexPattern and RegexMatcher classes.</p> |
michael@0 | 21 | */ |
michael@0 | 22 | |
michael@0 | 23 | #ifndef UREGEX_H |
michael@0 | 24 | #define UREGEX_H |
michael@0 | 25 | |
michael@0 | 26 | #include "unicode/utext.h" |
michael@0 | 27 | #include "unicode/utypes.h" |
michael@0 | 28 | |
michael@0 | 29 | #if !UCONFIG_NO_REGULAR_EXPRESSIONS |
michael@0 | 30 | |
michael@0 | 31 | #include "unicode/localpointer.h" |
michael@0 | 32 | #include "unicode/parseerr.h" |
michael@0 | 33 | |
michael@0 | 34 | struct URegularExpression; |
michael@0 | 35 | /** |
michael@0 | 36 | * Structure representing a compiled regular expression, plus the results |
michael@0 | 37 | * of a match operation. Only forward-declared here; the functions below take it by pointer. |
michael@0 | 38 | * @stable ICU 3.0 |
michael@0 | 39 | */ |
michael@0 | 40 | typedef struct URegularExpression URegularExpression; |
michael@0 | 41 | |
michael@0 | 42 | |
michael@0 | 43 | /** |
michael@0 | 44 | * Constants for Regular Expression Match Modes. Each constant is a single bit; combine them with bitwise OR. |
michael@0 | 45 | * @stable ICU 2.4 |
michael@0 | 46 | */ |
michael@0 | 47 | typedef enum URegexpFlag{ |
michael@0 | 48 | |
michael@0 | 49 | #ifndef U_HIDE_DRAFT_API |
michael@0 | 50 | /** Forces normalization of pattern and strings. |
michael@0 | 51 | Not implemented yet, just a placeholder, hence draft. |
michael@0 | 52 | @draft ICU 2.4 */ |
michael@0 | 53 | UREGEX_CANON_EQ = 128, |
michael@0 | 54 | #endif /* U_HIDE_DRAFT_API */ |
michael@0 | 55 | /** Enable case insensitive matching. @stable ICU 2.4 */ |
michael@0 | 56 | UREGEX_CASE_INSENSITIVE = 2, |
michael@0 | 57 | |
michael@0 | 58 | /** Allow white space and comments within patterns. @stable ICU 2.4 */ |
michael@0 | 59 | UREGEX_COMMENTS = 4, |
michael@0 | 60 | |
michael@0 | 61 | /** If set, '.' matches line terminators, otherwise '.' matching stops at line end. |
michael@0 | 62 | * @stable ICU 2.4 */ |
michael@0 | 63 | UREGEX_DOTALL = 32, |
michael@0 | 64 | |
michael@0 | 65 | /** If set, treat the entire pattern as a literal string. |
michael@0 | 66 | * Metacharacters or escape sequences in the pattern will be given |
michael@0 | 67 | * no special meaning. |
michael@0 | 68 | * |
michael@0 | 69 | * The flag UREGEX_CASE_INSENSITIVE retains its impact |
michael@0 | 70 | * on matching when used in conjunction with this flag. |
michael@0 | 71 | * The other flags become superfluous. |
michael@0 | 72 | * |
michael@0 | 73 | * @stable ICU 4.0 |
michael@0 | 74 | */ |
michael@0 | 75 | UREGEX_LITERAL = 16, |
michael@0 | 76 | |
michael@0 | 77 | /** Control behavior of "$" and "^". |
michael@0 | 78 | * If set, recognize line terminators within string, |
michael@0 | 79 | * otherwise, match only at start and end of input string. |
michael@0 | 80 | * @stable ICU 2.4 */ |
michael@0 | 81 | UREGEX_MULTILINE = 8, |
michael@0 | 82 | |
michael@0 | 83 | /** Unix-only line endings. |
michael@0 | 84 | * When this mode is enabled, only \\u000a is recognized as a line ending |
michael@0 | 85 | * in the behavior of ., ^, and $. |
michael@0 | 86 | * @stable ICU 4.0 |
michael@0 | 87 | */ |
michael@0 | 88 | UREGEX_UNIX_LINES = 1, |
michael@0 | 89 | |
michael@0 | 90 | /** Unicode word boundaries. |
michael@0 | 91 | * If set, \b uses the Unicode TR 29 definition of word boundaries. |
michael@0 | 92 | * Warning: Unicode word boundaries are quite different from |
michael@0 | 93 | * traditional regular expression word boundaries. See |
michael@0 | 94 | * http://unicode.org/reports/tr29/#Word_Boundaries |
michael@0 | 95 | * @stable ICU 2.8 |
michael@0 | 96 | */ |
michael@0 | 97 | UREGEX_UWORD = 256, |
michael@0 | 98 | |
michael@0 | 99 | /** Error on unrecognized backslash escapes. |
michael@0 | 100 | * If set, fail with an error on patterns that contain |
michael@0 | 101 | * backslash-escaped ASCII letters without a known special |
michael@0 | 102 | * meaning. If this flag is not set, these |
michael@0 | 103 | * escaped letters represent themselves. |
michael@0 | 104 | * @stable ICU 4.0 |
michael@0 | 105 | */ |
michael@0 | 106 | UREGEX_ERROR_ON_UNKNOWN_ESCAPES = 512 |
michael@0 | 107 | |
michael@0 | 108 | } URegexpFlag; |
michael@0 | 109 | |
michael@0 | 110 | /** |
michael@0 | 111 | * Open (compile) an ICU regular expression. Compiles the regular expression in |
michael@0 | 112 | * string form into an internal representation using the specified match mode flags. |
michael@0 | 113 | * The resulting regular expression handle can then be used to perform various |
michael@0 | 114 | * matching operations. |
michael@0 | 115 | * |
michael@0 | 116 | * |
michael@0 | 117 | * @param pattern The Regular Expression pattern to be compiled. |
michael@0 | 118 | * @param patternLength The length of the pattern, or -1 if the pattern is |
michael@0 | 119 | * NUL terminated. |
michael@0 | 120 | * @param flags Flags that alter the default matching behavior for |
michael@0 | 121 | * the regular expression, UREGEX_CASE_INSENSITIVE, for |
michael@0 | 122 | * example. For default behavior, set this parameter to zero. |
michael@0 | 123 | * See <code>enum URegexpFlag</code>. All desired flags |
michael@0 | 124 | * are bitwise-ORed together. |
michael@0 | 125 | * @param pe Receives the position (line and column numbers) of any syntax |
michael@0 | 126 | * error within the source regular expression string. If this |
michael@0 | 127 | * information is not wanted, pass NULL for this parameter. |
michael@0 | 128 | * @param status Receives errors detected by this function. |
michael@0 | 129 | * @return The URegularExpression object representing the compiled pattern. |
michael@0 | 130 | * @stable ICU 3.0 |
michael@0 | 131 | */ |
michael@0 | 132 | U_STABLE URegularExpression * U_EXPORT2 |
michael@0 | 133 | uregex_open( const UChar *pattern, |
michael@0 | 134 | int32_t patternLength, |
michael@0 | 135 | uint32_t flags, |
michael@0 | 136 | UParseError *pe, |
michael@0 | 137 | UErrorCode *status); |
michael@0 | 138 | |
michael@0 | 139 | /** |
michael@0 | 140 | * Open (compile) an ICU regular expression. Compiles the regular expression in |
michael@0 | 141 | * string form into an internal representation using the specified match mode flags. |
michael@0 | 142 | * The resulting regular expression handle can then be used to perform various |
michael@0 | 143 | * matching operations. |
michael@0 | 144 | * <p> |
michael@0 | 145 | * The contents of the pattern UText will be extracted and saved. Ownership of the |
michael@0 | 146 | * UText struct itself remains with the caller. This is to match the behavior of |
michael@0 | 147 | * uregex_open(). |
michael@0 | 148 | * |
michael@0 | 149 | * @param pattern The Regular Expression pattern to be compiled. |
michael@0 | 150 | * @param flags Flags that alter the default matching behavior for |
michael@0 | 151 | * the regular expression, UREGEX_CASE_INSENSITIVE, for |
michael@0 | 152 | * example. For default behavior, set this parameter to zero. |
michael@0 | 153 | * See <code>enum URegexpFlag</code>. All desired flags |
michael@0 | 154 | * are bitwise-ORed together. |
michael@0 | 155 | * @param pe Receives the position (line and column numbers) of any syntax |
michael@0 | 156 | * error within the source regular expression string. If this |
michael@0 | 157 | * information is not wanted, pass NULL for this parameter. |
michael@0 | 158 | * @param status Receives errors detected by this function. |
michael@0 | 159 | * @return The URegularExpression object representing the compiled pattern. |
michael@0 | 160 | * @stable ICU 4.6 |
michael@0 | 161 | */ |
michael@0 | 162 | U_STABLE URegularExpression * U_EXPORT2 |
michael@0 | 163 | uregex_openUText(UText *pattern, |
michael@0 | 164 | uint32_t flags, |
michael@0 | 165 | UParseError *pe, |
michael@0 | 166 | UErrorCode *status); |
michael@0 | 167 | |
michael@0 | 168 | /** |
michael@0 | 169 | * Open (compile) an ICU regular expression. The resulting regular expression |
michael@0 | 170 | * handle can then be used to perform various matching operations. |
michael@0 | 171 | * <p> |
michael@0 | 172 | * This function is the same as uregex_open, except that the pattern |
michael@0 | 173 | * is supplied as an 8 bit char * string in the default code page. |
michael@0 | 174 | * It is only declared when UCONFIG_NO_CONVERSION is not set. |
michael@0 | 175 | * @param pattern The Regular Expression pattern to be compiled, |
michael@0 | 176 | * NUL terminated. |
michael@0 | 177 | * @param flags Flags that alter the default matching behavior for |
michael@0 | 178 | * the regular expression, UREGEX_CASE_INSENSITIVE, for |
michael@0 | 179 | * example. For default behavior, set this parameter to zero. |
michael@0 | 180 | * See <code>enum URegexpFlag</code>. All desired flags |
michael@0 | 181 | * are bitwise-ORed together. |
michael@0 | 182 | * @param pe Receives the position (line and column numbers) of any syntax |
michael@0 | 183 | * error within the source regular expression string. If this |
michael@0 | 184 | * information is not wanted, pass NULL for this parameter. |
michael@0 | 185 | * @param status Receives errors detected by this function. |
michael@0 | 186 | * @return The URegularExpression object representing the compiled |
michael@0 | 187 | * pattern. |
michael@0 | 188 | * |
michael@0 | 189 | * @stable ICU 3.0 |
michael@0 | 190 | */ |
michael@0 | 191 | #if !UCONFIG_NO_CONVERSION |
michael@0 | 192 | U_STABLE URegularExpression * U_EXPORT2 |
michael@0 | 193 | uregex_openC( const char *pattern, |
michael@0 | 194 | uint32_t flags, |
michael@0 | 195 | UParseError *pe, |
michael@0 | 196 | UErrorCode *status); |
michael@0 | 197 | #endif /* !UCONFIG_NO_CONVERSION */ |
michael@0 | 198 | |
michael@0 | 199 | |
michael@0 | 200 | |
michael@0 | 201 | /** |
michael@0 | 202 | * Close the regular expression, recovering all resources (memory) it |
michael@0 | 203 | * was holding. |
michael@0 | 204 | * Also used as the closing function of LocalURegularExpressionPointer (C++ API). |
michael@0 | 205 | * @param regexp The regular expression to be closed. |
michael@0 | 206 | * @stable ICU 3.0 |
michael@0 | 207 | */ |
michael@0 | 208 | U_STABLE void U_EXPORT2 |
michael@0 | 209 | uregex_close(URegularExpression *regexp); |
michael@0 | 210 | |
michael@0 | 211 | #if U_SHOW_CPLUSPLUS_API |
michael@0 | 212 | |
michael@0 | 213 | U_NAMESPACE_BEGIN |
michael@0 | 214 | |
michael@0 | 215 | /** |
michael@0 | 216 | * \class LocalURegularExpressionPointer |
michael@0 | 217 | * "Smart pointer" class, closes a URegularExpression via uregex_close(). |
michael@0 | 218 | * For most methods see the LocalPointerBase base class. |
michael@0 | 219 | * Declared only when U_SHOW_CPLUSPLUS_API is enabled. |
michael@0 | 220 | * @see LocalPointerBase |
michael@0 | 221 | * @see LocalPointer |
michael@0 | 222 | * @stable ICU 4.4 |
michael@0 | 223 | */ |
michael@0 | 224 | U_DEFINE_LOCAL_OPEN_POINTER(LocalURegularExpressionPointer, URegularExpression, uregex_close); |
michael@0 | 225 | |
michael@0 | 226 | U_NAMESPACE_END |
michael@0 | 227 | |
michael@0 | 228 | #endif /* U_SHOW_CPLUSPLUS_API */ |
michael@0 | 229 | |
michael@0 | 230 | /** |
michael@0 | 231 | * Make a copy of a compiled regular expression. Cloning a regular |
michael@0 | 232 | * expression is faster than opening a second instance from the source |
michael@0 | 233 | * form of the expression, and requires less memory. |
michael@0 | 234 | * <p> |
michael@0 | 235 | * Note that the current input string and the position of any matched text |
michael@0 | 236 | * within it are not cloned; only the pattern itself and the |
michael@0 | 237 | * match mode flags are copied. |
michael@0 | 238 | * <p> |
michael@0 | 239 | * Cloning can be particularly useful to threaded applications that perform |
michael@0 | 240 | * multiple match operations in parallel. Each concurrent RE |
michael@0 | 241 | * operation requires its own instance of a URegularExpression. |
michael@0 | 242 | * |
michael@0 | 243 | * @param regexp The compiled regular expression to be cloned. |
michael@0 | 244 | * @param status Receives indication of any errors encountered. |
michael@0 | 245 | * @return The cloned copy of the compiled regular expression. |
michael@0 | 246 | * @stable ICU 3.0 |
michael@0 | 247 | */ |
michael@0 | 248 | U_STABLE URegularExpression * U_EXPORT2 |
michael@0 | 249 | uregex_clone(const URegularExpression *regexp, UErrorCode *status); |
michael@0 | 250 | |
michael@0 | 251 | /** |
michael@0 | 252 | * Returns a pointer to the source form of the pattern for this regular expression. |
michael@0 | 253 | * This function will work even if the pattern was originally specified as a UText. |
michael@0 | 254 | * |
michael@0 | 255 | * @param regexp The compiled regular expression. |
michael@0 | 256 | * @param patLength This output parameter will be set to the length of the |
michael@0 | 257 | * pattern string. A NULL pointer may be used here if the |
michael@0 | 258 | * pattern length is not needed, as would be the case if |
michael@0 | 259 | * the pattern is known in advance to be a NUL terminated |
michael@0 | 260 | * string. |
michael@0 | 261 | * @param status Receives errors detected by this function. |
michael@0 | 262 | * @return A pointer to the pattern string (UChar). The storage for the string is |
michael@0 | 263 | * owned by the regular expression object, and must not be |
michael@0 | 264 | * altered or deleted by the application. The returned string |
michael@0 | 265 | * will remain valid until the regular expression is closed. |
michael@0 | 266 | * @stable ICU 3.0 |
michael@0 | 267 | */ |
michael@0 | 268 | U_STABLE const UChar * U_EXPORT2 |
michael@0 | 269 | uregex_pattern(const URegularExpression *regexp, |
michael@0 | 270 | int32_t *patLength, |
michael@0 | 271 | UErrorCode *status); |
michael@0 | 272 | |
michael@0 | 273 | /** |
michael@0 | 274 | * Returns the source text of the pattern for this regular expression. |
michael@0 | 275 | * This function will work even if the pattern was originally specified as a UChar string. |
michael@0 | 276 | * |
michael@0 | 277 | * @param regexp The compiled regular expression. |
michael@0 | 278 | * @param status Receives errors detected by this function. |
michael@0 | 279 | * @return The pattern text, as a UText. The storage for the text is owned by the regular expression |
michael@0 | 280 | * object, and must not be altered or deleted. |
michael@0 | 281 | * |
michael@0 | 282 | * @stable ICU 4.6 |
michael@0 | 283 | */ |
michael@0 | 284 | U_STABLE UText * U_EXPORT2 |
michael@0 | 285 | uregex_patternUText(const URegularExpression *regexp, |
michael@0 | 286 | UErrorCode *status); |
michael@0 | 287 | |
michael@0 | 288 | /** |
michael@0 | 289 | * Get the match mode flags that were specified when compiling this regular expression. |
michael@0 | 290 | * @param regexp The compiled regular expression. |
michael@0 | 291 | * @param status Receives errors detected by this function. |
michael@0 | 292 | * @return The match mode flags, a bitwise OR of URegexpFlag values. |
michael@0 | 293 | * @see URegexpFlag |
michael@0 | 294 | * @stable ICU 3.0 |
michael@0 | 295 | */ |
michael@0 | 296 | U_STABLE int32_t U_EXPORT2 |
michael@0 | 297 | uregex_flags(const URegularExpression *regexp, |
michael@0 | 298 | UErrorCode *status); |
michael@0 | 299 | |
michael@0 | 300 | |
michael@0 | 301 | /** |
michael@0 | 302 | * Set the subject text string upon which the regular expression will look for matches. |
michael@0 | 303 | * This function may be called any number of times, allowing the regular |
michael@0 | 304 | * expression pattern to be applied to different strings. |
michael@0 | 305 | * <p> |
michael@0 | 306 | * Regular expression matching operations work directly on the application's |
michael@0 | 307 | * string data. No copy is made. The subject string data must not be |
michael@0 | 308 | * altered after calling this function until after all regular expression |
michael@0 | 309 | * operations involving this string data are completed. |
michael@0 | 310 | * <p> |
michael@0 | 311 | * Zero-length strings are permitted. In this case, no subsequent match |
michael@0 | 312 | * operation will dereference the text string pointer. |
michael@0 | 313 | * |
michael@0 | 314 | * @param regexp The compiled regular expression. |
michael@0 | 315 | * @param text The subject text string. |
michael@0 | 316 | * @param textLength The length of the subject text, or -1 if the string |
michael@0 | 317 | * is NUL terminated. |
michael@0 | 318 | * @param status Receives errors detected by this function. |
michael@0 | 319 | * @stable ICU 3.0 |
michael@0 | 320 | */ |
michael@0 | 321 | U_STABLE void U_EXPORT2 |
michael@0 | 322 | uregex_setText(URegularExpression *regexp, |
michael@0 | 323 | const UChar *text, |
michael@0 | 324 | int32_t textLength, |
michael@0 | 325 | UErrorCode *status); |
michael@0 | 326 | |
michael@0 | 327 | |
michael@0 | 328 | /** |
michael@0 | 329 | * Set the subject text string upon which the regular expression will look for matches. |
michael@0 | 330 | * This function may be called any number of times, allowing the regular |
michael@0 | 331 | * expression pattern to be applied to different strings. |
michael@0 | 332 | * <p> |
michael@0 | 333 | * Regular expression matching operations work directly on the application's |
michael@0 | 334 | * string data; only a shallow clone is made. The subject string data must not be |
michael@0 | 335 | * altered after calling this function until after all regular expression |
michael@0 | 336 | * operations involving this string data are completed. |
michael@0 | 337 | * |
michael@0 | 338 | * @param regexp The compiled regular expression. |
michael@0 | 339 | * @param text The subject text, supplied as a UText. |
michael@0 | 340 | * @param status Receives errors detected by this function. |
michael@0 | 341 | * |
michael@0 | 342 | * @stable ICU 4.6 |
michael@0 | 343 | */ |
michael@0 | 344 | U_STABLE void U_EXPORT2 |
michael@0 | 345 | uregex_setUText(URegularExpression *regexp, |
michael@0 | 346 | UText *text, |
michael@0 | 347 | UErrorCode *status); |
michael@0 | 348 | |
michael@0 | 349 | /** |
michael@0 | 350 | * Get the subject text that is currently associated with this |
michael@0 | 351 | * regular expression object. If the input was supplied using uregex_setText(), |
michael@0 | 352 | * that same pointer will be returned. Otherwise, the characters in the input will |
michael@0 | 353 | * be extracted to a buffer and returned. In either case, ownership remains |
michael@0 | 354 | * with the regular expression object. |
michael@0 | 355 | * |
michael@0 | 356 | * This function will work even if the input was originally specified as a UText. |
michael@0 | 357 | * |
michael@0 | 358 | * @param regexp The compiled regular expression. |
michael@0 | 359 | * @param textLength The length of the string is returned in this output parameter. |
michael@0 | 360 | * A NULL pointer may be used here if the |
michael@0 | 361 | * text length is not needed, as would be the case if |
michael@0 | 362 | * the text is known in advance to be a NUL terminated |
michael@0 | 363 | * string. |
michael@0 | 364 | * @param status Receives errors detected by this function. |
michael@0 | 365 | * @return Pointer to the subject text string currently associated with |
michael@0 | 366 | * this regular expression. |
michael@0 | 367 | * @stable ICU 3.0 |
michael@0 | 368 | */ |
michael@0 | 369 | U_STABLE const UChar * U_EXPORT2 |
michael@0 | 370 | uregex_getText(URegularExpression *regexp, |
michael@0 | 371 | int32_t *textLength, |
michael@0 | 372 | UErrorCode *status); |
michael@0 | 373 | |
michael@0 | 374 | /** |
michael@0 | 375 | * Get the subject text that is currently associated with this |
michael@0 | 376 | * regular expression object. |
michael@0 | 377 | * |
michael@0 | 378 | * This function will work even if the input was originally specified as a UChar string. |
michael@0 | 379 | * |
michael@0 | 380 | * @param regexp The compiled regular expression. |
michael@0 | 381 | * @param dest A mutable UText in which to store the current input. |
michael@0 | 382 | * If NULL, a new UText will be created as an immutable shallow clone |
michael@0 | 383 | * of the actual input string. |
michael@0 | 384 | * @param status Receives errors detected by this function. |
michael@0 | 385 | * @return The subject text currently associated with this regular expression. |
michael@0 | 386 | * If a pre-allocated UText was provided in dest, it will always be used and returned. |
michael@0 | 387 | * |
michael@0 | 388 | * @stable ICU 4.6 |
michael@0 | 389 | */ |
michael@0 | 390 | U_STABLE UText * U_EXPORT2 |
michael@0 | 391 | uregex_getUText(URegularExpression *regexp, |
michael@0 | 392 | UText *dest, |
michael@0 | 393 | UErrorCode *status); |
michael@0 | 394 | |
michael@0 | 395 | /** |
michael@0 | 396 | * Set the subject text string upon which the regular expression is looking for matches |
michael@0 | 397 | * without changing any other aspect of the matching state. |
michael@0 | 398 | * The new and previous text strings must have the same content. |
michael@0 | 399 | * |
michael@0 | 400 | * This function is intended for use in environments where ICU is operating on |
michael@0 | 401 | * strings that may move around in memory. It provides a mechanism for notifying |
michael@0 | 402 | * ICU that the string has been relocated, and providing a new UText to access the |
michael@0 | 403 | * string in its new position. |
michael@0 | 404 | * |
michael@0 | 405 | * Note that the regular expression implementation never copies the underlying text |
michael@0 | 406 | * of a string being matched, but always operates directly on the original text |
michael@0 | 407 | * provided by the user. Refreshing simply drops the references to the old text |
michael@0 | 408 | * and replaces them with references to the new. |
michael@0 | 409 | * |
michael@0 | 410 | * Caution: this function is normally used only by very specialized |
michael@0 | 411 | * system-level code. One example use case is with garbage collection |
michael@0 | 412 | * that moves the text in memory. |
michael@0 | 413 | * |
michael@0 | 414 | * @param regexp The compiled regular expression. |
michael@0 | 415 | * @param text The new (moved) text string; its content must be the same as the previous text. |
michael@0 | 416 | * @param status Receives errors detected by this function. |
michael@0 | 417 | * |
michael@0 | 418 | * @stable ICU 4.8 |
michael@0 | 419 | */ |
michael@0 | 420 | U_STABLE void U_EXPORT2 |
michael@0 | 421 | uregex_refreshUText(URegularExpression *regexp, |
michael@0 | 422 | UText *text, |
michael@0 | 423 | UErrorCode *status); |
michael@0 | 424 | |
michael@0 | 425 | /** |
michael@0 | 426 | * Attempts to match the input string against the pattern. |
michael@0 | 427 | * To succeed, the match must extend to the end of the string, |
michael@0 | 428 | * or cover the complete match region. |
michael@0 | 429 | * |
michael@0 | 430 | * If startIndex >= zero the match operation starts at the specified |
michael@0 | 431 | * index and must extend to the end of the input string. Any region |
michael@0 | 432 | * that has been specified is reset. |
michael@0 | 433 | * |
michael@0 | 434 | * If startIndex == -1 the match must cover the input region, or the entire |
michael@0 | 435 | * input string if no region has been set. This directly corresponds to |
michael@0 | 436 | * Matcher.matches() in Java. |
michael@0 | 437 | * |
michael@0 | 438 | * @param regexp The compiled regular expression. |
michael@0 | 439 | * @param startIndex The input string (native) index at which to begin matching, or -1 |
michael@0 | 440 | * to match the input Region. |
michael@0 | 441 | * @param status Receives errors detected by this function. |
michael@0 | 442 | * @return TRUE if there is a match. |
michael@0 | 443 | * @stable ICU 3.0 |
michael@0 | 444 | */ |
michael@0 | 445 | U_STABLE UBool U_EXPORT2 |
michael@0 | 446 | uregex_matches(URegularExpression *regexp, |
michael@0 | 447 | int32_t startIndex, |
michael@0 | 448 | UErrorCode *status); |
michael@0 | 449 | |
michael@0 | 450 | /** |
michael@0 | 451 | * 64-bit version of uregex_matches(): startIndex is an int64_t. |
michael@0 | 452 | * Attempts to match the input string against the pattern. |
michael@0 | 453 | * To succeed, the match must extend to the end of the string, |
michael@0 | 454 | * or cover the complete match region. |
michael@0 | 455 | * |
michael@0 | 456 | * If startIndex >= zero the match operation starts at the specified |
michael@0 | 457 | * index and must extend to the end of the input string. Any region |
michael@0 | 458 | * that has been specified is reset. |
michael@0 | 459 | * |
michael@0 | 460 | * If startIndex == -1 the match must cover the input region, or the entire |
michael@0 | 461 | * input string if no region has been set. This directly corresponds to |
michael@0 | 462 | * Matcher.matches() in Java. |
michael@0 | 463 | * |
michael@0 | 464 | * @param regexp The compiled regular expression. |
michael@0 | 465 | * @param startIndex The input string (native) index at which to begin matching, or -1 |
michael@0 | 466 | * to match the input Region. |
michael@0 | 467 | * @param status Receives errors detected by this function. |
michael@0 | 468 | * @return TRUE if there is a match. |
michael@0 | 469 | * @stable ICU 4.6 |
michael@0 | 470 | */ |
michael@0 | 471 | U_STABLE UBool U_EXPORT2 |
michael@0 | 472 | uregex_matches64(URegularExpression *regexp, |
michael@0 | 473 | int64_t startIndex, |
michael@0 | 474 | UErrorCode *status); |
michael@0 | 475 | |
michael@0 | 476 | /** |
michael@0 | 477 | * Attempts to match the input string, starting from the specified index, against the pattern. |
michael@0 | 478 | * The match may be of any length, and is not required to extend to the end |
michael@0 | 479 | * of the input string. Contrast with uregex_matches(). |
michael@0 | 480 | * |
michael@0 | 481 | * <p>If startIndex is >= 0 any input region that was set for this |
michael@0 | 482 | * URegularExpression is reset before the operation begins. |
michael@0 | 483 | * |
michael@0 | 484 | * <p>If the specified starting index == -1 the match begins at the start of the input |
michael@0 | 485 | * region, or at the start of the full string if no region has been specified. |
michael@0 | 486 | * This corresponds directly with Matcher.lookingAt() in Java. |
michael@0 | 487 | * |
michael@0 | 488 | * <p>If the match succeeds then more information can be obtained via the |
michael@0 | 489 | * <code>uregex_start()</code>, <code>uregex_end()</code>, |
michael@0 | 490 | * and <code>uregex_group()</code> functions.</p> |
michael@0 | 491 | * |
michael@0 | 492 | * @param regexp The compiled regular expression. |
michael@0 | 493 | * @param startIndex The input string (native) index at which to begin matching, or |
michael@0 | 494 | * -1 to match the Input Region. |
michael@0 | 495 | * @param status A pointer to a UErrorCode to receive any errors. |
michael@0 | 496 | * @return TRUE if there is a match. |
michael@0 | 497 | * @stable ICU 3.0 |
michael@0 | 498 | */ |
michael@0 | 499 | U_STABLE UBool U_EXPORT2 |
michael@0 | 500 | uregex_lookingAt(URegularExpression *regexp, |
michael@0 | 501 | int32_t startIndex, |
michael@0 | 502 | UErrorCode *status); |
michael@0 | 503 | |
michael@0 | 504 | /** |
michael@0 | 505 | * 64-bit version of uregex_lookingAt(): startIndex is an int64_t. |
michael@0 | 506 | * Attempts to match the input string, starting from the specified index, against the pattern. |
michael@0 | 507 | * The match may be of any length, and is not required to extend to the end |
michael@0 | 508 | * of the input string. Contrast with uregex_matches(). |
michael@0 | 509 | * |
michael@0 | 510 | * <p>If startIndex is >= 0 any input region that was set for this |
michael@0 | 511 | * URegularExpression is reset before the operation begins. |
michael@0 | 512 | * |
michael@0 | 513 | * <p>If the specified starting index == -1 the match begins at the start of the input |
michael@0 | 514 | * region, or at the start of the full string if no region has been specified. |
michael@0 | 515 | * This corresponds directly with Matcher.lookingAt() in Java. |
michael@0 | 516 | * |
michael@0 | 517 | * <p>If the match succeeds then more information can be obtained via the |
michael@0 | 518 | * <code>uregex_start()</code>, <code>uregex_end()</code>, |
michael@0 | 519 | * and <code>uregex_group()</code> functions.</p> |
michael@0 | 520 | * |
michael@0 | 521 | * @param regexp The compiled regular expression. |
michael@0 | 522 | * @param startIndex The input string (native) index at which to begin matching, or |
michael@0 | 523 | * -1 to match the Input Region. |
michael@0 | 524 | * @param status A pointer to a UErrorCode to receive any errors. |
michael@0 | 525 | * @return TRUE if there is a match. |
michael@0 | 526 | * @stable ICU 4.6 |
michael@0 | 527 | */ |
michael@0 | 528 | U_STABLE UBool U_EXPORT2 |
michael@0 | 529 | uregex_lookingAt64(URegularExpression *regexp, |
michael@0 | 530 | int64_t startIndex, |
michael@0 | 531 | UErrorCode *status); |
michael@0 | 532 | |
michael@0 | 533 | /** |
michael@0 | 534 | * Find the first substring of the input string that matches the pattern. |
michael@0 | 535 | * If startIndex is >= zero the search for a match begins at the specified index, |
michael@0 | 536 | * and any match region is reset. This corresponds directly with |
michael@0 | 537 | * Matcher.find(startIndex) in Java. |
michael@0 | 538 | * |
michael@0 | 539 | * If startIndex == -1 the search begins at the start of the input region, |
michael@0 | 540 | * or at the start of the full string if no region has been specified. |
michael@0 | 541 | * |
michael@0 | 542 | * If a match is found, <code>uregex_start(), uregex_end()</code>, and |
michael@0 | 543 | * <code>uregex_group()</code> will provide more information regarding the match. |
michael@0 | 544 | * |
michael@0 | 545 | * @param regexp The compiled regular expression. |
michael@0 | 546 | * @param startIndex The position (native) in the input string to begin the search, or |
michael@0 | 547 | * -1 to search within the Input Region. |
michael@0 | 548 | * @param status A pointer to a UErrorCode to receive any errors. |
michael@0 | 549 | * @return TRUE if a match is found. |
michael@0 | 550 | * @stable ICU 3.0 |
michael@0 | 551 | */ |
michael@0 | 552 | U_STABLE UBool U_EXPORT2 |
michael@0 | 553 | uregex_find(URegularExpression *regexp, |
michael@0 | 554 | int32_t startIndex, |
michael@0 | 555 | UErrorCode *status); |
michael@0 | 556 | |
michael@0 | 557 | /** |
michael@0 | 558 | * 64-bit version of uregex_find(): startIndex is an int64_t. |
michael@0 | 559 | * Find the first substring of the input string that matches the pattern. |
michael@0 | 560 | * If startIndex is >= zero the search for a match begins at the specified index, |
michael@0 | 561 | * and any match region is reset. This corresponds directly with |
michael@0 | 562 | * Matcher.find(startIndex) in Java. |
michael@0 | 563 | * |
michael@0 | 564 | * If startIndex == -1 the search begins at the start of the input region, |
michael@0 | 565 | * or at the start of the full string if no region has been specified. |
michael@0 | 566 | * |
michael@0 | 567 | * If a match is found, <code>uregex_start(), uregex_end()</code>, and |
michael@0 | 568 | * <code>uregex_group()</code> will provide more information regarding the match. |
michael@0 | 569 | * |
michael@0 | 570 | * @param regexp The compiled regular expression. |
michael@0 | 571 | * @param startIndex The position (native) in the input string to begin the search, or |
michael@0 | 572 | * -1 to search within the Input Region. |
michael@0 | 573 | * @param status A pointer to a UErrorCode to receive any errors. |
michael@0 | 574 | * @return TRUE if a match is found. |
michael@0 | 575 | * @stable ICU 4.6 |
michael@0 | 576 | */ |
michael@0 | 577 | U_STABLE UBool U_EXPORT2 |
michael@0 | 578 | uregex_find64(URegularExpression *regexp, |
michael@0 | 579 | int64_t startIndex, |
michael@0 | 580 | UErrorCode *status); |
michael@0 | 581 | |
michael@0 | 582 | /** |
michael@0 | 583 | * Find the next pattern match in the input string. Begin searching |
michael@0 | 584 | * the input at the location following the end of the previous match, |
michael@0 | 585 | * or at the start of the string (or region) if there is no |
michael@0 | 586 | * previous match. If a match is found, <code>uregex_start(), uregex_end()</code>, and |
michael@0 | 587 | * <code>uregex_group()</code> will provide more information regarding the match. |
michael@0 | 588 | * |
michael@0 | 589 | * @param regexp The compiled regular expression. |
michael@0 | 590 | * @param status A reference to a UErrorCode to receive any errors. |
michael@0 | 591 | * @return TRUE if a match is found. |
michael@0 | 592 | * @see uregex_reset |
michael@0 | 593 | * @stable ICU 3.0 |
michael@0 | 594 | */ |
michael@0 | 595 | U_STABLE UBool U_EXPORT2 |
michael@0 | 596 | uregex_findNext(URegularExpression *regexp, |
michael@0 | 597 | UErrorCode *status); |
michael@0 | 598 | |
michael@0 | 599 | /** |
michael@0 | 600 | * Get the number of capturing groups in this regular expression's pattern. |
michael@0 | 601 | * @param regexp The compiled regular expression. |
michael@0 | 602 | * @param status A reference to a UErrorCode to receive any errors. |
michael@0 | 603 | * @return the number of capture groups |
michael@0 | 604 | * @stable ICU 3.0 |
michael@0 | 605 | */ |
michael@0 | 606 | U_STABLE int32_t U_EXPORT2 |
michael@0 | 607 | uregex_groupCount(URegularExpression *regexp, |
michael@0 | 608 | UErrorCode *status); |
michael@0 | 609 | |
michael@0 | 610 | /** Extract the string for the specified matching expression or subexpression. |
michael@0 | 611 | * Group #0 is the complete string of matched text. |
michael@0 | 612 | * Group #1 is the text matched by the first set of capturing parentheses. |
michael@0 | 613 | * |
michael@0 | 614 | * @param regexp The compiled regular expression. |
michael@0 | 615 | * @param groupNum The capture group to extract. Group 0 is the complete |
michael@0 | 616 | * match. The value of this parameter must be |
michael@0 | 617 | * less than or equal to the number of capture groups in |
michael@0 | 618 | * the pattern. |
michael@0 | 619 | * @param dest Buffer to receive the matching string data |
michael@0 | 620 | * @param destCapacity Capacity of the dest buffer. |
michael@0 | 621 | * @param status A reference to a UErrorCode to receive any errors. |
michael@0 | 622 | * @return Length of matching data, |
michael@0 | 623 | * or -1 if no applicable match. |
michael@0 | 624 | * @stable ICU 3.0 |
michael@0 | 625 | */ |
michael@0 | 626 | U_STABLE int32_t U_EXPORT2 |
michael@0 | 627 | uregex_group(URegularExpression *regexp, |
michael@0 | 628 | int32_t groupNum, |
michael@0 | 629 | UChar *dest, |
michael@0 | 630 | int32_t destCapacity, |
michael@0 | 631 | UErrorCode *status); |
michael@0 | 632 | |
michael@0 | 633 | /** Returns a shallow immutable clone of the entire input string. The returned UText current native index |
michael@0 | 634 | * is set to the beginning of the requested capture group. The capture group length is also |
michael@0 | 635 | * returned via groupLength. |
michael@0 | 636 | * Group #0 is the complete string of matched text. |
michael@0 | 637 | * Group #1 is the text matched by the first set of capturing parentheses. |
michael@0 | 638 | * |
michael@0 | 639 | * @param regexp The compiled regular expression. |
michael@0 | 640 | * @param groupNum The capture group to extract. Group 0 is the complete |
michael@0 | 641 | * match. The value of this parameter must be |
michael@0 | 642 | * less than or equal to the number of capture groups in |
michael@0 | 643 | * the pattern. |
michael@0 | 644 | * @param dest A mutable UText in which to store the current input. |
michael@0 | 645 | * If NULL, a new UText will be created as an immutable shallow clone |
michael@0 | 646 | * of the entire input string. |
michael@0 | 647 | * @param groupLength Output parameter that receives the length of the requested capture group. |
michael@0 | 648 | * @param status A reference to a UErrorCode to receive any errors. |
michael@0 | 649 | * @return The subject text currently associated with this regular expression. |
michael@0 | 650 | * If a pre-allocated UText was provided, it will always be used and returned. |
michael@0 | 651 | |
michael@0 | 652 | * |
michael@0 | 653 | * @stable ICU 4.6 |
michael@0 | 654 | */ |
michael@0 | 655 | U_STABLE UText * U_EXPORT2 |
michael@0 | 656 | uregex_groupUText(URegularExpression *regexp, |
michael@0 | 657 | int32_t groupNum, |
michael@0 | 658 | UText *dest, |
michael@0 | 659 | int64_t *groupLength, |
michael@0 | 660 | UErrorCode *status); |
michael@0 | 661 | |
michael@0 | 662 | #ifndef U_HIDE_INTERNAL_API |
michael@0 | 663 | /** Extract the string for the specified matching expression or subexpression. |
michael@0 | 664 | * Group #0 is the complete string of matched text. |
michael@0 | 665 | * Group #1 is the text matched by the first set of capturing parentheses. |
michael@0 | 666 | * |
michael@0 | 667 | * @param regexp The compiled regular expression. |
michael@0 | 668 | * @param groupNum The capture group to extract. Group 0 is the complete |
michael@0 | 669 | * match. The value of this parameter must be |
michael@0 | 670 | * less than or equal to the number of capture groups in |
michael@0 | 671 | * the pattern. |
michael@0 | 672 | * @param dest Mutable UText to receive the matching string data. |
michael@0 | 673 | * If NULL, a new UText will be created (which may not be mutable). |
michael@0 | 674 | * @param status A reference to a UErrorCode to receive any errors. |
michael@0 | 675 | * @return The matching string data. If a pre-allocated UText was provided, |
michael@0 | 676 | * it will always be used and returned. |
michael@0 | 677 | * |
michael@0 | 678 | * @internal ICU 4.4 technology preview |
michael@0 | 679 | */ |
michael@0 | 680 | U_INTERNAL UText * U_EXPORT2 |
michael@0 | 681 | uregex_groupUTextDeep(URegularExpression *regexp, |
michael@0 | 682 | int32_t groupNum, |
michael@0 | 683 | UText *dest, |
michael@0 | 684 | UErrorCode *status); |
michael@0 | 685 | #endif /* U_HIDE_INTERNAL_API */ |
michael@0 | 686 | |
michael@0 | 687 | /** |
michael@0 | 688 | * Returns the index in the input string of the start of the text matched by the |
michael@0 | 689 | * specified capture group during the previous match operation. Return -1 if |
michael@0 | 690 | * the capture group was not part of the last match. |
michael@0 | 691 | * Group #0 refers to the complete range of matched text. |
michael@0 | 692 | * Group #1 refers to the text matched by the first set of capturing parentheses. |
michael@0 | 693 | * |
michael@0 | 694 | * @param regexp The compiled regular expression. |
michael@0 | 695 | * @param groupNum The capture group number |
michael@0 | 696 | * @param status A reference to a UErrorCode to receive any errors. |
michael@0 | 697 | * @return the starting (native) position in the input of the text matched |
michael@0 | 698 | * by the specified group. |
michael@0 | 699 | * @stable ICU 3.0 |
michael@0 | 700 | */ |
michael@0 | 701 | U_STABLE int32_t U_EXPORT2 |
michael@0 | 702 | uregex_start(URegularExpression *regexp, |
michael@0 | 703 | int32_t groupNum, |
michael@0 | 704 | UErrorCode *status); |
michael@0 | 705 | |
michael@0 | 706 | /** |
michael@0 | 707 | * 64bit version of uregex_start. |
michael@0 | 708 | * Returns the index in the input string of the start of the text matched by the |
michael@0 | 709 | * specified capture group during the previous match operation. Return -1 if |
michael@0 | 710 | * the capture group was not part of the last match. |
michael@0 | 711 | * Group #0 refers to the complete range of matched text. |
michael@0 | 712 | * Group #1 refers to the text matched by the first set of capturing parentheses. |
michael@0 | 713 | * |
michael@0 | 714 | * @param regexp The compiled regular expression. |
michael@0 | 715 | * @param groupNum The capture group number |
michael@0 | 716 | * @param status A reference to a UErrorCode to receive any errors. |
michael@0 | 717 | * @return the starting (native) position in the input of the text matched |
michael@0 | 718 | * by the specified group. |
michael@0 | 719 | * @stable ICU 4.6 |
michael@0 | 720 | */ |
michael@0 | 721 | U_STABLE int64_t U_EXPORT2 |
michael@0 | 722 | uregex_start64(URegularExpression *regexp, |
michael@0 | 723 | int32_t groupNum, |
michael@0 | 724 | UErrorCode *status); |
michael@0 | 725 | |
michael@0 | 726 | /** |
michael@0 | 727 | * Returns the index in the input string of the position following the end |
michael@0 | 728 | * of the text matched by the specified capture group. |
michael@0 | 729 | * Return -1 if the capture group was not part of the last match. |
michael@0 | 730 | * Group #0 refers to the complete range of matched text. |
michael@0 | 731 | * Group #1 refers to the text matched by the first set of capturing parentheses. |
michael@0 | 732 | * |
michael@0 | 733 | * @param regexp The compiled regular expression. |
michael@0 | 734 | * @param groupNum The capture group number |
michael@0 | 735 | * @param status A reference to a UErrorCode to receive any errors. |
michael@0 | 736 | * @return the (native) index of the position following the last matched character. |
michael@0 | 737 | * @stable ICU 3.0 |
michael@0 | 738 | */ |
michael@0 | 739 | U_STABLE int32_t U_EXPORT2 |
michael@0 | 740 | uregex_end(URegularExpression *regexp, |
michael@0 | 741 | int32_t groupNum, |
michael@0 | 742 | UErrorCode *status); |
michael@0 | 743 | |
michael@0 | 744 | /** |
michael@0 | 745 | * 64bit version of uregex_end. |
michael@0 | 746 | * Returns the index in the input string of the position following the end |
michael@0 | 747 | * of the text matched by the specified capture group. |
michael@0 | 748 | * Return -1 if the capture group was not part of the last match. |
michael@0 | 749 | * Group #0 refers to the complete range of matched text. |
michael@0 | 750 | * Group #1 refers to the text matched by the first set of capturing parentheses. |
michael@0 | 751 | * |
michael@0 | 752 | * @param regexp The compiled regular expression. |
michael@0 | 753 | * @param groupNum The capture group number |
michael@0 | 754 | * @param status A reference to a UErrorCode to receive any errors. |
michael@0 | 755 | * @return the (native) index of the position following the last matched character. |
michael@0 | 756 | * @stable ICU 4.6 |
michael@0 | 757 | */ |
michael@0 | 758 | U_STABLE int64_t U_EXPORT2 |
michael@0 | 759 | uregex_end64(URegularExpression *regexp, |
michael@0 | 760 | int32_t groupNum, |
michael@0 | 761 | UErrorCode *status); |
michael@0 | 762 | |
michael@0 | 763 | /** |
michael@0 | 764 | * Reset any saved state from the previous match. Has the effect of |
michael@0 | 765 | * causing uregex_findNext to begin at the specified index, and causing |
michael@0 | 766 | * uregex_start(), uregex_end() and uregex_group() to return an error |
michael@0 | 767 | * indicating that there is no match information available. Clears any |
michael@0 | 768 | * match region that may have been set. |
michael@0 | 769 | * |
michael@0 | 770 | * @param regexp The compiled regular expression. |
michael@0 | 771 | * @param index The position (native) in the text at which a |
michael@0 | 772 | * uregex_findNext() should begin searching. |
michael@0 | 773 | * @param status A reference to a UErrorCode to receive any errors. |
michael@0 | 774 | * @stable ICU 3.0 |
michael@0 | 775 | */ |
michael@0 | 776 | U_STABLE void U_EXPORT2 |
michael@0 | 777 | uregex_reset(URegularExpression *regexp, |
michael@0 | 778 | int32_t index, |
michael@0 | 779 | UErrorCode *status); |
michael@0 | 780 | |
michael@0 | 781 | /** |
michael@0 | 782 | * 64bit version of uregex_reset. |
michael@0 | 783 | * Reset any saved state from the previous match. Has the effect of |
michael@0 | 784 | * causing uregex_findNext to begin at the specified index, and causing |
michael@0 | 785 | * uregex_start(), uregex_end() and uregex_group() to return an error |
michael@0 | 786 | * indicating that there is no match information available. Clears any |
michael@0 | 787 | * match region that may have been set. |
michael@0 | 788 | * |
michael@0 | 789 | * @param regexp The compiled regular expression. |
michael@0 | 790 | * @param index The position (native) in the text at which a |
michael@0 | 791 | * uregex_findNext() should begin searching. |
michael@0 | 792 | * @param status A reference to a UErrorCode to receive any errors. |
michael@0 | 793 | * @stable ICU 4.6 |
michael@0 | 794 | */ |
michael@0 | 795 | U_STABLE void U_EXPORT2 |
michael@0 | 796 | uregex_reset64(URegularExpression *regexp, |
michael@0 | 797 | int64_t index, |
michael@0 | 798 | UErrorCode *status); |
michael@0 | 799 | |
michael@0 | 800 | /** |
michael@0 | 801 | * Sets the limits of the matching region for this URegularExpression. |
michael@0 | 802 | * The region is the part of the input string that will be considered when matching. |
michael@0 | 803 | * Invoking this method resets any saved state from the previous match, |
michael@0 | 804 | * then sets the region to start at the index specified by the start parameter |
michael@0 | 805 | * and end at the index specified by the end parameter. |
michael@0 | 806 | * |
michael@0 | 807 | * Depending on the transparency and anchoring being used (see useTransparentBounds |
michael@0 | 808 | * and useAnchoringBounds), certain constructs such as anchors may behave differently |
michael@0 | 809 | * at or around the boundaries of the region. |
michael@0 | 810 | * |
michael@0 | 811 | * The function will fail if start is greater than limit, or if either index |
michael@0 | 812 | * is less than zero or greater than the length of the string being matched. |
michael@0 | 813 | * |
michael@0 | 814 | * @param regexp The compiled regular expression. |
michael@0 | 815 | * @param regionStart The (native) index to begin searches at. |
michael@0 | 816 | * @param regionLimit The (native) index to end searches at (exclusive). |
michael@0 | 817 | * @param status A pointer to a UErrorCode to receive any errors. |
michael@0 | 818 | * @stable ICU 4.0 |
michael@0 | 819 | */ |
michael@0 | 820 | U_STABLE void U_EXPORT2 |
michael@0 | 821 | uregex_setRegion(URegularExpression *regexp, |
michael@0 | 822 | int32_t regionStart, |
michael@0 | 823 | int32_t regionLimit, |
michael@0 | 824 | UErrorCode *status); |
michael@0 | 825 | |
michael@0 | 826 | /** |
michael@0 | 827 | * 64bit version of uregex_setRegion. |
michael@0 | 828 | * Sets the limits of the matching region for this URegularExpression. |
michael@0 | 829 | * The region is the part of the input string that will be considered when matching. |
michael@0 | 830 | * Invoking this method resets any saved state from the previous match, |
michael@0 | 831 | * then sets the region to start at the index specified by the start parameter |
michael@0 | 832 | * and end at the index specified by the end parameter. |
michael@0 | 833 | * |
michael@0 | 834 | * Depending on the transparency and anchoring being used (see useTransparentBounds |
michael@0 | 835 | * and useAnchoringBounds), certain constructs such as anchors may behave differently |
michael@0 | 836 | * at or around the boundaries of the region. |
michael@0 | 837 | * |
michael@0 | 838 | * The function will fail if start is greater than limit, or if either index |
michael@0 | 839 | * is less than zero or greater than the length of the string being matched. |
michael@0 | 840 | * |
michael@0 | 841 | * @param regexp The compiled regular expression. |
michael@0 | 842 | * @param regionStart The (native) index to begin searches at. |
michael@0 | 843 | * @param regionLimit The (native) index to end searches at (exclusive). |
michael@0 | 844 | * @param status A pointer to a UErrorCode to receive any errors. |
michael@0 | 845 | * @stable ICU 4.6 |
michael@0 | 846 | */ |
michael@0 | 847 | U_STABLE void U_EXPORT2 |
michael@0 | 848 | uregex_setRegion64(URegularExpression *regexp, |
michael@0 | 849 | int64_t regionStart, |
michael@0 | 850 | int64_t regionLimit, |
michael@0 | 851 | UErrorCode *status); |
michael@0 | 852 | |
michael@0 | 853 | /** |
michael@0 | 854 | * Set the matching region and the starting index for subsequent matches |
michael@0 | 855 | * in a single operation. |
michael@0 | 856 | * This is useful because the usual function for setting the starting |
michael@0 | 857 | * index, uregex_reset(), also resets any region limits. |
michael@0 | 858 | * |
michael@0 | 859 | * @param regexp The compiled regular expression. |
michael@0 | 860 | * @param regionStart The (native) index to begin searches at. |
michael@0 | 861 | * @param regionLimit The (native) index to end searches at (exclusive). |
michael@0 | 862 | * @param startIndex The index in the input text at which the next |
michael@0 | 863 | * match operation should begin. |
michael@0 | 864 | * @param status A pointer to a UErrorCode to receive any errors. |
michael@0 | 865 | * @stable ICU 4.6 |
michael@0 | 866 | */ |
michael@0 | 867 | U_STABLE void U_EXPORT2 |
michael@0 | 868 | uregex_setRegionAndStart(URegularExpression *regexp, |
michael@0 | 869 | int64_t regionStart, |
michael@0 | 870 | int64_t regionLimit, |
michael@0 | 871 | int64_t startIndex, |
michael@0 | 872 | UErrorCode *status); |
michael@0 | 873 | |
michael@0 | 874 | /** |
michael@0 | 875 | * Reports the start index of the matching region. Any matches found are limited to |
michael@0 | 876 | * the region bounded by regionStart (inclusive) and regionEnd (exclusive). |
michael@0 | 877 | * |
michael@0 | 878 | * @param regexp The compiled regular expression. |
michael@0 | 879 | * @param status A pointer to a UErrorCode to receive any errors. |
michael@0 | 880 | * @return The starting (native) index of this matcher's region. |
michael@0 | 881 | * @stable ICU 4.0 |
michael@0 | 882 | */ |
michael@0 | 883 | U_STABLE int32_t U_EXPORT2 |
michael@0 | 884 | uregex_regionStart(const URegularExpression *regexp, |
michael@0 | 885 | UErrorCode *status); |
michael@0 | 886 | |
michael@0 | 887 | /** |
michael@0 | 888 | * 64bit version of uregex_regionStart. |
michael@0 | 889 | * Reports the start index of the matching region. Any matches found are limited to |
michael@0 | 890 | * the region bounded by regionStart (inclusive) and regionEnd (exclusive). |
michael@0 | 891 | * |
michael@0 | 892 | * @param regexp The compiled regular expression. |
michael@0 | 893 | * @param status A pointer to a UErrorCode to receive any errors. |
michael@0 | 894 | * @return The starting (native) index of this matcher's region. |
michael@0 | 895 | * @stable ICU 4.6 |
michael@0 | 896 | */ |
michael@0 | 897 | U_STABLE int64_t U_EXPORT2 |
michael@0 | 898 | uregex_regionStart64(const URegularExpression *regexp, |
michael@0 | 899 | UErrorCode *status); |
michael@0 | 900 | |
michael@0 | 901 | /** |
michael@0 | 902 | * Reports the end index (exclusive) of the matching region for this URegularExpression. |
michael@0 | 903 | * Any matches found are limited to the region bounded by regionStart (inclusive) |
michael@0 | 904 | * and regionEnd (exclusive). |
michael@0 | 905 | * |
michael@0 | 906 | * @param regexp The compiled regular expression. |
michael@0 | 907 | * @param status A pointer to a UErrorCode to receive any errors. |
michael@0 | 908 | * @return The ending point (native) of this matcher's region. |
michael@0 | 909 | * @stable ICU 4.0 |
michael@0 | 910 | */ |
michael@0 | 911 | U_STABLE int32_t U_EXPORT2 |
michael@0 | 912 | uregex_regionEnd(const URegularExpression *regexp, |
michael@0 | 913 | UErrorCode *status); |
michael@0 | 914 | |
michael@0 | 915 | /** |
michael@0 | 916 | * 64bit version of uregex_regionEnd. |
michael@0 | 917 | * Reports the end index (exclusive) of the matching region for this URegularExpression. |
michael@0 | 918 | * Any matches found are limited to the region bounded by regionStart (inclusive) |
michael@0 | 919 | * and regionEnd (exclusive). |
michael@0 | 920 | * |
michael@0 | 921 | * @param regexp The compiled regular expression. |
michael@0 | 922 | * @param status A pointer to a UErrorCode to receive any errors. |
michael@0 | 923 | * @return The ending point (native) of this matcher's region. |
michael@0 | 924 | * @stable ICU 4.6 |
michael@0 | 925 | */ |
michael@0 | 926 | U_STABLE int64_t U_EXPORT2 |
michael@0 | 927 | uregex_regionEnd64(const URegularExpression *regexp, |
michael@0 | 928 | UErrorCode *status); |
michael@0 | 929 | |
michael@0 | 930 | /** |
michael@0 | 931 | * Queries the transparency of region bounds for this URegularExpression. |
michael@0 | 932 | * See useTransparentBounds for a description of transparent and opaque bounds. |
michael@0 | 933 | * By default, matching boundaries are opaque. |
michael@0 | 934 | * |
michael@0 | 935 | * @param regexp The compiled regular expression. |
michael@0 | 936 | * @param status A pointer to a UErrorCode to receive any errors. |
michael@0 | 937 | * @return TRUE if this matcher is using transparent bounds, FALSE if it is using opaque bounds. |
michael@0 | 938 | * @stable ICU 4.0 |
michael@0 | 939 | */ |
michael@0 | 940 | U_STABLE UBool U_EXPORT2 |
michael@0 | 941 | uregex_hasTransparentBounds(const URegularExpression *regexp, |
michael@0 | 942 | UErrorCode *status); |
michael@0 | 943 | |
michael@0 | 944 | |
michael@0 | 945 | /** |
michael@0 | 946 | * Sets the transparency of region bounds for this URegularExpression. |
michael@0 | 947 | * Invoking this function with an argument of TRUE will set matches to use transparent bounds. |
michael@0 | 948 | * If the boolean argument is FALSE, then opaque bounds will be used. |
michael@0 | 949 | * |
michael@0 | 950 | * Using transparent bounds, the boundaries of the matching region are transparent |
michael@0 | 951 | * to lookahead, lookbehind, and boundary matching constructs. Those constructs can |
michael@0 | 952 | * see text beyond the boundaries of the region while checking for a match. |
michael@0 | 953 | * |
michael@0 | 954 | * With opaque bounds, no text outside of the matching region is visible to lookahead, |
michael@0 | 955 | * lookbehind, and boundary matching constructs. |
michael@0 | 956 | * |
michael@0 | 957 | * By default, opaque bounds are used. |
michael@0 | 958 | * |
michael@0 | 959 | * @param regexp The compiled regular expression. |
michael@0 | 960 | * @param b TRUE for transparent bounds; FALSE for opaque bounds |
michael@0 | 961 | * @param status A pointer to a UErrorCode to receive any errors. |
michael@0 | 962 | * @stable ICU 4.0 |
michael@0 | 963 | **/ |
michael@0 | 964 | U_STABLE void U_EXPORT2 |
michael@0 | 965 | uregex_useTransparentBounds(URegularExpression *regexp, |
michael@0 | 966 | UBool b, |
michael@0 | 967 | UErrorCode *status); |
michael@0 | 968 | |
michael@0 | 969 | |
michael@0 | 970 | /** |
michael@0 | 971 | * Return true if this URegularExpression is using anchoring bounds. |
michael@0 | 972 | * By default, anchoring region bounds are used. |
michael@0 | 973 | * |
michael@0 | 974 | * @param regexp The compiled regular expression. |
michael@0 | 975 | * @param status A pointer to a UErrorCode to receive any errors. |
michael@0 | 976 | * @return TRUE if this matcher is using anchoring bounds. |
michael@0 | 977 | * @stable ICU 4.0 |
michael@0 | 978 | */ |
michael@0 | 979 | U_STABLE UBool U_EXPORT2 |
michael@0 | 980 | uregex_hasAnchoringBounds(const URegularExpression *regexp, |
michael@0 | 981 | UErrorCode *status); |
michael@0 | 982 | |
michael@0 | 983 | |
michael@0 | 984 | /** |
michael@0 | 985 | * Set whether this URegularExpression is using Anchoring Bounds for its region. |
michael@0 | 986 | * With anchoring bounds, pattern anchors such as ^ and $ will match at the start |
michael@0 | 987 | * and end of the region. Without Anchoring Bounds, anchors will only match at |
michael@0 | 988 | * the positions they would in the complete text. |
michael@0 | 989 | * |
michael@0 | 990 | * Anchoring Bounds are the default for regions. |
michael@0 | 991 | * |
michael@0 | 992 | * @param regexp The compiled regular expression. |
michael@0 | 993 | * @param b TRUE to enable anchoring bounds; FALSE to disable them. |
michael@0 | 994 | * @param status A pointer to a UErrorCode to receive any errors. |
michael@0 | 995 | * @stable ICU 4.0 |
michael@0 | 996 | */ |
michael@0 | 997 | U_STABLE void U_EXPORT2 |
michael@0 | 998 | uregex_useAnchoringBounds(URegularExpression *regexp, |
michael@0 | 999 | UBool b, |
michael@0 | 1000 | UErrorCode *status); |
michael@0 | 1001 | |
michael@0 | 1002 | /** |
michael@0 | 1003 | * Return TRUE if the most recent matching operation touched the |
michael@0 | 1004 | * end of the text being processed. In this case, additional input text could |
michael@0 | 1005 | * change the results of that match. |
michael@0 | 1006 | * |
michael@0 | 1007 | * @param regexp The compiled regular expression. |
michael@0 | 1008 | * @param status A pointer to a UErrorCode to receive any errors. |
michael@0 | 1009 | * @return TRUE if the most recent match hit the end of input |
michael@0 | 1010 | * @stable ICU 4.0 |
michael@0 | 1011 | */ |
michael@0 | 1012 | U_STABLE UBool U_EXPORT2 |
michael@0 | 1013 | uregex_hitEnd(const URegularExpression *regexp, |
michael@0 | 1014 | UErrorCode *status); |
michael@0 | 1015 | |
michael@0 | 1016 | /** |
michael@0 | 1017 | * Return TRUE if the most recent match succeeded and additional input could cause |
michael@0 | 1018 | * it to fail. If this function returns false and a match was found, then more input |
michael@0 | 1019 | * might change the match but the match won't be lost. If a match was not found, |
michael@0 | 1020 | * then requireEnd has no meaning. |
michael@0 | 1021 | * |
michael@0 | 1022 | * @param regexp The compiled regular expression. |
michael@0 | 1023 | * @param status A pointer to a UErrorCode to receive any errors. |
michael@0 | 1024 | * @return TRUE if more input could cause the most recent match to no longer match. |
michael@0 | 1025 | * @stable ICU 4.0 |
michael@0 | 1026 | */ |
michael@0 | 1027 | U_STABLE UBool U_EXPORT2 |
michael@0 | 1028 | uregex_requireEnd(const URegularExpression *regexp, |
michael@0 | 1029 | UErrorCode *status); |
michael@0 | 1030 | |
michael@0 | 1031 | |
michael@0 | 1032 | |
michael@0 | 1033 | |
michael@0 | 1034 | |
michael@0 | 1035 | /** |
michael@0 | 1036 | * Replaces every substring of the input that matches the pattern |
michael@0 | 1037 | * with the given replacement string. This is a convenience function that |
michael@0 | 1038 | * provides a complete find-and-replace-all operation. |
michael@0 | 1039 | * |
michael@0 | 1040 | * This method scans the input string looking for matches of the pattern. |
michael@0 | 1041 | * Input that is not part of any match is copied unchanged to the |
michael@0 | 1042 | * destination buffer. Matched regions are replaced in the output |
michael@0 | 1043 | * buffer by the replacement string. The replacement string may contain |
michael@0 | 1044 | * references to capture groups; these take the form of $1, $2, etc. |
michael@0 | 1045 | * |
michael@0 | 1046 | * @param regexp The compiled regular expression. |
michael@0 | 1047 | * @param replacementText A string containing the replacement text. |
michael@0 | 1048 | * @param replacementLength The length of the replacement string, or |
michael@0 | 1049 | * -1 if it is NUL terminated. |
michael@0 | 1050 | * @param destBuf A (UChar *) buffer that will receive the result. |
michael@0 | 1051 | * @param destCapacity The capacity of the destination buffer. |
michael@0 | 1052 | * @param status A reference to a UErrorCode to receive any errors. |
michael@0 | 1053 | * @return The length of the string resulting from the find |
michael@0 | 1054 | * and replace operation. In the event that the |
michael@0 | 1055 | * destination capacity is inadequate, the return value |
michael@0 | 1056 | * is still the full length of the untruncated string. |
michael@0 | 1057 | * @stable ICU 3.0 |
michael@0 | 1058 | */ |
michael@0 | 1059 | U_STABLE int32_t U_EXPORT2 |
michael@0 | 1060 | uregex_replaceAll(URegularExpression *regexp, |
michael@0 | 1061 | const UChar *replacementText, |
michael@0 | 1062 | int32_t replacementLength, |
michael@0 | 1063 | UChar *destBuf, |
michael@0 | 1064 | int32_t destCapacity, |
michael@0 | 1065 | UErrorCode *status); |
michael@0 | 1066 | |
michael@0 | 1067 | /** |
michael@0 | 1068 | * Replaces every substring of the input that matches the pattern |
michael@0 | 1069 | * with the given replacement string. This is a convenience function that |
michael@0 | 1070 | * provides a complete find-and-replace-all operation. |
michael@0 | 1071 | * |
michael@0 | 1072 | * This method scans the input string looking for matches of the pattern. |
michael@0 | 1073 | * Input that is not part of any match is copied unchanged to the |
michael@0 | 1074 | * destination buffer. Matched regions are replaced in the output |
michael@0 | 1075 | * buffer by the replacement string. The replacement string may contain |
michael@0 | 1076 | * references to capture groups; these take the form of $1, $2, etc. |
michael@0 | 1077 | * |
michael@0 | 1078 | * @param regexp The compiled regular expression. |
michael@0 | 1079 | * @param replacement A string containing the replacement text. |
michael@0 | 1080 | * @param dest A mutable UText that will receive the result. |
michael@0 | 1081 | * If NULL, a new UText will be created (which may not be mutable). |
michael@0 | 1082 | * @param status A reference to a UErrorCode to receive any errors. |
michael@0 | 1083 | * @return A UText containing the results of the find and replace. |
michael@0 | 1084 | * If a pre-allocated UText was provided, it will always be used and returned. |
michael@0 | 1085 | * |
michael@0 | 1086 | * @stable ICU 4.6 |
michael@0 | 1087 | */ |
michael@0 | 1088 | U_STABLE UText * U_EXPORT2 |
michael@0 | 1089 | uregex_replaceAllUText(URegularExpression *regexp, |
michael@0 | 1090 | UText *replacement, |
michael@0 | 1091 | UText *dest, |
michael@0 | 1092 | UErrorCode *status); |
michael@0 | 1093 | |
michael@0 | 1094 | /** |
michael@0 | 1095 | * Replaces the first substring of the input that matches the pattern |
michael@0 | 1096 | * with the given replacement string. This is a convenience function that |
michael@0 | 1097 | * provides a complete find-and-replace operation. |
michael@0 | 1098 | * |
michael@0 | 1099 | * This method scans the input string looking for a match of the pattern. |
michael@0 | 1100 | * All input that is not part of the match is copied unchanged to the |
michael@0 | 1101 | * destination buffer. The matched region is replaced in the output |
michael@0 | 1102 | * buffer by the replacement string. The replacement string may contain |
michael@0 | 1103 | * references to capture groups; these take the form of $1, $2, etc. |
michael@0 | 1104 | * |
michael@0 | 1105 | * @param regexp The compiled regular expression. |
michael@0 | 1106 | * @param replacementText A string containing the replacement text. |
michael@0 | 1107 | * @param replacementLength The length of the replacement string, or |
michael@0 | 1108 | * -1 if it is NUL terminated. |
michael@0 | 1109 | * @param destBuf A (UChar *) buffer that will receive the result. |
michael@0 | 1110 | * @param destCapacity The capacity of the destination buffer. |
michael@0 | 1111 | * @param status A reference to a UErrorCode to receive any errors. |
michael@0 | 1112 | * @return The length of the string resulting from the find |
michael@0 | 1113 | * and replace operation. In the event that the |
michael@0 | 1114 | * destination capacity is inadequate, the return value |
michael@0 | 1115 | * is still the full length of the untruncated string. |
michael@0 | 1116 | * @stable ICU 3.0 |
michael@0 | 1117 | */ |
michael@0 | 1118 | U_STABLE int32_t U_EXPORT2 |
michael@0 | 1119 | uregex_replaceFirst(URegularExpression *regexp, |
michael@0 | 1120 | const UChar *replacementText, |
michael@0 | 1121 | int32_t replacementLength, |
michael@0 | 1122 | UChar *destBuf, |
michael@0 | 1123 | int32_t destCapacity, |
michael@0 | 1124 | UErrorCode *status); |
michael@0 | 1125 | |
michael@0 | 1126 | /** |
michael@0 | 1127 | * Replaces the first substring of the input that matches the pattern |
michael@0 | 1128 | * with the given replacement text. This is a convenience function that |
michael@0 | 1129 | * provides a complete find-and-replace operation. |
michael@0 | 1130 | * |
michael@0 | 1131 | * This method scans the input string looking for a match of the pattern. |
michael@0 | 1132 | * All input that is not part of the match is copied unchanged to the |
michael@0 | 1133 | * destination UText. The matched region is replaced in the output |
michael@0 | 1134 | * UText by the replacement text. The replacement text may contain |
michael@0 | 1135 | * references to capture groups; these take the form of $1, $2, etc. |
michael@0 | 1136 | * |
michael@0 | 1137 | * @param regexp The compiled regular expression. |
michael@0 | 1138 | * @param replacement A UText containing the replacement text. |
michael@0 | 1139 | * @param dest A mutable UText that will receive the result. |
michael@0 | 1140 | * If NULL, a new UText will be created (which may not be mutable). |
michael@0 | 1141 | * @param status A reference to a UErrorCode to receive any errors. |
michael@0 | 1142 | * @return A UText containing the results of the find and replace. |
michael@0 | 1143 | * If a pre-allocated UText was provided, it will always be used and returned. |
michael@0 | 1144 | * |
michael@0 | 1145 | * @stable ICU 4.6 |
michael@0 | 1146 | */ |
michael@0 | 1147 | U_STABLE UText * U_EXPORT2 |
michael@0 | 1148 | uregex_replaceFirstUText(URegularExpression *regexp, |
michael@0 | 1149 | UText *replacement, |
michael@0 | 1150 | UText *dest, |
michael@0 | 1151 | UErrorCode *status); |
michael@0 | 1152 | |
michael@0 | 1153 | /** |
michael@0 | 1154 | * Implements a replace operation intended to be used as part of an |
michael@0 | 1155 | * incremental find-and-replace. |
michael@0 | 1156 | * |
michael@0 | 1157 | * <p>The input string, starting from the end of the previous match and ending at |
michael@0 | 1158 | * the start of the current match, is appended to the destination string. Then the |
michael@0 | 1159 | * replacement string is appended to the output string, |
michael@0 | 1160 | * including handling any substitutions of captured text.</p> |
michael@0 | 1161 | * |
michael@0 | 1162 | * <p>A note on preflight computation of buffer size and error handling: |
michael@0 | 1163 | * Calls to uregex_appendReplacement() and uregex_appendTail() are |
michael@0 | 1164 | * designed to be chained, one after another, with the destination |
michael@0 | 1165 | * buffer pointer and buffer capacity updated after each in preparation |
michael@0 | 1166 | * for the next. If the destination buffer is exhausted partway through such a |
michael@0 | 1167 | * sequence, a U_BUFFER_OVERFLOW_ERROR status will be returned. Normal |
michael@0 | 1168 | * ICU conventions are for a function to perform no action if it is |
michael@0 | 1169 | * called with an error status, but for this one case, uregex_appendReplacement() |
michael@0 | 1170 | * will operate normally so that buffer size computations will complete |
michael@0 | 1171 | * correctly.</p> |
michael@0 | 1172 | * |
michael@0 | 1173 | * <p>For simple, prepackaged, non-incremental find-and-replace |
michael@0 | 1174 | * operations, see uregex_replaceFirst() or uregex_replaceAll().</p> |
michael@0 | 1175 | * |
michael@0 | 1176 | * @param regexp The regular expression object. |
michael@0 | 1177 | * @param replacementText The string that will replace the matched portion of the |
michael@0 | 1178 | * input string as it is copied to the destination buffer. |
michael@0 | 1179 | * The replacement text may contain references ($1, for |
michael@0 | 1180 | * example) to capture groups from the match. |
michael@0 | 1181 | * @param replacementLength The length of the replacement text string, |
michael@0 | 1182 | * or -1 if the string is NUL terminated. |
michael@0 | 1183 | * @param destBuf The buffer into which the results of the |
michael@0 | 1184 | * find-and-replace are placed. On return, this pointer |
michael@0 | 1185 | * will be updated to refer to the beginning of the |
michael@0 | 1186 | * unused portion of buffer, leaving it in position for |
michael@0 | 1187 | * a subsequent call to this function. |
michael@0 | 1188 | * @param destCapacity The size of the output buffer. On return, this |
michael@0 | 1189 | * parameter will be updated to reflect the space remaining |
michael@0 | 1190 | * unused in the output buffer. |
michael@0 | 1191 | * @param status A reference to a UErrorCode to receive any errors. |
michael@0 | 1192 | * @return The length of the result string. In the event that |
michael@0 | 1193 | * destCapacity is inadequate, the full length of the |
michael@0 | 1194 | * untruncated output string is returned. |
michael@0 | 1195 | * |
michael@0 | 1196 | * @stable ICU 3.0 |
michael@0 | 1197 | * |
michael@0 | 1198 | */ |
michael@0 | 1199 | U_STABLE int32_t U_EXPORT2 |
michael@0 | 1200 | uregex_appendReplacement(URegularExpression *regexp, |
michael@0 | 1201 | const UChar *replacementText, |
michael@0 | 1202 | int32_t replacementLength, |
michael@0 | 1203 | UChar **destBuf, |
michael@0 | 1204 | int32_t *destCapacity, |
michael@0 | 1205 | UErrorCode *status); |
michael@0 | 1206 | |
michael@0 | 1207 | /** |
michael@0 | 1208 | * Implements a replace operation intended to be used as part of an |
michael@0 | 1209 | * incremental find-and-replace. |
michael@0 | 1210 | * |
michael@0 | 1211 | * <p>The input string, starting from the end of the previous match and ending at |
michael@0 | 1212 | * the start of the current match, is appended to the destination UText. Then the |
michael@0 | 1213 | * replacement text is appended to the destination UText, |
michael@0 | 1214 | * including handling any substitutions of captured text.</p> |
michael@0 | 1215 | * |
michael@0 | 1216 | * <p>For simple, prepackaged, non-incremental find-and-replace |
michael@0 | 1217 | * operations, see uregex_replaceFirstUText() or uregex_replaceAllUText().</p> |
michael@0 | 1218 | * |
michael@0 | 1219 | * @param regexp The regular expression object. |
michael@0 | 1220 | * @param replacementText The text that will replace the matched portion of the |
michael@0 | 1221 | * input string as it is copied to the destination UText. |
michael@0 | 1222 | * The replacement text may contain references ($1, for |
michael@0 | 1223 | * example) to capture groups from the match. |
michael@0 | 1224 | * @param dest A mutable UText that will receive the result. Must not be NULL. |
michael@0 | 1225 | * @param status A reference to a UErrorCode to receive any errors. |
michael@0 | 1226 | * |
michael@0 | 1227 | * @stable ICU 4.6 |
michael@0 | 1228 | */ |
michael@0 | 1229 | U_STABLE void U_EXPORT2 |
michael@0 | 1230 | uregex_appendReplacementUText(URegularExpression *regexp, |
michael@0 | 1231 | UText *replacementText, |
michael@0 | 1232 | UText *dest, |
michael@0 | 1233 | UErrorCode *status); |
michael@0 | 1234 | |
michael@0 | 1235 | /** |
michael@0 | 1236 | * As the final step in a find-and-replace operation, append the remainder |
michael@0 | 1237 | * of the input string, starting at the position following the last match, |
michael@0 | 1238 | * to the destination string. <code>uregex_appendTail()</code> is intended |
michael@0 | 1239 | * to be invoked after one or more invocations of the |
michael@0 | 1240 | * <code>uregex_appendReplacement()</code> function. |
michael@0 | 1241 | * |
michael@0 | 1242 | * @param regexp The regular expression object. This is needed to |
michael@0 | 1243 | * obtain the input string along with the position |
michael@0 | 1244 | * of the last match within it. |
michael@0 | 1245 | * @param destBuf The buffer in which the results of the |
michael@0 | 1246 | * find-and-replace are placed. On return, the pointer |
michael@0 | 1247 | * will be updated to refer to the beginning of the |
michael@0 | 1248 | * unused portion of buffer. |
michael@0 | 1249 | * @param destCapacity The size of the output buffer. On return, this |
michael@0 | 1250 | * value will be updated to reflect the space remaining |
michael@0 | 1251 | * unused in the output buffer. |
michael@0 | 1252 | * @param status A reference to a UErrorCode to receive any errors. |
michael@0 | 1253 | * @return The length of the result string. In the event that |
michael@0 | 1254 | * destCapacity is inadequate, the full length of the |
michael@0 | 1255 | * untruncated output string is returned. |
michael@0 | 1256 | * |
michael@0 | 1257 | * @stable ICU 3.0 |
michael@0 | 1258 | */ |
michael@0 | 1259 | U_STABLE int32_t U_EXPORT2 |
michael@0 | 1260 | uregex_appendTail(URegularExpression *regexp, |
michael@0 | 1261 | UChar **destBuf, |
michael@0 | 1262 | int32_t *destCapacity, |
michael@0 | 1263 | UErrorCode *status); |
michael@0 | 1264 | |
michael@0 | 1265 | /** |
michael@0 | 1266 | * As the final step in a find-and-replace operation, append the remainder |
michael@0 | 1267 | * of the input string, starting at the position following the last match, |
michael@0 | 1268 | * to the destination string. <code>uregex_appendTailUText()</code> is intended |
michael@0 | 1269 | * to be invoked after one or more invocations of the |
michael@0 | 1270 | * <code>uregex_appendReplacementUText()</code> function. |
michael@0 | 1271 | * |
michael@0 | 1272 | * @param regexp The regular expression object. This is needed to |
michael@0 | 1273 | * obtain the input string along with the position |
michael@0 | 1274 | * of the last match within it. |
michael@0 | 1275 | * @param dest A mutable UText that will receive the result. Must not be NULL. |
michael@0 | 1276 | * |
michael@0 | 1277 | * @param status A reference to a UErrorCode to receive any errors. |
michael@0 | 1278 | * |
michael@0 | 1279 | * @return The destination UText. |
michael@0 | 1280 | * |
michael@0 | 1281 | * @stable ICU 4.6 |
michael@0 | 1282 | */ |
michael@0 | 1283 | U_STABLE UText * U_EXPORT2 |
michael@0 | 1284 | uregex_appendTailUText(URegularExpression *regexp, |
michael@0 | 1285 | UText *dest, |
michael@0 | 1286 | UErrorCode *status); |
michael@0 | 1287 | |
michael@0 | 1288 | /** |
michael@0 | 1289 | * Split a string into fields. Somewhat like split() from Perl. |
michael@0 | 1290 | * The pattern matches identify delimiters that separate the input |
michael@0 | 1291 | * into fields. The input data between the matches becomes the |
michael@0 | 1292 | * fields themselves. |
michael@0 | 1293 | * |
michael@0 | 1294 | * Each of the fields is copied from the input string to the destination |
michael@0 | 1295 | * buffer, and NUL terminated. The position of each field within |
michael@0 | 1296 | * the destination buffer is returned in the destFields array. |
michael@0 | 1297 | * |
michael@0 | 1298 | * If the delimiter pattern includes capture groups, the captured text will |
michael@0 | 1299 | * also appear in the destination array of output strings, interspersed |
michael@0 | 1300 | * with the fields. This is similar to Perl, but differs from Java, |
michael@0 | 1301 | * which ignores the presence of capture groups in the pattern. |
michael@0 | 1302 | * |
michael@0 | 1303 | * Trailing empty fields will always be returned, assuming sufficient |
michael@0 | 1304 | * destination capacity. This differs from the default behavior for Java |
michael@0 | 1305 | * and Perl where trailing empty fields are not returned. |
michael@0 | 1306 | * |
michael@0 | 1307 | * The number of strings produced by the split operation is returned. |
michael@0 | 1308 | * This count includes the strings from capture groups in the delimiter pattern. |
michael@0 | 1309 | * This behavior differs from Java, which ignores capture groups. |
michael@0 | 1310 | * |
michael@0 | 1311 | * @param regexp The compiled regular expression. |
michael@0 | 1312 | * @param destBuf A (UChar *) buffer to receive the fields that |
michael@0 | 1313 | * are extracted from the input string. These |
michael@0 | 1314 | * field pointers will refer to positions within the |
michael@0 | 1315 | * destination buffer supplied by the caller. Any |
michael@0 | 1316 | * extra positions within the destFields array will be |
michael@0 | 1317 | * set to NULL. |
michael@0 | 1318 | * @param destCapacity The capacity of the destBuf. |
michael@0 | 1319 | * @param requiredCapacity The actual capacity required of the destBuf. |
michael@0 | 1320 | * If destCapacity is too small, requiredCapacity will return |
michael@0 | 1321 | * the total capacity required to hold all of the output, and |
michael@0 | 1322 | * a U_BUFFER_OVERFLOW_ERROR will be returned. |
michael@0 | 1323 | * @param destFields An array to be filled with the position of each |
michael@0 | 1324 | * of the extracted fields within destBuf. |
michael@0 | 1325 | * @param destFieldsCapacity The number of elements in the destFields array. |
michael@0 | 1326 | * If the number of fields found is less than destFieldsCapacity, |
michael@0 | 1327 | * the extra destFields elements are set to NULL. |
michael@0 | 1328 | * If destFieldsCapacity is too small, the trailing part of the |
michael@0 | 1329 | * input, including any field delimiters, is treated as if it |
michael@0 | 1330 | * were the last field - it is copied to the destBuf, and |
michael@0 | 1331 | * its position in the destBuf is stored in the last element |
michael@0 | 1332 | * of destFields. This behavior mimics that of Perl. It is not |
michael@0 | 1333 | * an error condition, and no error status is returned when all destFields |
michael@0 | 1334 | * positions are used. |
michael@0 | 1335 | * @param status A reference to a UErrorCode to receive any errors. |
michael@0 | 1336 | * @return The number of fields into which the input string was split. |
michael@0 | 1337 | * @stable ICU 3.0 |
michael@0 | 1338 | */ |
michael@0 | 1339 | U_STABLE int32_t U_EXPORT2 |
michael@0 | 1340 | uregex_split( URegularExpression *regexp, |
michael@0 | 1341 | UChar *destBuf, |
michael@0 | 1342 | int32_t destCapacity, |
michael@0 | 1343 | int32_t *requiredCapacity, |
michael@0 | 1344 | UChar *destFields[], |
michael@0 | 1345 | int32_t destFieldsCapacity, |
michael@0 | 1346 | UErrorCode *status); |
michael@0 | 1347 | |
michael@0 | 1348 | /** |
michael@0 | 1349 | * Split a string into fields. Somewhat like split() from Perl. |
michael@0 | 1350 | * The pattern matches identify delimiters that separate the input |
michael@0 | 1351 | * into fields. The input data between the matches becomes the |
michael@0 | 1352 | * fields themselves. |
michael@0 | 1353 | * <p> |
michael@0 | 1354 | * The behavior of this function is not very closely aligned with uregex_split(); |
michael@0 | 1355 | * instead, it is based on (and implemented directly on top of) the C++ split method. |
michael@0 | 1356 | * |
michael@0 | 1357 | * @param regexp The compiled regular expression. |
michael@0 | 1358 | * @param destFields An array of mutable UText structs to receive the results of the split. |
michael@0 | 1359 | * If a field is NULL, a new UText is allocated to contain the results for |
michael@0 | 1360 | * that field. This new UText is not guaranteed to be mutable. |
michael@0 | 1361 | * @param destFieldsCapacity The number of elements in the destination array. |
michael@0 | 1362 | * If the number of fields found is less than destFieldsCapacity, the |
michael@0 | 1363 | * extra strings in the destination array are not altered. |
michael@0 | 1364 | * If the number of destination strings is less than the number |
michael@0 | 1365 | * of fields, the trailing part of the input string, including any |
michael@0 | 1366 | * field delimiters, is placed in the last destination string. |
michael@0 | 1367 | * This behavior mimics that of Perl. It is not an error condition, and no |
michael@0 | 1368 | * error status is returned when all destFields positions are used. |
michael@0 | 1369 | * @param status A reference to a UErrorCode to receive any errors. |
michael@0 | 1370 | * @return The number of fields into which the input string was split. |
michael@0 | 1371 | * |
michael@0 | 1372 | * @stable ICU 4.6 |
michael@0 | 1373 | */ |
michael@0 | 1374 | U_STABLE int32_t U_EXPORT2 |
michael@0 | 1375 | uregex_splitUText(URegularExpression *regexp, |
michael@0 | 1376 | UText *destFields[], |
michael@0 | 1377 | int32_t destFieldsCapacity, |
michael@0 | 1378 | UErrorCode *status); |
michael@0 | 1379 | |
michael@0 | 1380 | /** |
michael@0 | 1381 | * Set a processing time limit for match operations with this URegularExpression. |
michael@0 | 1382 | * |
michael@0 | 1383 | * Some patterns, when matching certain strings, can run in exponential time. |
michael@0 | 1384 | * For practical purposes, the match operation may appear to be in an |
michael@0 | 1385 | * infinite loop. |
michael@0 | 1386 | * When a limit is set a match operation will fail with an error if the |
michael@0 | 1387 | * limit is exceeded. |
michael@0 | 1388 | * <p> |
michael@0 | 1389 | * The units of the limit are steps of the match engine. |
michael@0 | 1390 | * Correspondence with actual processor time will depend on the speed |
michael@0 | 1391 | * of the processor and the details of the specific pattern, but will |
michael@0 | 1392 | * typically be on the order of milliseconds. |
michael@0 | 1393 | * <p> |
michael@0 | 1394 | * By default, the matching time is not limited. |
michael@0 | 1395 | * <p> |
michael@0 | 1396 | * |
michael@0 | 1397 | * @param regexp The compiled regular expression. |
michael@0 | 1398 | * @param limit The limit value, or 0 for no limit. |
michael@0 | 1399 | * @param status A reference to a UErrorCode to receive any errors. |
michael@0 | 1400 | * @stable ICU 4.0 |
michael@0 | 1401 | */ |
michael@0 | 1402 | U_STABLE void U_EXPORT2 |
michael@0 | 1403 | uregex_setTimeLimit(URegularExpression *regexp, |
michael@0 | 1404 | int32_t limit, |
michael@0 | 1405 | UErrorCode *status); |
michael@0 | 1406 | |
michael@0 | 1407 | /** |
michael@0 | 1408 | * Get the time limit for matches with this URegularExpression. |
michael@0 | 1409 | * A return value of zero indicates that there is no limit. |
michael@0 | 1410 | * |
michael@0 | 1411 | * @param regexp The compiled regular expression. |
michael@0 | 1412 | * @param status A reference to a UErrorCode to receive any errors. |
michael@0 | 1413 | * @return the maximum allowed time for a match, in units of processing steps. |
michael@0 | 1414 | * @stable ICU 4.0 |
michael@0 | 1415 | */ |
michael@0 | 1416 | U_STABLE int32_t U_EXPORT2 |
michael@0 | 1417 | uregex_getTimeLimit(const URegularExpression *regexp, |
michael@0 | 1418 | UErrorCode *status); |
michael@0 | 1419 | |
michael@0 | 1420 | /** |
michael@0 | 1421 | * Set the amount of heap storage available for use by the match backtracking stack. |
michael@0 | 1422 | * <p> |
michael@0 | 1423 | * ICU uses a backtracking regular expression engine, with the backtrack stack |
michael@0 | 1424 | * maintained on the heap. This function sets the limit to the amount of memory |
michael@0 | 1425 | * that can be used for this purpose. A backtracking stack overflow will |
michael@0 | 1426 | * result in an error from the match operation that caused it. |
michael@0 | 1427 | * <p> |
michael@0 | 1428 | * A limit is desirable because a malicious or poorly designed pattern can use |
michael@0 | 1429 | * excessive memory, potentially crashing the process. A limit is enabled |
michael@0 | 1430 | * by default. |
michael@0 | 1431 | * <p> |
michael@0 | 1432 | * @param regexp The compiled regular expression. |
michael@0 | 1433 | * @param limit The maximum size, in bytes, of the matching backtrack stack. |
michael@0 | 1434 | * A value of zero means no limit. |
michael@0 | 1435 | * The limit must be greater than or equal to zero. |
michael@0 | 1436 | * @param status A reference to a UErrorCode to receive any errors. |
michael@0 | 1437 | * |
michael@0 | 1438 | * @stable ICU 4.0 |
michael@0 | 1439 | */ |
michael@0 | 1440 | U_STABLE void U_EXPORT2 |
michael@0 | 1441 | uregex_setStackLimit(URegularExpression *regexp, |
michael@0 | 1442 | int32_t limit, |
michael@0 | 1443 | UErrorCode *status); |
michael@0 | 1444 | |
michael@0 | 1445 | /** |
michael@0 | 1446 | * Get the size of the heap storage available for use by the backtracking stack. |
michael@0 | 1447 | * @param regexp The compiled regular expression. |
michael@0 | 1448 | * @param status A reference to a UErrorCode to receive any errors. |
michael@0 | 1449 | * @return the maximum backtracking stack size, in bytes, or zero if the stack size is unlimited. |
michael@0 | 1450 | * @stable ICU 4.0 |
michael@0 | 1451 | */ |
michael@0 | 1452 | U_STABLE int32_t U_EXPORT2 |
michael@0 | 1453 | uregex_getStackLimit(const URegularExpression *regexp, |
michael@0 | 1454 | UErrorCode *status); |
michael@0 | 1455 | |
michael@0 | 1456 | |
michael@0 | 1457 | /** |
michael@0 | 1458 | * Function pointer for a regular expression matching callback function. |
michael@0 | 1459 | * When set, a callback function will be called periodically during matching |
michael@0 | 1460 | * operations. If the callback function returns FALSE, the matching |
michael@0 | 1461 | * operation will be terminated early. |
michael@0 | 1462 | * |
michael@0 | 1463 | * Note: the callback function must not call other functions on this |
michael@0 | 1464 | * URegularExpression. |
michael@0 | 1465 | * |
michael@0 | 1466 | * @param context context pointer. The callback function will be invoked |
michael@0 | 1467 | * with the context specified at the time that |
michael@0 | 1468 | * uregex_setMatchCallback() is called. |
michael@0 | 1469 | * @param steps the accumulated processing time, in match steps, |
michael@0 | 1470 | * for this matching operation. |
michael@0 | 1471 | * @return TRUE to continue the matching operation. |
michael@0 | 1472 | * FALSE to terminate the matching operation. |
michael@0 | 1473 | * @stable ICU 4.0 |
michael@0 | 1474 | */ |
michael@0 | 1475 | U_CDECL_BEGIN |
michael@0 | 1476 | typedef UBool U_CALLCONV URegexMatchCallback ( |
michael@0 | 1477 | const void *context, |
michael@0 | 1478 | int32_t steps); |
michael@0 | 1479 | U_CDECL_END |
michael@0 | 1480 | |
michael@0 | 1481 | /** |
michael@0 | 1482 | * Set a callback function for this URegularExpression. |
michael@0 | 1483 | * During matching operations the function will be called periodically, |
michael@0 | 1484 | * giving the application the opportunity to terminate a long-running |
michael@0 | 1485 | * match. |
michael@0 | 1486 | * |
michael@0 | 1487 | * @param regexp The compiled regular expression. |
michael@0 | 1488 | * @param callback A pointer to the user-supplied callback function. |
michael@0 | 1489 | * @param context User context pointer. The value supplied at the |
michael@0 | 1490 | * time the callback function is set will be saved |
michael@0 | 1491 | * and passed to the callback each time that it is called. |
michael@0 | 1492 | * @param status A reference to a UErrorCode to receive any errors. |
michael@0 | 1493 | * @stable ICU 4.0 |
michael@0 | 1494 | */ |
michael@0 | 1495 | U_STABLE void U_EXPORT2 |
michael@0 | 1496 | uregex_setMatchCallback(URegularExpression *regexp, |
michael@0 | 1497 | URegexMatchCallback *callback, |
michael@0 | 1498 | const void *context, |
michael@0 | 1499 | UErrorCode *status); |
michael@0 | 1500 | |
michael@0 | 1501 | |
michael@0 | 1502 | /** |
michael@0 | 1503 | * Get the callback function for this URegularExpression. |
michael@0 | 1504 | * |
michael@0 | 1505 | * @param regexp The compiled regular expression. |
michael@0 | 1506 | * @param callback Out parameter, receives a pointer to the user-supplied |
michael@0 | 1507 | * callback function. |
michael@0 | 1508 | * @param context Out parameter, receives the user context pointer that |
michael@0 | 1509 | * was set when uregex_setMatchCallback() was called. |
michael@0 | 1510 | * @param status A reference to a UErrorCode to receive any errors. |
michael@0 | 1511 | * @stable ICU 4.0 |
michael@0 | 1512 | */ |
michael@0 | 1513 | U_STABLE void U_EXPORT2 |
michael@0 | 1514 | uregex_getMatchCallback(const URegularExpression *regexp, |
michael@0 | 1515 | URegexMatchCallback **callback, |
michael@0 | 1516 | const void **context, |
michael@0 | 1517 | UErrorCode *status); |
michael@0 | 1518 | |
michael@0 | 1519 | /** |
michael@0 | 1520 | * Function pointer for a regular expression find callback function. |
michael@0 | 1521 | * |
michael@0 | 1522 | * When set, a callback function will be called during a find operation |
michael@0 | 1523 | * and for operations that depend on find, such as findNext, split and some replace |
michael@0 | 1524 | * operations like replaceFirst. |
michael@0 | 1525 | * The callback will usually be called after each attempt at a match, but this is not a |
michael@0 | 1526 | * guarantee that the callback will be invoked at each character. For finds where the |
michael@0 | 1527 | * match engine is invoked at each character, this may be close to true, but less likely |
michael@0 | 1528 | * for more optimized loops where the pattern is known to only start, and the match |
michael@0 | 1529 | * engine invoked, at certain characters. |
michael@0 | 1530 | * When invoked, this callback will specify the index at which a match operation is about |
michael@0 | 1531 | * to be attempted, giving the application the opportunity to terminate a long-running |
michael@0 | 1532 | * find operation. |
michael@0 | 1533 | * |
michael@0 | 1534 | * If the callback function returns FALSE, the find operation will be terminated early. |
michael@0 | 1535 | * |
michael@0 | 1536 | * Note: the callback function must not call other functions on this |
michael@0 | 1537 | * URegularExpression. |
michael@0 | 1538 | * |
michael@0 | 1539 | * @param context context pointer. The callback function will be invoked |
michael@0 | 1540 | * with the context specified at the time that |
michael@0 | 1541 | * uregex_setFindProgressCallback() is called. |
michael@0 | 1542 | * @param matchIndex the next index at which a match will be attempted for this |
michael@0 | 1543 | * find operation. If this callback interrupts the search, this is the |
michael@0 | 1544 | * index at which a find/findNext operation may be re-initiated. |
michael@0 | 1545 | * @return TRUE to continue the find operation. |
michael@0 | 1546 | * FALSE to terminate the find operation. |
michael@0 | 1547 | * @stable ICU 4.6 |
michael@0 | 1548 | */ |
michael@0 | 1549 | U_CDECL_BEGIN |
michael@0 | 1550 | typedef UBool U_CALLCONV URegexFindProgressCallback ( |
michael@0 | 1551 | const void *context, |
michael@0 | 1552 | int64_t matchIndex); |
michael@0 | 1553 | U_CDECL_END |
michael@0 | 1554 | |
michael@0 | 1555 | |
michael@0 | 1556 | /** |
michael@0 | 1557 | * Set the find progress callback function for this URegularExpression. |
michael@0 | 1558 | * |
michael@0 | 1559 | * @param regexp The compiled regular expression. |
michael@0 | 1560 | * @param callback A pointer to the user-supplied callback function. |
michael@0 | 1561 | * @param context User context pointer. The value supplied at the |
michael@0 | 1562 | * time the callback function is set will be saved |
michael@0 | 1563 | * and passed to the callback each time that it is called. |
michael@0 | 1564 | * @param status A reference to a UErrorCode to receive any errors. |
michael@0 | 1565 | * @stable ICU 4.6 |
michael@0 | 1566 | */ |
michael@0 | 1567 | U_STABLE void U_EXPORT2 |
michael@0 | 1568 | uregex_setFindProgressCallback(URegularExpression *regexp, |
michael@0 | 1569 | URegexFindProgressCallback *callback, |
michael@0 | 1570 | const void *context, |
michael@0 | 1571 | UErrorCode *status); |
michael@0 | 1572 | |
michael@0 | 1573 | /** |
michael@0 | 1574 | * Get the find progress callback function for this URegularExpression. |
michael@0 | 1575 | * |
michael@0 | 1576 | * @param regexp The compiled regular expression. |
michael@0 | 1577 | * @param callback Out parameter, receives a pointer to the user-supplied |
michael@0 | 1578 | * callback function. |
michael@0 | 1579 | * @param context Out parameter, receives the user context pointer that |
michael@0 | 1580 | * was set when uregex_setFindProgressCallback() was called. |
michael@0 | 1581 | * @param status A reference to a UErrorCode to receive any errors. |
michael@0 | 1582 | * @stable ICU 4.6 |
michael@0 | 1583 | */ |
michael@0 | 1584 | U_STABLE void U_EXPORT2 |
michael@0 | 1585 | uregex_getFindProgressCallback(const URegularExpression *regexp, |
michael@0 | 1586 | URegexFindProgressCallback **callback, |
michael@0 | 1587 | const void **context, |
michael@0 | 1588 | UErrorCode *status); |
michael@0 | 1589 | |
michael@0 | 1590 | #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */ |
michael@0 | 1591 | #endif /* UREGEX_H */ |