intl/icu/source/i18n/unicode/uregex.h

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

     1 /*
     2 **********************************************************************
     3 *   Copyright (C) 2004-2013, International Business Machines
     4 *   Corporation and others.  All Rights Reserved.
     5 **********************************************************************
     6 *   file name:  uregex.h
     7 *   encoding:   US-ASCII
     8 *   indentation:4
     9 *
    10 *   created on: 2004mar09
    11 *   created by: Andy Heninger
    12 *
    13 *   ICU Regular Expressions, API for C
    14 */
    16 /**
    17  * \file
    18  * \brief C API: Regular Expressions
    19  *
    20  * <p>This is a C wrapper around the C++ RegexPattern and RegexMatcher classes.</p>
    21  */
    23 #ifndef UREGEX_H
    24 #define UREGEX_H
    26 #include "unicode/utext.h"
    27 #include "unicode/utypes.h"
    29 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
    31 #include "unicode/localpointer.h"
    32 #include "unicode/parseerr.h"
    34 struct URegularExpression;
    35 /**
    36   * Structure representing a compiled regular expression, plus the results
    37   *    of a match operation.  Only the struct tag is declared in this header.
    38   * @stable ICU 3.0
    39   */
    40 typedef struct URegularExpression URegularExpression;
    43 /**
    44  * Constants for Regular Expression Match Modes.  Each value is a distinct bit; flags are combined with bitwise OR.
    45  * @stable ICU 2.4
    46  */
    47 typedef enum URegexpFlag{
    49 #ifndef U_HIDE_DRAFT_API 
    50     /** Forces normalization of pattern and strings.
    51     Not implemented yet, just a placeholder, hence draft.
    52     @draft ICU 2.4 */
    53     UREGEX_CANON_EQ         = 128,
    54 #endif /* U_HIDE_DRAFT_API */
    55     /**  Enable case-insensitive matching.  @stable ICU 2.4 */
    56     UREGEX_CASE_INSENSITIVE = 2,
    58     /**  Allow white space and comments within patterns.  @stable ICU 2.4 */
    59     UREGEX_COMMENTS         = 4,
    61     /**  If set, '.' matches line terminators; otherwise '.' matching stops at line end.
    62       *  @stable ICU 2.4 */
    63     UREGEX_DOTALL           = 32,
    65     /**  If set, treat the entire pattern as a literal string.
    66       *  Metacharacters or escape sequences in the input sequence will be given
    67       *  no special meaning.
    68       *
    69       *  The flag UREGEX_CASE_INSENSITIVE retains its impact
    70       *  on matching when used in conjunction with this flag.
    71       *  The other flags become superfluous.
    72       *
    73       * @stable ICU 4.0
    74       */
    75     UREGEX_LITERAL = 16,
    77     /**   Control behavior of "$" and "^".
    78       *    If set, recognize line terminators within string;
    79       *    otherwise, match only at start and end of input string.
    80       *   @stable ICU 2.4 */
    81     UREGEX_MULTILINE        = 8,
    83     /**   Unix-only line endings.
    84       *   When this mode is enabled, only \\u000a is recognized as a line ending
    85       *    in the behavior of ., ^, and $.
    86       *   @stable ICU 4.0
    87       */
    88     UREGEX_UNIX_LINES = 1,
    90     /**  Unicode word boundaries.
    91       *     If set, \b uses the Unicode TR 29 definition of word boundaries.
    92       *     Warning: Unicode word boundaries are quite different from
    93       *     traditional regular expression word boundaries.  See
    94       *     http://unicode.org/reports/tr29/#Word_Boundaries
    95       *     @stable ICU 2.8
    96       */
    97     UREGEX_UWORD            = 256,
    99      /**  Error on unrecognized backslash escapes.
   100        *     If set, fail with an error on patterns that contain
   101        *     backslash-escaped ASCII letters without a known special
   102        *     meaning.  If this flag is not set, these
   103        *     escaped letters represent themselves.
   104        *     @stable ICU 4.0
   105        */
   106      UREGEX_ERROR_ON_UNKNOWN_ESCAPES = 512
   108 }  URegexpFlag;
   110 /**
   111   *  Open (compile) an ICU regular expression.  Compiles the regular expression in
   112   *  string form into an internal representation using the specified match mode flags.
   113   *  The resulting regular expression handle can then be used to perform various
   114   *   matching operations.
   115   *
   116   *
   117   * @param pattern        The Regular Expression pattern to be compiled.
   118   * @param patternLength  The length of the pattern, or -1 if the pattern is
   119   *                       NUL-terminated.
   120   * @param flags          Flags that alter the default matching behavior for
   121   *                       the regular expression, UREGEX_CASE_INSENSITIVE, for
   122   *                       example.  For default behavior, set this parameter to zero.
   123   *                       See <code>enum URegexpFlag</code>.  All desired flags
   124   *                       are bitwise-ORed together.
   125   * @param pe             Receives the position (line and column numbers) of any syntax
   126   *                       error within the source regular expression string.  If this
   127   *                       information is not wanted, pass NULL for this parameter.
   128   * @param status         Receives errors detected by this function.
   129   * @return               The regular expression handle for the compiled pattern.
   130   * @stable ICU 3.0
   131   */
   132 U_STABLE URegularExpression * U_EXPORT2
   133 uregex_open( const  UChar          *pattern,
   134                     int32_t         patternLength,
   135                     uint32_t        flags,
   136                     UParseError    *pe,
   137                     UErrorCode     *status);
   139 /**
   140   *  Open (compile) an ICU regular expression.  Compiles the regular expression in
   141   *  string form into an internal representation using the specified match mode flags.
   142   *  The resulting regular expression handle can then be used to perform various
   143   *   matching operations.
   144   *  <p>
   145   *  The contents of the pattern UText will be extracted and saved. Ownership of the
   146   *   UText struct itself remains with the caller. This is to match the behavior of
   147   *   uregex_open().
   148   *
   149   * @param pattern        The Regular Expression pattern to be compiled.
   150   * @param flags          Flags that alter the default matching behavior for
   151   *                       the regular expression, UREGEX_CASE_INSENSITIVE, for
   152   *                       example.  For default behavior, set this parameter to zero.
   153   *                       See <code>enum URegexpFlag</code>.  All desired flags
   154   *                       are bitwise-ORed together.
   155   * @param pe             Receives the position (line and column numbers) of any syntax
   156   *                       error within the source regular expression string.  If this
   157   *                       information is not wanted, pass NULL for this parameter.
   158   * @param status         Receives errors detected by this function.
   159   * @return               The regular expression handle for the compiled pattern.
   160   * @stable ICU 4.6
   161   */
   162 U_STABLE URegularExpression *  U_EXPORT2
   163 uregex_openUText(UText          *pattern,
   164                  uint32_t        flags,
   165                  UParseError    *pe,
   166                  UErrorCode     *status);
   168 /**
   169   *  Open (compile) an ICU regular expression.  The resulting regular expression
   170   *   handle can then be used to perform various matching operations.
   171   *  <p>
   172   *   This function is the same as uregex_open, except that the pattern
   173   *   is supplied as an 8 bit char * string in the default code page.
   174   *
   175   * @param pattern        The Regular Expression pattern to be compiled,
   176   *                       NUL-terminated.
   177   * @param flags          Flags that alter the default matching behavior for
   178   *                       the regular expression, UREGEX_CASE_INSENSITIVE, for
   179   *                       example.  For default behavior, set this parameter to zero.
   180   *                       See <code>enum URegexpFlag</code>.  All desired flags
   181   *                       are bitwise-ORed together.
   182   * @param pe             Receives the position (line and column numbers) of any syntax
   183   *                       error within the source regular expression string.  If this
   184   *                       information is not wanted, pass NULL for this parameter.
   185   * @param status         Receives errors detected by this function.
   186   * @return               The URegularExpression object representing the compiled
   187   *                       pattern.
   188   *  Only declared when UCONFIG_NO_CONVERSION is not set.
   189   * @stable ICU 3.0
   190   */
   191 #if !UCONFIG_NO_CONVERSION
   192 U_STABLE URegularExpression * U_EXPORT2
   193 uregex_openC( const char           *pattern,
   194                     uint32_t        flags,
   195                     UParseError    *pe,
   196                     UErrorCode     *status);
   197 #endif
   201 /**
   202   *  Close the regular expression, recovering all resources (memory) it
   203   *   was holding.
   204   *  C++ callers may use LocalURegularExpressionPointer, which closes via this function.
   205   * @param regexp   The regular expression to be closed.
   206   * @stable ICU 3.0
   207   */
   208 U_STABLE void U_EXPORT2 
   209 uregex_close(URegularExpression *regexp);
   211 #if U_SHOW_CPLUSPLUS_API
   213 U_NAMESPACE_BEGIN
   215 /**
   216  * \class LocalURegularExpressionPointer
   217  * "Smart pointer" class, closes a URegularExpression via uregex_close().
   218  * For most methods see the LocalPointerBase base class.
   219  * Only declared when U_SHOW_CPLUSPLUS_API is enabled.
   220  * @see LocalPointerBase
   221  * @see LocalPointer
   222  * @stable ICU 4.4
   223  */
   224 U_DEFINE_LOCAL_OPEN_POINTER(LocalURegularExpressionPointer, URegularExpression, uregex_close);
   226 U_NAMESPACE_END
   228 #endif
   230 /**
   231  * Make a copy of a compiled regular expression.  Cloning a regular
   232  * expression is faster than opening a second instance from the source
   233  * form of the expression, and requires less memory.
   234  * <p>
   235  * Note that the current input string and the position of any matched text
   236  *  within it are not cloned; only the pattern itself and the
   237  *  match mode flags are copied.
   238  * <p>
   239  * Cloning can be particularly useful to threaded applications that perform
   240  * multiple match operations in parallel.  Each concurrent RE
   241  * operation requires its own instance of a URegularExpression.
   242  *
   243  * @param regexp   The compiled regular expression to be cloned.
   244  * @param status   Receives indication of any errors encountered.
   245  * @return         The cloned copy of the compiled regular expression.
   246  * @stable ICU 3.0
   247  */
   248 U_STABLE URegularExpression * U_EXPORT2 
   249 uregex_clone(const URegularExpression *regexp, UErrorCode *status);
   251 /**
   252  *  Returns a pointer to the source form of the pattern for this regular expression.
   253  *  This function will work even if the pattern was originally specified as a UText.
   254  *
   255  * @param regexp     The compiled regular expression.
   256  * @param patLength  This output parameter will be set to the length of the
   257  *                   pattern string.  A NULL pointer may be used here if the
   258  *                   pattern length is not needed, as would be the case if
   259  *                   the pattern is known in advance to be a NUL-terminated
   260  *                   string.
   261  * @param status     Receives errors detected by this function.
   262  * @return           A pointer to the pattern string.  The storage for the string is
   263  *                   owned by the regular expression object, and must not be
   264  *                   altered or deleted by the application.  The returned string
   265  *                   will remain valid until the regular expression is closed.
   266  * @stable ICU 3.0
   267  */
   268 U_STABLE const UChar * U_EXPORT2 
   269 uregex_pattern(const URegularExpression *regexp,
   270                      int32_t            *patLength,
   271                      UErrorCode         *status);
   273 /**
   274  *  Returns the source text of the pattern for this regular expression.
   275  *  This function will work even if the pattern was originally specified as a UChar string.
   276  *
   277  * @param regexp     The compiled regular expression.
   278  * @param status     Receives errors detected by this function.
   279  * @return           The pattern text.  The storage for the text is owned by the regular
   280  *                   expression object, and must not be altered or deleted by the application.
   281  *
   282  * @stable ICU 4.6
   283  */
   284 U_STABLE UText * U_EXPORT2 
   285 uregex_patternUText(const URegularExpression *regexp,
   286                           UErrorCode         *status);
   288 /**
   289   * Get the match mode flags that were specified when compiling this regular expression.
   290   * @param regexp   The compiled regular expression.
   291   * @param status   Receives errors detected by this function.
   292   * @return         The match mode flags, as bitwise-ORed URegexpFlag values.
   293   * @see URegexpFlag
   294   * @stable ICU 3.0
   295   */
   296 U_STABLE int32_t U_EXPORT2 
   297 uregex_flags(const  URegularExpression   *regexp,
   298                     UErrorCode           *status);
   301 /**
   302   *  Set the subject text string upon which the regular expression will look for matches.
   303   *  This function may be called any number of times, allowing the regular
   304   *  expression pattern to be applied to different strings.
   305   *  <p>
   306   *  Regular expression matching operations work directly on the application's
   307   *  string data.  No copy is made.  The subject string data must not be
   308   *  altered after calling this function until after all regular expression
   309   *  operations involving this string data are completed.
   310   *  <p>
   311   *  Zero-length strings are permitted.  In this case, no subsequent match
   312   *  operation will dereference the text string pointer.
   313   *
   314   * @param regexp     The compiled regular expression.
   315   * @param text       The subject text string.
   316   * @param textLength The length of the subject text, or -1 if the string
   317   *                   is NUL-terminated.
   318   * @param status     Receives errors detected by this function.
   319   * @stable ICU 3.0
   320   */
   321 U_STABLE void U_EXPORT2 
   322 uregex_setText(URegularExpression *regexp,
   323                const UChar        *text,
   324                int32_t             textLength,
   325                UErrorCode         *status);
   328 /**
   329   *  Set the subject text string upon which the regular expression will look for matches.
   330   *  This function may be called any number of times, allowing the regular
   331   *  expression pattern to be applied to different strings.
   332   *  <p>
   333   *  Regular expression matching operations work directly on the application's
   334   *  string data; only a shallow clone is made.  The subject string data must not be
   335   *  altered after calling this function until after all regular expression
   336   *  operations involving this string data are completed.
   337   *
   338   * @param regexp     The compiled regular expression.
   339   * @param text       The subject text, supplied as a UText.
   340   * @param status     Receives errors detected by this function.
   341   *
   342   * @stable ICU 4.6
   343   */
   344 U_STABLE void U_EXPORT2 
   345 uregex_setUText(URegularExpression *regexp,
   346                 UText              *text,
   347                 UErrorCode         *status);
   349 /**
   350   *  Get the subject text that is currently associated with this
   351   *   regular expression object.  If the input was supplied using uregex_setText(),
   352   *   that pointer will be returned.  Otherwise, the characters in the input will
   353   *   be extracted to a buffer and returned.  In either case, ownership remains
   354   *   with the regular expression object.
   355   *
   356   *  This function will work even if the input was originally specified as a UText.
   357   *
   358   * @param regexp      The compiled regular expression.
   359   * @param textLength  The length of the string is returned in this output parameter.
   360   *                    A NULL pointer may be used here if the
   361   *                    text length is not needed, as would be the case if
   362   *                    the text is known in advance to be a NUL-terminated
   363   *                    string.
   364   * @param status      Receives errors detected by this function.
   365   * @return            Pointer to the subject text string currently associated with
   366   *                    this regular expression.
   367   * @stable ICU 3.0
   368   */
   369 U_STABLE const UChar * U_EXPORT2 
   370 uregex_getText(URegularExpression *regexp,
   371                int32_t            *textLength,
   372                UErrorCode         *status);
   374 /**
   375   *  Get the subject text that is currently associated with this
   376   *   regular expression object.
   377   *
   378   *  This function will work even if the input was originally specified as a UChar string.
   379   *
   380   * @param regexp      The compiled regular expression.
   381   * @param dest        A mutable UText in which to store the current input, or NULL.
   382   *                    If NULL, a new UText will be created as an immutable shallow clone
   383   *                    of the actual input string.
   384   * @param status      Receives errors detected by this function.
   385   * @return            The subject text currently associated with this regular expression.
   386   *                    If a pre-allocated UText was provided, it will always be used and returned.
   387   *
   388   * @stable ICU 4.6
   389   */
   390 U_STABLE UText * U_EXPORT2 
   391 uregex_getUText(URegularExpression *regexp,
   392                 UText              *dest,
   393                 UErrorCode         *status);
   395 /**
   396   *  Set the subject text string upon which the regular expression is looking for matches
   397   *  without changing any other aspect of the matching state.
   398   *  The new and previous text strings must have the same content.
   399   *
   400   *  This function is intended for use in environments where ICU is operating on
   401   *  strings that may move around in memory.  It provides a mechanism for notifying
   402   *  ICU that the string has been relocated, and providing a new UText to access the
   403   *  string in its new position.
   404   *
   405   *  Note that the regular expression implementation never copies the underlying text
   406   *  of a string being matched, but always operates directly on the original text
   407   *  provided by the user. Refreshing simply drops the references to the old text
   408   *  and replaces them with references to the new text.
   409   *
   410   *  Caution:  this function is normally used only by very specialized
   411   *            system-level code.  One example use case is with garbage collection
   412   *            that moves the text in memory.
   413   *
   414   * @param regexp     The compiled regular expression.
   415   * @param text       The new (moved) text string.
   416   * @param status     Receives errors detected by this function.
   417   *
   418   * @stable ICU 4.8
   419   */
   420 U_STABLE void U_EXPORT2 
   421 uregex_refreshUText(URegularExpression *regexp,
   422                     UText              *text,
   423                     UErrorCode         *status);
   425 /**
   426   *   Attempts to match the input string against the pattern.
   427   *   To succeed, the match must extend to the end of the string,
   428   *   or cover the complete match region.
   429   *
   430   *   If startIndex >= zero the match operation starts at the specified
   431   *   index and must extend to the end of the input string.  Any region
   432   *   that has been specified is reset.
   433   *
   434   *   If startIndex == -1 the match must cover the input region, or the entire
   435   *   input string if no region has been set.  This directly corresponds to
   436   *   Matcher.matches() in Java.
   437   *
   438   *    @param  regexp      The compiled regular expression.
   439   *    @param  startIndex  The input string (native) index at which to begin matching, or -1
   440   *                        to match the input Region.
   441   *    @param  status      Receives errors detected by this function.
   442   *    @return             TRUE if there is a match.
   443   *    @stable ICU 3.0
   444   */
   445 U_STABLE UBool U_EXPORT2 
   446 uregex_matches(URegularExpression *regexp,
   447                 int32_t            startIndex,
   448                 UErrorCode        *status);
   450 /**
   451   *   64bit version of uregex_matches.
   452   *   Attempts to match the input string against the pattern.
   453   *   To succeed, the match must extend to the end of the string,
   454   *   or cover the complete match region.
   455   *
   456   *   If startIndex >= zero the match operation starts at the specified
   457   *   index and must extend to the end of the input string.  Any region
   458   *   that has been specified is reset.
   459   *
   460   *   If startIndex == -1 the match must cover the input region, or the entire
   461   *   input string if no region has been set.  This directly corresponds to
   462   *   Matcher.matches() in Java.
   463   *
   464   *    @param  regexp      The compiled regular expression.
   465   *    @param  startIndex  The input string (native) index at which to begin matching, or -1
   466   *                        to match the input Region.
   467   *    @param  status      Receives errors detected by this function.
   468   *    @return             TRUE if there is a match.
   469   *    @stable ICU 4.6
   470   */
   471 U_STABLE UBool U_EXPORT2 
   472 uregex_matches64(URegularExpression *regexp,
   473                  int64_t            startIndex,
   474                  UErrorCode        *status);
   476 /**
   477   *   Attempts to match the input string, starting from the specified index, against the pattern.
   478   *   The match may be of any length, and is not required to extend to the end
   479   *   of the input string.  Contrast with uregex_matches().
   480   *
   481   *   <p>If startIndex is >= 0 any input region that was set for this
   482   *   URegularExpression is reset before the operation begins.
   483   *
   484   *   <p>If the specified starting index == -1 the match begins at the start of the input
   485   *   region, or at the start of the full string if no region has been specified.
   486   *   This corresponds directly with Matcher.lookingAt() in Java.
   487   *
   488   *   <p>If the match succeeds then more information can be obtained via the
   489   *    <code>uregex_start()</code>, <code>uregex_end()</code>,
   490   *    and <code>uregex_group()</code> functions.</p>
   491   *
   492   *    @param   regexp      The compiled regular expression.
   493   *    @param   startIndex  The input string (native) index at which to begin matching, or
   494   *                         -1 to match the Input Region.
   495   *    @param   status      A reference to a UErrorCode to receive any errors.
   496   *    @return  TRUE if there is a match.
   497   *    @stable ICU 3.0
   498   */
   499 U_STABLE UBool U_EXPORT2 
   500 uregex_lookingAt(URegularExpression *regexp,
   501                  int32_t             startIndex,
   502                  UErrorCode         *status);
   504 /**
   505   *   64bit version of uregex_lookingAt.
   506   *   Attempts to match the input string, starting from the specified index, against the pattern.
   507   *   The match may be of any length, and is not required to extend to the end
   508   *   of the input string.  Contrast with uregex_matches().
   509   *
   510   *   <p>If startIndex is >= 0 any input region that was set for this
   511   *   URegularExpression is reset before the operation begins.
   512   *
   513   *   <p>If the specified starting index == -1 the match begins at the start of the input
   514   *   region, or at the start of the full string if no region has been specified.
   515   *   This corresponds directly with Matcher.lookingAt() in Java.
   516   *
   517   *   <p>If the match succeeds then more information can be obtained via the
   518   *    <code>uregex_start()</code>, <code>uregex_end()</code>,
   519   *    and <code>uregex_group()</code> functions.</p>
   520   *
   521   *    @param   regexp      The compiled regular expression.
   522   *    @param   startIndex  The input string (native) index at which to begin matching, or
   523   *                         -1 to match the Input Region.
   524   *    @param   status      A reference to a UErrorCode to receive any errors.
   525   *    @return  TRUE if there is a match.
   526   *    @stable ICU 4.6
   527   */
   528 U_STABLE UBool U_EXPORT2 
   529 uregex_lookingAt64(URegularExpression *regexp,
   530                    int64_t             startIndex,
   531                    UErrorCode         *status);
   533 /**
   534   *   Find the first substring of the input string that matches the pattern.
   535   *   If startIndex is >= zero the search for a match begins at the specified index,
   536   *          and any match region is reset.  This corresponds directly with
   537   *          Matcher.find(startIndex) in Java.
   538   *
   539   *   If startIndex == -1 the search begins at the start of the input region,
   540   *           or at the start of the full string if no region has been specified.
   541   *
   542   *   If a match is found, <code>uregex_start(), uregex_end()</code>, and
   543   *   <code>uregex_group()</code> will provide more information regarding the match.
   544   *
   545   *   @param   regexp      The compiled regular expression.
   546   *   @param   startIndex  The position (native) in the input string to begin the search, or
   547   *                        -1 to search within the Input Region.
   548   *   @param   status      A reference to a UErrorCode to receive any errors.
   549   *   @return              TRUE if a match is found.
   550   *   @stable ICU 3.0
   551   */
   552 U_STABLE UBool U_EXPORT2 
   553 uregex_find(URegularExpression *regexp,
   554             int32_t             startIndex, 
   555             UErrorCode         *status);
   557 /**
   558   *   64bit version of uregex_find.
   559   *   Find the first substring of the input string that matches the pattern.
   560   *   If startIndex is >= zero the search for a match begins at the specified index,
   561   *          and any match region is reset.  This corresponds directly with
   562   *          Matcher.find(startIndex) in Java.
   563   *
   564   *   If startIndex == -1 the search begins at the start of the input region,
   565   *           or at the start of the full string if no region has been specified.
   566   *
   567   *   If a match is found, <code>uregex_start(), uregex_end()</code>, and
   568   *   <code>uregex_group()</code> will provide more information regarding the match.
   569   *
   570   *   @param   regexp      The compiled regular expression.
   571   *   @param   startIndex  The position (native) in the input string to begin the search, or
   572   *                        -1 to search within the Input Region.
   573   *   @param   status      A reference to a UErrorCode to receive any errors.
   574   *   @return              TRUE if a match is found.
   575   *   @stable ICU 4.6
   576   */
   577 U_STABLE UBool U_EXPORT2 
   578 uregex_find64(URegularExpression *regexp,
   579               int64_t             startIndex, 
   580               UErrorCode         *status);
   582 /**
   583   *  Find the next pattern match in the input string.  Begin searching
   584   *  the input at the location following the end of the previous match,
   585   *  or at the start of the string (or region) if there is no
   586   *  previous match.  If a match is found, <code>uregex_start(), uregex_end()</code>, and
   587   *  <code>uregex_group()</code> will provide more information regarding the match.
   588   *
   589   *  @param   regexp      The compiled regular expression.
   590   *  @param   status      A reference to a UErrorCode to receive any errors.
   591   *  @return              TRUE if a match is found.
   592   *  @see uregex_reset
   593   *  @stable ICU 3.0
   594   */
   595 U_STABLE UBool U_EXPORT2 
   596 uregex_findNext(URegularExpression *regexp,
   597                 UErrorCode         *status);
   599 /**
   600   *   Get the number of capturing groups in this regular expression's pattern.
   601   *   @param   regexp      The compiled regular expression.
   602   *   @param   status      A reference to a UErrorCode to receive any errors.
   603   *   @return              The number of capture groups.
   604   *   @stable ICU 3.0
   605   */
   606 U_STABLE int32_t U_EXPORT2 
   607 uregex_groupCount(URegularExpression *regexp,
   608                   UErrorCode         *status);
   610 /** Extract the string for the specified matching expression or subexpression.
   611   * Group #0 is the complete string of matched text.
   612   * Group #1 is the text matched by the first set of capturing parentheses.
   613   *
   614   *   @param   regexp       The compiled regular expression.
   615   *   @param   groupNum     The capture group to extract.  Group 0 is the complete
   616   *                         match.  The value of this parameter must be
   617   *                         less than or equal to the number of capture groups in
   618   *                         the pattern.
   619   *   @param   dest         Buffer to receive the matching string data.
   620   *   @param   destCapacity Capacity of the dest buffer.
   621   *   @param   status       A reference to a UErrorCode to receive any errors.
   622   *   @return               Length of matching data,
   623   *                         or -1 if no applicable match.
   624   *   @stable ICU 3.0
   625   */
   626 U_STABLE int32_t U_EXPORT2 
   627 uregex_group(URegularExpression *regexp,
   628              int32_t             groupNum,
   629              UChar              *dest,
   630              int32_t             destCapacity,
   631              UErrorCode          *status);
   633 /** Returns a shallow immutable clone of the entire input string.  The returned UText current native index
   634   *   is set to the beginning of the requested capture group.  The capture group length is also
   635   *   returned via groupLength.
   636   * Group #0 is the complete string of matched text.
   637   * Group #1 is the text matched by the first set of capturing parentheses.
   638   *
   639   *   @param   regexp       The compiled regular expression.
   640   *   @param   groupNum     The capture group to extract.  Group 0 is the complete
   641   *                         match.  The value of this parameter must be
   642   *                         less than or equal to the number of capture groups in
   643   *                         the pattern.
   644   *   @param   dest         A mutable UText in which to store the current input.
   645   *                         If NULL, a new UText will be created as an immutable shallow clone
   646   *                         of the entire input string.
   647   *   @param   groupLength  Receives the length of the desired capture group.
   648   *   @param   status       A reference to a UErrorCode to receive any errors.
   649   *   @return               The subject text currently associated with this regular expression.
   650   *                         If a pre-allocated UText was provided, it will always be used and returned.
   652   *
   653   *   @stable ICU 4.6
   654   */
   655 U_STABLE UText * U_EXPORT2 
   656 uregex_groupUText(URegularExpression *regexp,
   657                   int32_t             groupNum,
   658                   UText              *dest,
   659                   int64_t            *groupLength,
   660                   UErrorCode         *status);
   662 #ifndef U_HIDE_INTERNAL_API
   663 /** Extract the string for the specified matching expression or subexpression.
   664   * Group #0 is the complete string of matched text.
   665   * Group #1 is the text matched by the first set of capturing parentheses.
   666   *
   667   *   @param   regexp       The compiled regular expression.
   668   *   @param   groupNum     The capture group to extract.  Group 0 is the complete
   669   *                         match.  The value of this parameter must be
   670   *                         less than or equal to the number of capture groups in
   671   *                         the pattern.
   672   *   @param   dest         Mutable UText to receive the matching string data.
   673   *                         If NULL, a new UText will be created (which may not be mutable).
   674   *   @param   status       A reference to a UErrorCode to receive any errors.
   675   *   @return               The matching string data. If a pre-allocated UText was provided,
   676   *                          it will always be used and returned.
   677   *
   678   *   @internal ICU 4.4 technology preview
   679   */
   680 U_INTERNAL UText * U_EXPORT2 
   681 uregex_groupUTextDeep(URegularExpression *regexp,
   682                   int32_t             groupNum,
   683                   UText              *dest,
   684                   UErrorCode         *status);
   685 #endif  /* U_HIDE_INTERNAL_API */
   687 /**
   688   *   Returns the index in the input string of the start of the text matched by the
   689   *   specified capture group during the previous match operation.  Return -1 if
   690   *   the capture group was not part of the last match.
   691   *   Group #0 refers to the complete range of matched text.
   692   *   Group #1 refers to the text matched by the first set of capturing parentheses.
   693   *
   694   *    @param   regexp      The compiled regular expression.
   695   *    @param   groupNum    The capture group number
   696   *    @param   status      A reference to a UErrorCode to receive any errors.
   697   *    @return              the starting (native) position in the input of the text matched 
   698   *                         by the specified group.
   699   *    @stable ICU 3.0
   700   */
   701 U_STABLE int32_t U_EXPORT2 
   702 uregex_start(URegularExpression *regexp,
   703              int32_t             groupNum,
   704              UErrorCode          *status);
   706 /**
   707   *   64bit version of uregex_start.
   708   *   Returns the index in the input string of the start of the text matched by the
   709   *   specified capture group during the previous match operation.  Return -1 if
   710   *   the capture group was not part of the last match.
   711   *   Group #0 refers to the complete range of matched text.
   712   *   Group #1 refers to the text matched by the first set of capturing parentheses.
   713   *
   714   *    @param   regexp      The compiled regular expression.
   715   *    @param   groupNum    The capture group number
   716   *    @param   status      A reference to a UErrorCode to receive any errors.
   717   *    @return              the starting (native) position in the input of the text matched 
   718   *                         by the specified group.
   719   *   @stable ICU 4.6
   720   */
   721 U_STABLE int64_t U_EXPORT2 
   722 uregex_start64(URegularExpression *regexp,
   723                int32_t             groupNum,
   724                UErrorCode          *status);
   726 /**
   727   *   Returns the index in the input string of the position following the end
   728   *   of the text matched by the specified capture group.
   729   *   Return -1 if the capture group was not part of the last match.
   730   *   Group #0 refers to the complete range of matched text.
   731   *   Group #1 refers to the text matched by the first set of capturing parentheses.
   732   *
   733   *    @param   regexp      The compiled regular expression.
   734   *    @param   groupNum    The capture group number
   735   *    @param   status      A reference to a UErrorCode to receive any errors.
   736   *    @return              the (native) index of the position following the last matched character.
   737   *    @stable ICU 3.0
   738   */
   739 U_STABLE int32_t U_EXPORT2 
   740 uregex_end(URegularExpression   *regexp,
   741            int32_t               groupNum,
   742            UErrorCode           *status);
   744 /**
   745   *   64bit version of uregex_end.
   746   *   Returns the index in the input string of the position following the end
   747   *   of the text matched by the specified capture group.
   748   *   Return -1 if the capture group was not part of the last match.
   749   *   Group #0 refers to the complete range of matched text.
   750   *   Group #1 refers to the text matched by the first set of capturing parentheses.
   751   *
   752   *    @param   regexp      The compiled regular expression.
   753   *    @param   groupNum    The capture group number
   754   *    @param   status      A reference to a UErrorCode to receive any errors.
   755   *    @return              the (native) index of the position following the last matched character.
   756   *   @stable ICU 4.6
   757   */
   758 U_STABLE int64_t U_EXPORT2 
   759 uregex_end64(URegularExpression *regexp,
   760              int32_t               groupNum,
   761              UErrorCode           *status);
   763 /**
   764   *  Reset any saved state from the previous match.  Has the effect of
   765   *  causing uregex_findNext to begin at the specified index, and causing
   766   *  uregex_start(), uregex_end() and uregex_group() to return an error 
   767   *  indicating that there is no match information available.  Clears any
   768   *  match region that may have been set.
   769   *
   770   *    @param   regexp      The compiled regular expression.
   771   *    @param   index       The position (native) in the text at which a
   772   *                         uregex_findNext() should begin searching.
   773   *    @param   status      A reference to a UErrorCode to receive any errors.
   774   *    @stable ICU 3.0
   775   */
   776 U_STABLE void U_EXPORT2 
   777 uregex_reset(URegularExpression    *regexp,
   778              int32_t               index,
   779              UErrorCode            *status);
   781 /**
   782   *  64bit version of uregex_reset.
   783   *  Reset any saved state from the previous match.  Has the effect of
   784   *  causing uregex_findNext to begin at the specified index, and causing
   785   *  uregex_start(), uregex_end() and uregex_group() to return an error 
   786   *  indicating that there is no match information available.  Clears any
   787   *  match region that may have been set.
   788   *
   789   *    @param   regexp      The compiled regular expression.
   790   *    @param   index       The position (native) in the text at which a
   791   *                         uregex_findNext() should begin searching.
   792   *    @param   status      A reference to a UErrorCode to receive any errors.
   793   *    @stable ICU 4.6
   794   */
   795 U_STABLE void U_EXPORT2 
   796 uregex_reset64(URegularExpression  *regexp,
   797                int64_t               index,
   798                UErrorCode            *status);
   800 /**
   801   * Sets the limits of the matching region for this URegularExpression.
   802   * The region is the part of the input string that will be considered when matching.
   803   * Invoking this method resets any saved state from the previous match, 
   804   * then sets the region to start at the index specified by the regionStart parameter
   805   * and end at the index specified by the regionLimit parameter.
   806   *
   807   * Depending on the transparency and anchoring being used (see uregex_useTransparentBounds()
   808   * and uregex_useAnchoringBounds()), certain constructs such as anchors may behave differently
   809   * at or around the boundaries of the region.
   810   *
   811   * The function will fail if regionStart is greater than regionLimit, or if either index
   812   *  is less than zero or greater than the length of the string being matched.
   813   *
   814   * @param regexp The compiled regular expression.
   815   * @param regionStart  The (native) index to begin searches at.
   816   * @param regionLimit  The (native) index to end searches at (exclusive).
   817   * @param status A pointer to a UErrorCode to receive any errors.
   818   * @stable ICU 4.0
   819   */
   820 U_STABLE void U_EXPORT2
   821 uregex_setRegion(URegularExpression   *regexp,
   822                  int32_t               regionStart,
   823                  int32_t               regionLimit,
   824                  UErrorCode           *status);
   826 /**
   827   * 64bit version of uregex_setRegion.
   828   * Sets the limits of the matching region for this URegularExpression.
   829   * The region is the part of the input string that will be considered when matching.
   830   * Invoking this method resets any saved state from the previous match, 
   831   * then sets the region to start at the index specified by the regionStart parameter
   832   * and end at the index specified by the regionLimit parameter.
   833   *
   834   * Depending on the transparency and anchoring being used (see uregex_useTransparentBounds()
   835   * and uregex_useAnchoringBounds()), certain constructs such as anchors may behave differently
   836   * at or around the boundaries of the region.
   837   *
   838   * The function will fail if regionStart is greater than regionLimit, or if either index
   839   *  is less than zero or greater than the length of the string being matched.
   840   *
   841   * @param regexp The compiled regular expression.
   842   * @param regionStart  The (native) index to begin searches at.
   843   * @param regionLimit  The (native) index to end searches at (exclusive).
   844   * @param status A pointer to a UErrorCode to receive any errors.
   845   * @stable ICU 4.6
   846   */
   847 U_STABLE void U_EXPORT2 
   848 uregex_setRegion64(URegularExpression *regexp,
   849                  int64_t               regionStart,
   850                  int64_t               regionLimit,
   851                  UErrorCode           *status);
   853 /**
   854   *  Set the matching region and the starting index for subsequent matches
   855   *  in a single operation.
   856   *  This is useful because the usual function for setting the starting
   857   *  index, uregex_reset(), also resets any region limits.
   858   *
   859   * @param regexp The compiled regular expression.
   860   * @param regionStart  The (native) index to begin searches at.
   861   * @param regionLimit  The (native) index to end searches at (exclusive).
   862   * @param startIndex   The index in the input text at which the next 
   863   *                     match operation should begin.
   864   * @param status A pointer to a UErrorCode to receive any errors.
   865   * @stable ICU 4.6
   866   */
   867 U_STABLE void U_EXPORT2 
   868 uregex_setRegionAndStart(URegularExpression *regexp,
   869                  int64_t               regionStart,
   870                  int64_t               regionLimit,
   871                  int64_t               startIndex,
   872                  UErrorCode           *status);
   874 /**
   875   * Reports the start index of the matching region. Any matches found are limited to
   876   * the region bounded by regionStart (inclusive) and regionEnd (exclusive).
   877   *
   878   * @param regexp The compiled regular expression.
   879   * @param status A pointer to a UErrorCode to receive any errors.
   880   * @return The starting (native) index of this matcher's region.
   881   * @stable ICU 4.0
   882   */
   883 U_STABLE int32_t U_EXPORT2
   884 uregex_regionStart(const  URegularExpression   *regexp,
   885                           UErrorCode           *status);
   887 /**
   888   * 64bit version of uregex_regionStart.
   889   * Reports the start index of the matching region. Any matches found are limited to
   890   * the region bounded by regionStart (inclusive) and regionEnd (exclusive).
   891   *
   892   * @param regexp The compiled regular expression.
   893   * @param status A pointer to a UErrorCode to receive any errors.
   894   * @return The starting (native) index of this matcher's region.
   895   * @stable ICU 4.6
   896   */
   897 U_STABLE int64_t U_EXPORT2 
   898 uregex_regionStart64(const  URegularExpression   *regexp,
   899                             UErrorCode           *status);
   901 /**
   902   * Reports the end index (exclusive) of the matching region for this URegularExpression.
   903   * Any matches found are limited to the region bounded by regionStart (inclusive)
   904   * and regionEnd (exclusive).
   905   *
   906   * @param regexp The compiled regular expression.
   907   * @param status A pointer to a UErrorCode to receive any errors.
   908   * @return The ending point (native) of this matcher's region.
   909   * @stable ICU 4.0
   910   */
   911 U_STABLE int32_t U_EXPORT2
   912 uregex_regionEnd(const  URegularExpression   *regexp,
   913                         UErrorCode           *status);
   915 /**
   916   * 64bit version of uregex_regionEnd.
   917   * Reports the end index (exclusive) of the matching region for this URegularExpression.
   918   * Any matches found are limited to the region bounded by regionStart (inclusive)
   919   * and regionEnd (exclusive).
   920   *
   921   * @param regexp The compiled regular expression.
   922   * @param status A pointer to a UErrorCode to receive any errors.
   923   * @return The ending point (native) of this matcher's region.
   924   * @stable ICU 4.6
   925   */
   926 U_STABLE int64_t U_EXPORT2 
   927 uregex_regionEnd64(const  URegularExpression   *regexp,
   928                           UErrorCode           *status);
   930 /**
   931   * Queries the transparency of region bounds for this URegularExpression.
   932   * See uregex_useTransparentBounds() for a description of transparent and opaque bounds.
   933   * By default, matching boundaries are opaque.
   934   *
   935   * @param regexp The compiled regular expression.
   936   * @param status A pointer to a UErrorCode to receive any errors.
   937   * @return TRUE if this matcher is using transparent bounds, FALSE if it is using opaque bounds.
   938   * @stable ICU 4.0
   939   */
   940 U_STABLE UBool U_EXPORT2
   941 uregex_hasTransparentBounds(const  URegularExpression   *regexp,
   942                                    UErrorCode           *status);
   945 /**
   946   * Sets the transparency of region bounds for this URegularExpression.
   947   * Invoking this function with an argument of TRUE will set matches to use transparent bounds.
   948   * If the boolean argument is FALSE, then opaque bounds will be used.
   949   *
   950   * Using transparent bounds, the boundaries of the matching region are transparent
   951   * to lookahead, lookbehind, and boundary matching constructs. Those constructs can
   952   * see text beyond the boundaries of the region while checking for a match.
   953   *
   954   * With opaque bounds, no text outside of the matching region is visible to lookahead,
   955   * lookbehind, and boundary matching constructs.
   956   *
   957   * By default, opaque bounds are used.
   958   *
   959   * @param   regexp The compiled regular expression.
   960   * @param   b      TRUE for transparent bounds; FALSE for opaque bounds
   961   * @param   status A pointer to a UErrorCode to receive any errors.
   962   * @stable ICU 4.0
   963   **/
   964 U_STABLE void U_EXPORT2  
   965 uregex_useTransparentBounds(URegularExpression   *regexp, 
   966                             UBool                b,
   967                             UErrorCode           *status);
   970 /**
   971   * Return true if this URegularExpression is using anchoring bounds.
   972   * By default, anchoring region bounds are used.
   973   *
   974   * @param  regexp The compiled regular expression.
   975   * @param  status A pointer to a UErrorCode to receive any errors.
   976   * @return TRUE if this matcher is using anchoring bounds.
   977   * @stable ICU 4.0
   978   */
   979 U_STABLE UBool U_EXPORT2
   980 uregex_hasAnchoringBounds(const  URegularExpression   *regexp,
   981                                  UErrorCode           *status);
   984 /**
   985   * Set whether this URegularExpression is using Anchoring Bounds for its region.
   986   * With anchoring bounds, pattern anchors such as ^ and $ will match at the start
   987   * and end of the region.  Without Anchoring Bounds, anchors will only match at
   988   * the positions they would in the complete text.
   989   *
   990   * Anchoring Bounds are the default for regions.
   991   *
   992   * @param regexp The compiled regular expression.
   993   * @param b      TRUE to enable anchoring bounds; FALSE to disable them.
   994   * @param status A pointer to a UErrorCode to receive any errors.
   995   * @stable ICU 4.0
   996   */
   997 U_STABLE void U_EXPORT2
   998 uregex_useAnchoringBounds(URegularExpression   *regexp,
   999                           UBool                 b,
  1000                           UErrorCode           *status);
  1002 /**
  1003   * Return TRUE if the most recent matching operation touched the
  1004   *  end of the text being processed.  In this case, additional input text could
  1005   *  change the results of that match.
  1007   *  @param regexp The compiled regular expression.
  1008   *  @param status A pointer to a UErrorCode to receive any errors.
  1009   *  @return  TRUE if the most recent match hit the end of input
  1010   *  @stable ICU 4.0
  1011   */
  1012 U_STABLE UBool U_EXPORT2
  1013 uregex_hitEnd(const  URegularExpression   *regexp,
  1014                      UErrorCode           *status);
  1016 /**
  1017   * Return TRUE if the most recent match succeeded and additional input could cause
  1018   * it to fail. If this function returns FALSE and a match was found, then more input
  1019   * might change the match but the match won't be lost. If a match was not found,
  1020   * then the value returned by uregex_requireEnd() has no meaning.
  1022   * @param regexp The compiled regular expression.
  1023   * @param status A pointer to a UErrorCode to receive any errors.
  1024   * @return TRUE  if more input could cause the most recent match to no longer match.
  1025   * @stable ICU 4.0
  1026   */
  1027 U_STABLE UBool U_EXPORT2   
  1028 uregex_requireEnd(const  URegularExpression   *regexp,
  1029                          UErrorCode           *status);
  1035 /**
  1036   *    Replaces every substring of the input that matches the pattern
  1037   *    with the given replacement string.  This is a convenience function that
  1038   *    provides a complete find-and-replace-all operation.
  1040   *    This method scans the input string looking for matches of the pattern. 
  1041   *    Input that is not part of any match is copied unchanged to the
  1042   *    destination buffer.  Matched regions are replaced in the output
  1043   *    buffer by the replacement string.   The replacement string may contain
  1044   *    references to capture groups; these take the form of $1, $2, etc.
  1046   *    @param   regexp             The compiled regular expression.
  1047   *    @param   replacementText    A string containing the replacement text.
  1048   *    @param   replacementLength  The length of the replacement string, or
  1049   *                                -1 if it is NUL terminated.
  1050   *    @param   destBuf            A (UChar *) buffer that will receive the result.
  1051   *    @param   destCapacity       The capacity of the destination buffer.
  1052   *    @param   status             A reference to a UErrorCode to receive any errors.
  1053   *    @return                     The length of the string resulting from the find
  1054   *                                and replace operation.  In the event that the
  1055   *                                destination capacity is inadequate, the return value
  1056   *                                is still the full length of the untruncated string.
  1057   *    @stable ICU 3.0
  1058   */
  1059 U_STABLE int32_t U_EXPORT2 
  1060 uregex_replaceAll(URegularExpression    *regexp,
  1061                   const UChar           *replacementText,
  1062                   int32_t                replacementLength,
  1063                   UChar                 *destBuf,
  1064                   int32_t                destCapacity,
  1065                   UErrorCode            *status);
  1067 /**
  1068   *    Replaces every substring of the input that matches the pattern
  1069   *    with the given replacement string.  This is a convenience function that
  1070   *    provides a complete find-and-replace-all operation.
  1072   *    This method scans the input string looking for matches of the pattern. 
  1073   *    Input that is not part of any match is copied unchanged to the
  1074   *    destination buffer.  Matched regions are replaced in the output
  1075   *    buffer by the replacement string.   The replacement string may contain
  1076   *    references to capture groups; these take the form of $1, $2, etc.
  1078   *    @param   regexp         The compiled regular expression.
  1079   *    @param   replacement    A string containing the replacement text.
  1080   *    @param   dest           A mutable UText that will receive the result.
  1081   *                             If NULL, a new UText will be created (which may not be mutable).
  1082   *    @param   status         A reference to a UErrorCode to receive any errors.
  1083   *    @return                 A UText containing the results of the find and replace.
  1084   *                             If a pre-allocated UText was provided, it will always be used and returned.
  1086   *    @stable ICU 4.6
  1087   */
  1088 U_STABLE UText * U_EXPORT2 
  1089 uregex_replaceAllUText(URegularExpression *regexp,
  1090                        UText              *replacement,
  1091                        UText              *dest,
  1092                        UErrorCode         *status);
  1094 /**
  1095   *    Replaces the first substring of the input that matches the pattern
  1096   *    with the given replacement string.  This is a convenience function that
  1097   *    provides a complete find-and-replace operation.
  1099   *    This method scans the input string looking for a match of the pattern. 
  1100   *    All input that is not part of the match is copied unchanged to the
  1101   *    destination buffer.  The matched region is replaced in the output
  1102   *    buffer by the replacement string.   The replacement string may contain
  1103   *    references to capture groups; these take the form of $1, $2, etc.
  1105   *    @param   regexp             The compiled regular expression.
  1106   *    @param   replacementText    A string containing the replacement text.
  1107   *    @param   replacementLength  The length of the replacement string, or
  1108   *                                -1 if it is NUL terminated.
  1109   *    @param   destBuf            A (UChar *) buffer that will receive the result.
  1110   *    @param   destCapacity       The capacity of the destination buffer.
  1111   *    @param   status             A reference to a UErrorCode to receive any errors.
  1112   *    @return                     The length of the string resulting from the find
  1113   *                                and replace operation.  In the event that the
  1114   *                                destination capacity is inadequate, the return value
  1115   *                                is still the full length of the untruncated string.
  1116   *    @stable ICU 3.0
  1117   */
  1118 U_STABLE int32_t U_EXPORT2 
  1119 uregex_replaceFirst(URegularExpression  *regexp,
  1120                     const UChar         *replacementText,
  1121                     int32_t              replacementLength,
  1122                     UChar               *destBuf,
  1123                     int32_t              destCapacity,
  1124                     UErrorCode          *status);
  1126 /**
  1127   *    Replaces the first substring of the input that matches the pattern
  1128   *    with the given replacement string.  This is a convenience function that
  1129   *    provides a complete find-and-replace operation.
  1131   *    This method scans the input string looking for a match of the pattern. 
  1132   *    All input that is not part of the match is copied unchanged to the
  1133   *    destination buffer.  The matched region is replaced in the output
  1134   *    buffer by the replacement string.   The replacement string may contain
  1135   *    references to capture groups; these take the form of $1, $2, etc.
  1137   *    @param   regexp         The compiled regular expression.
  1138   *    @param   replacement    A string containing the replacement text.
  1139   *    @param   dest           A mutable UText that will receive the result.
  1140   *                             If NULL, a new UText will be created (which may not be mutable).
  1141   *    @param   status         A reference to a UErrorCode to receive any errors.
  1142   *    @return                 A UText containing the results of the find and replace.
  1143   *                             If a pre-allocated UText was provided, it will always be used and returned.
  1145   *    @stable ICU 4.6
  1146   */
  1147 U_STABLE UText * U_EXPORT2 
  1148 uregex_replaceFirstUText(URegularExpression *regexp,
  1149                          UText              *replacement,
  1150                          UText              *dest,
  1151                          UErrorCode         *status);
  1153 /**
  1154   *   Implements a replace operation intended to be used as part of an
  1155   *   incremental find-and-replace.
  1157   *   <p>The input string, starting from the end of the previous match and ending at
  1158   *   the start of the current match, is appended to the destination string.  Then the
  1159   *   replacement string is appended to the output string,
  1160   *   including handling any substitutions of captured text.</p>
  1162   *   <p>A note on preflight computation of buffer size and error handling:
  1163   *   Calls to uregex_appendReplacement() and uregex_appendTail() are
  1164   *   designed to be chained, one after another, with the destination
  1165   *   buffer pointer and buffer capacity updated after each in preparation
  1166   *   for the next.  If the destination buffer is exhausted partway through such a
  1167   *   sequence, a U_BUFFER_OVERFLOW_ERROR status will be returned.  Normal
  1168   *   ICU conventions are for a function to perform no action if it is
  1169   *   called with an error status, but for this one case, uregex_appendReplacement()
  1170   *   will operate normally so that buffer size computations will complete
  1171   *   correctly.</p>
  1173   *   <p>For simple, prepackaged, non-incremental find-and-replace
  1174   *      operations, see uregex_replaceFirst() or uregex_replaceAll().</p>
  1176   *   @param   regexp      The regular expression object.  
  1177   *   @param   replacementText The string that will replace the matched portion of the
  1178   *                        input string as it is copied to the destination buffer.
  1179   *                        The replacement text may contain references ($1, for
  1180   *                        example) to capture groups from the match.
  1181   *   @param   replacementLength  The length of the replacement text string,
  1182   *                        or -1 if the string is NUL terminated.
  1183   *   @param   destBuf     The buffer into which the results of the
  1184   *                        find-and-replace are placed.  On return, this pointer
  1185   *                        will be updated to refer to the beginning of the
  1186   *                        unused portion of buffer, leaving it in position for
  1187   *                        a subsequent call to this function.
  1188   *   @param   destCapacity The size of the output buffer.  On return, this
  1189   *                        parameter will be updated to reflect the space remaining
  1190   *                        unused in the output buffer.
  1191   *   @param   status      A reference to a UErrorCode to receive any errors. 
  1192   *   @return              The length of the result string.  In the event that
  1193   *                        destCapacity is inadequate, the full length of the
  1194   *                        untruncated output string is returned.
  1196   *   @stable ICU 3.0
  1198   */
  1199 U_STABLE int32_t U_EXPORT2 
  1200 uregex_appendReplacement(URegularExpression    *regexp,
  1201                          const UChar           *replacementText,
  1202                          int32_t                replacementLength,
  1203                          UChar                **destBuf,
  1204                          int32_t               *destCapacity,
  1205                          UErrorCode            *status);
  1207 /**
  1208   *   Implements a replace operation intended to be used as part of an
  1209   *   incremental find-and-replace.
  1211   *   <p>The input string, starting from the end of the previous match and ending at
  1212   *   the start of the current match, is appended to the destination string.  Then the
  1213   *   replacement string is appended to the output string,
  1214   *   including handling any substitutions of captured text.</p>
  1216   *   <p>For simple, prepackaged, non-incremental find-and-replace
  1217   *      operations, see uregex_replaceFirstUText() or uregex_replaceAllUText().</p>
  1219   *   @param   regexp      The regular expression object.  
  1220   *   @param   replacementText The string that will replace the matched portion of the
  1221   *                        input string as it is copied to the destination buffer.
  1222   *                        The replacement text may contain references ($1, for
  1223   *                        example) to capture groups from the match.
  1224   *   @param   dest        A mutable UText that will receive the result. Must not be NULL.
  1225   *   @param   status      A reference to a UErrorCode to receive any errors. 
  1227   *   @stable ICU 4.6
  1228   */
  1229 U_STABLE void U_EXPORT2 
  1230 uregex_appendReplacementUText(URegularExpression    *regexp,
  1231                               UText                 *replacementText,
  1232                               UText                 *dest,
  1233                               UErrorCode            *status);
  1235 /**
  1236   * As the final step in a find-and-replace operation, append the remainder
  1237   * of the input string, starting at the position following the last match,
  1238   * to the destination string. <code>uregex_appendTail()</code> is intended
  1239   *  to be invoked after one or more invocations of the
  1240   *  <code>uregex_appendReplacement()</code> function.
  1242   *   @param   regexp      The regular expression object.  This is needed to
  1243   *                        obtain the input string along with the position
  1244   *                        of the last match within it.
  1245   *   @param   destBuf     The buffer in which the results of the
  1246   *                        find-and-replace are placed.  On return, the pointer
  1247   *                        will be updated to refer to the beginning of the
  1248   *                        unused portion of buffer.
  1249   *   @param   destCapacity The size of the output buffer.  On return, this
  1250   *                        value will be updated to reflect the space remaining
  1251   *                        unused in the output buffer.
  1252   *   @param   status      A reference to a UErrorCode to receive any errors.
  1253   *   @return              The length of the result string.  In the event that
  1254   *                        destCapacity is inadequate, the full length of the
  1255   *                        untruncated output string is returned.
  1257   *   @stable ICU 3.0
  1258   */
  1259 U_STABLE int32_t U_EXPORT2 
  1260 uregex_appendTail(URegularExpression    *regexp,
  1261                   UChar                **destBuf,
  1262                   int32_t               *destCapacity,
  1263                   UErrorCode            *status);
  1265 /**
  1266   * As the final step in a find-and-replace operation, append the remainder
  1267   * of the input string, starting at the position following the last match,
  1268   * to the destination string. <code>uregex_appendTailUText()</code> is intended
  1269   *  to be invoked after one or more invocations of the
  1270   *  <code>uregex_appendReplacementUText()</code> function.
  1272   *   @param   regexp      The regular expression object.  This is needed to
  1273   *                        obtain the input string along with the position
  1274   *                        of the last match within it.
  1275   *   @param   dest        A mutable UText that will receive the result. Must not be NULL.
  1277   *   @param   status      A reference to a UErrorCode to receive any errors.
  1279   *   @return              The destination UText.
  1281   *   @stable ICU 4.6
  1282   */
  1283 U_STABLE UText * U_EXPORT2 
  1284 uregex_appendTailUText(URegularExpression    *regexp,
  1285                        UText                 *dest,
  1286                        UErrorCode            *status);
  1288  /**
  1289    * Split a string into fields.  Somewhat like split() from Perl.
  1290    *  The pattern matches identify delimiters that separate the input
  1291    *  into fields.  The input data between the matches becomes the
  1292    *  fields themselves.
  1294    *  Each of the fields is copied from the input string to the destination
  1295    *  buffer, and NUL terminated.  The position of each field within
  1296    *  the destination buffer is returned in the destFields array.
  1298    *  If the delimiter pattern includes capture groups, the captured text will
  1299    *  also appear in the destination array of output strings, interspersed
  1300    *  with the fields.  This is similar to Perl, but differs from Java,
  1301    *  which ignores the presence of capture groups in the pattern.
  1303    *  Trailing empty fields will always be returned, assuming sufficient
  1304    *  destination capacity.  This differs from the default behavior for Java
  1305    *  and Perl where trailing empty fields are not returned.
  1307    *  The number of strings produced by the split operation is returned.
  1308    *  This count includes the strings from capture groups in the delimiter pattern.
  1309    *  This behavior differs from Java, which ignores capture groups.
  1311    *    @param   regexp      The compiled regular expression.
  1312    *    @param   destBuf     A (UChar *) buffer to receive the fields that
  1313    *                         are extracted from the input string. The field
  1314    *                         pointers in destFields will refer to positions within
  1315    *                         this buffer, which is supplied by the caller.  Any
  1316    *                         extra positions within the destFields array will be
  1317    *                         set to NULL.
  1318    *    @param   destCapacity The capacity of the destBuf.
  1319    *    @param   requiredCapacity  The actual capacity required of the destBuf.
  1320    *                         If destCapacity is too small, requiredCapacity will return
  1321    *                         the total capacity required to hold all of the output, and
  1322    *                         a U_BUFFER_OVERFLOW_ERROR will be returned.
  1323    *    @param   destFields  An array to be filled with the position of each
  1324    *                         of the extracted fields within destBuf.
  1325    *    @param   destFieldsCapacity  The number of elements in the destFields array.
  1326    *                If the number of fields found is less than destFieldsCapacity,
  1327    *                the extra destFields elements are set to NULL.
  1328    *                If destFieldsCapacity is too small, the trailing part of the
  1329    *                input, including any field delimiters, is treated as if it
  1330    *                were the last field - it is copied to the destBuf, and
  1331    *                its position in the destBuf is stored in the last element
  1332    *                of destFields.  This behavior mimics that of Perl.  It is not
  1333    *                an error condition, and no error status is returned when all destFields
  1334    *                positions are used.
  1335    * @param status  A reference to a UErrorCode to receive any errors.
  1336    * @return        The number of fields into which the input string was split.
  1337    * @stable ICU 3.0
  1338    */
  1339 U_STABLE int32_t U_EXPORT2 
  1340 uregex_split(   URegularExpression      *regexp,
  1341                   UChar                 *destBuf,
  1342                   int32_t                destCapacity,
  1343                   int32_t               *requiredCapacity,
  1344                   UChar                 *destFields[],
  1345                   int32_t                destFieldsCapacity,
  1346                   UErrorCode            *status);
  1348   /**
  1349    * Split a string into fields.  Somewhat like split() from Perl.
  1350    * The pattern matches identify delimiters that separate the input
  1351    *  into fields.  The input data between the matches becomes the
  1352    *  fields themselves.
  1353    * <p>
  1354    * The behavior of this function is not very closely aligned with uregex_split();
  1355    * instead, it is based on (and implemented directly on top of) the C++ split method.
  1357    * @param regexp  The compiled regular expression.
  1358    * @param destFields    An array of mutable UText structs to receive the results of the split.
  1359    *                If a field is NULL, a new UText is allocated to contain the results for
  1360    *                that field. This new UText is not guaranteed to be mutable.
  1361    * @param destFieldsCapacity  The number of elements in the destination array.
  1362    *                If the number of fields found is less than destFieldsCapacity, the
  1363    *                extra strings in the destination array are not altered.
  1364    *                If the number of destination strings is less than the number
  1365    *                of fields, the trailing part of the input string, including any
  1366    *                field delimiters, is placed in the last destination string.
  1367    *                This behavior mimics that of Perl.  It is not an error condition, and no
  1368    *                error status is returned when all destFields positions are used.
  1369    * @param status  A reference to a UErrorCode to receive any errors.
  1370    * @return        The number of fields into which the input string was split.
  1372    * @stable ICU 4.6
  1373    */
  1374 U_STABLE int32_t U_EXPORT2 
  1375 uregex_splitUText(URegularExpression    *regexp,
  1376                   UText                 *destFields[],
  1377                   int32_t                destFieldsCapacity,
  1378                   UErrorCode            *status);
  1380 /**
  1381  * Set a processing time limit for match operations with this URegularExpression.
  1383  * Some patterns, when matching certain strings, can run in exponential time.
  1384  * For practical purposes, the match operation may appear to be in an
  1385  * infinite loop.
  1386  * When a limit is set, a match operation will fail with an error if the
  1387  * limit is exceeded.
  1388  * <p>
  1389  * The units of the limit are steps of the match engine.
  1390  * Correspondence with actual processor time will depend on the speed
  1391  * of the processor and the details of the specific pattern, but will
  1392  * typically be on the order of milliseconds.
  1393  * <p>
  1394  * By default, the matching time is not limited.
  1395  *
  1397  * @param   regexp      The compiled regular expression.
  1398  * @param   limit       The limit value, in match engine steps, or 0 for no limit.
  1399  * @param   status      A reference to a UErrorCode to receive any errors.
  1400  * @stable ICU 4.0
  1401  */
  1402 U_STABLE void U_EXPORT2
  1403 uregex_setTimeLimit(URegularExpression      *regexp,
  1404                     int32_t                  limit,
  1405                     UErrorCode              *status);
  1407 /**
  1408  * Get the time limit for matches with this URegularExpression.
  1409  * A return value of zero indicates that there is no limit.
  1411  * @param   regexp      The compiled regular expression.
  1412  * @param   status      A reference to a UErrorCode to receive any errors.
  1413  * @return the maximum allowed time for a match, in units of processing steps.
  1414  * @stable ICU 4.0
  1415  */
  1416 U_STABLE int32_t U_EXPORT2
  1417 uregex_getTimeLimit(const URegularExpression      *regexp,
  1418                           UErrorCode              *status);
  1420 /**
  1421  * Set the amount of heap storage available for use by the match backtracking stack.
  1422  * <p>
  1423  * ICU uses a backtracking regular expression engine, with the backtrack stack
  1424  * maintained on the heap.  This function sets the limit to the amount of memory
  1425  * that can be used for this purpose.  A backtracking stack overflow will
  1426  * result in an error from the match operation that caused it.
  1427  * <p>
  1428  * A limit is desirable because a malicious or poorly designed pattern can use
  1429  * excessive memory, potentially crashing the process.  A limit is enabled
  1430  * by default.
  1431  *
  1432  * @param   regexp      The compiled regular expression.
  1433  * @param   limit       The maximum size, in bytes, of the matching backtrack stack.
  1434  *                      A value of zero means no limit.
  1435  *                      The limit must be greater than or equal to zero.
  1436  * @param   status      A reference to a UErrorCode to receive any errors.
  1438  * @stable ICU 4.0
  1439  */
  1440 U_STABLE void U_EXPORT2
  1441 uregex_setStackLimit(URegularExpression      *regexp,
  1442                      int32_t                  limit,
  1443                      UErrorCode              *status);
  1445 /**
  1446  * Get the size of the heap storage available for use by the backtracking stack.
  1447  * @param   regexp      The compiled regular expression.
  1448  * @param   status      A reference to a UErrorCode to receive any errors.
  1449  * @return  the maximum backtracking stack size, in bytes, or zero if the stack size is unlimited.
  1450  * @stable ICU 4.0
  1451  */
  1452 U_STABLE int32_t U_EXPORT2
  1453 uregex_getStackLimit(const URegularExpression      *regexp,
  1454                            UErrorCode              *status);
  1457 /**
  1458  * Function pointer for a regular expression matching callback function.
  1459  * When set, a callback function will be called periodically during matching
  1460  * operations.  If the callback function returns FALSE, the matching
  1461  * operation will be terminated early.
  1463  * Note:  the callback function must not call other functions on this
  1464  *        URegularExpression.
  1466  * @param context  context pointer.  The callback function will be invoked
  1467  *                 with the context specified at the time that
  1468  *                 uregex_setMatchCallback() is called.
  1469  * @param steps    the accumulated processing time, in match steps,
  1470  *                 for this matching operation.
  1471  * @return         TRUE to continue the matching operation.
  1472  *                 FALSE to terminate the matching operation.
  1473  * @stable ICU 4.0
  1474  */
  1475 U_CDECL_BEGIN
  1476 typedef UBool U_CALLCONV URegexMatchCallback (
  1477                    const void *context,
  1478                    int32_t     steps);
  1479 U_CDECL_END
  1481 /**
  1482  * Set the match callback function for this URegularExpression.
  1483  * During matching operations the function will be called periodically,
  1484  * giving the application the opportunity to terminate a long-running
  1485  * match.
  1487  * @param   regexp      The compiled regular expression.
  1488  * @param   callback    A pointer to the user-supplied URegexMatchCallback function.
  1489  * @param   context     User context pointer.  The value supplied at the
  1490  *                      time the callback function is set will be saved
  1491  *                      and passed to the callback each time that it is called.
  1492  * @param   status      A reference to a UErrorCode to receive any errors.
  1493  * @stable ICU 4.0
  1494  */
  1495 U_STABLE void U_EXPORT2
  1496 uregex_setMatchCallback(URegularExpression      *regexp,
  1497                         URegexMatchCallback     *callback,
  1498                         const void              *context,
  1499                         UErrorCode              *status);
  1502 /**
  1503  * Get the match callback function for this URegularExpression.
  1505  * @param   regexp      The compiled regular expression.
  1506  * @param   callback    Out parameter, receives a pointer to the user-supplied
  1507  *                      callback function.
  1508  * @param   context     Out parameter, receives the user context pointer that
  1509  *                      was set when uregex_setMatchCallback() was called.
  1510  * @param   status      A reference to a UErrorCode to receive any errors.
  1511  * @stable ICU 4.0
  1512  */
  1513 U_STABLE void U_EXPORT2
  1514 uregex_getMatchCallback(const URegularExpression    *regexp,
  1515                         URegexMatchCallback        **callback,
  1516                         const void                 **context,
  1517                         UErrorCode                  *status);
  1519 /**
  1520  * Function pointer for a regular expression find callback function.
  1522  * When set, a callback function will be called during a find operation
  1523  * and for operations that depend on find, such as findNext, split and some replace
  1524  * operations like replaceFirst.
  1525  * The callback will usually be called after each attempt at a match, but this is not a
  1526  * guarantee that the callback will be invoked at each character.  For finds where the
  1527  * match engine is invoked at each character, this may be close to true, but less likely
  1528  * for more optimized loops where the pattern is known to only start, and the match
  1529  * engine invoked, at certain characters.
  1530  * When invoked, this callback will specify the index at which a match operation is about
  1531  * to be attempted, giving the application the opportunity to terminate a long-running
  1532  * find operation.
  1534  * If the callback function returns FALSE, the find operation will be terminated early.
  1536  * Note:  the callback function must not call other functions on this
  1537  *        URegularExpression.
  1539  * @param context  context pointer.  The callback function will be invoked
  1540  *                 with the context specified at the time that
  1541  *                 uregex_setFindProgressCallback() is called.
  1542  * @param matchIndex  the next index at which a match will be attempted for this
  1543  *                 find operation.  If this callback interrupts the search, this is the
  1544  *                 index at which a find/findNext operation may be re-initiated.
  1545  * @return         TRUE to continue the matching operation.
  1546  *                 FALSE to terminate the matching operation.
  1547  * @stable ICU 4.6
  1548  */
  1549 U_CDECL_BEGIN
  1550 typedef UBool U_CALLCONV URegexFindProgressCallback (
  1551                    const void *context,
  1552                    int64_t     matchIndex);
  1553 U_CDECL_END
  1556 /**
  1557  * Set the find progress callback function for this URegularExpression.
  1559  * @param   regexp      The compiled regular expression.
  1560  * @param   callback    A pointer to the user-supplied URegexFindProgressCallback function.
  1561  * @param   context     User context pointer.  The value supplied at the
  1562  *                      time the callback function is set will be saved
  1563  *                      and passed to the callback each time that it is called.
  1564  * @param   status      A reference to a UErrorCode to receive any errors.
  1565  * @stable ICU 4.6
  1566  */
  1567 U_STABLE void U_EXPORT2
  1568 uregex_setFindProgressCallback(URegularExpression              *regexp,
  1569                                 URegexFindProgressCallback      *callback,
  1570                                 const void                      *context,
  1571                                 UErrorCode                      *status);
  1573 /**
  1574  * Get the find progress callback function for this URegularExpression.
  1576  * @param   regexp      The compiled regular expression.
  1577  * @param   callback    Out parameter, receives a pointer to the user-supplied
  1578  *                      URegexFindProgressCallback function.
  1579  * @param   context     Out parameter, receives the user context pointer that
  1580  *                      was set when uregex_setFindProgressCallback() was called.
  1581  * @param   status      A reference to a UErrorCode to receive any errors.
  1582  * @stable ICU 4.6
  1583  */
  1584 U_STABLE void U_EXPORT2
  1585 uregex_getFindProgressCallback(const URegularExpression          *regexp,
  1586                                 URegexFindProgressCallback        **callback,
  1587                                 const void                        **context,
  1588                                 UErrorCode                        *status);
  1590 #endif   /*  !UCONFIG_NO_REGULAR_EXPRESSIONS  */
  1591 #endif   /*  UREGEX_H  */

mercurial