intl/icu/source/i18n/unicode/regex.h

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

     1 /*
     2 **********************************************************************
     3 *   Copyright (C) 2002-2013, International Business Machines
     4 *   Corporation and others.  All Rights Reserved.
     5 **********************************************************************
     6 *   file name:  regex.h
     7 *   encoding:   US-ASCII
     8 *   indentation:4
     9 *
    10 *   created on: 2002oct22
    11 *   created by: Andy Heninger
    12 *
    13 *   ICU Regular Expressions, API for C++
    14 */
    16 #ifndef REGEX_H
    17 #define REGEX_H
    19 //#define REGEX_DEBUG
    21 /**
    22  * \file
    23  * \brief  C++ API:  Regular Expressions
    24  *
    25  * <h2>Regular Expression API</h2>
    26  *
    27  * <p>The ICU API for processing regular expressions consists of two classes,
    28  *  <code>RegexPattern</code> and <code>RegexMatcher</code>.
    29  *  <code>RegexPattern</code> objects represent a pre-processed, or compiled
    30  *  regular expression.  They are created from a regular expression pattern string,
    31  *  and can be used to create <code>RegexMatcher</code> objects for the pattern.</p>
    32  *
    33  * <p>Class <code>RegexMatcher</code> bundles together a regular expression
    34  *  pattern and a target string to which the search pattern will be applied.
    35  *  <code>RegexMatcher</code> includes API for doing plain find or search
    36  *  operations, for search and replace operations, and for obtaining detailed
    37  *  information about bounds of a match. </p>
    38  *
    39  * <p>Note that by constructing <code>RegexMatcher</code> objects directly from regular
    40  * expression pattern strings application code can be simplified and the explicit
    41  * need for <code>RegexPattern</code> objects can usually be eliminated.
    42  * </p>
    43  */
    45 #include "unicode/utypes.h"
    47 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
    49 #include "unicode/uobject.h"
    50 #include "unicode/unistr.h"
    51 #include "unicode/utext.h"
    52 #include "unicode/parseerr.h"
    54 #include "unicode/uregex.h"
    56 // Forward Declarations
    58 U_NAMESPACE_BEGIN
    60 struct Regex8BitSet;
    61 class  RegexCImpl;
    62 class  RegexMatcher;
    63 class  RegexPattern;
    64 struct REStackFrame;
    65 class  RuleBasedBreakIterator;
    66 class  UnicodeSet;
    67 class  UVector;
    68 class  UVector32;
    69 class  UVector64;
    71 #ifndef U_HIDE_INTERNAL_API
    72 /**
    73  *   RegexPatternDump   Debug function, displays the compiled form of a pattern.
    74  *   @internal
    75  */
    76 #ifdef REGEX_DEBUG
    77 U_INTERNAL void U_EXPORT2
    78     RegexPatternDump(const RegexPattern *pat);
    79 #else   /* REGEX_DEBUG not defined: RegexPatternDump(pat) is an empty macro */
    80     #undef RegexPatternDump
    81     #define RegexPatternDump(pat)
    82 #endif  /* REGEX_DEBUG */
    83 #endif  /* U_HIDE_INTERNAL_API */
    87 /**
    88   * Class <code>RegexPattern</code> represents a compiled regular expression.  It includes
    89   * factory methods for creating a RegexPattern object from the source (string) form
    90   * of a regular expression, methods for creating RegexMatchers that allow the pattern
    91   * to be applied to input text, and a few convenience methods for simple common
    92   * uses of regular expressions.
    93   *
    94   * <p>Class RegexPattern is not intended to be subclassed.</p>
    95   *
    96   * @stable ICU 2.4
    97   */
    98 class U_I18N_API RegexPattern: public UObject {
    99 public:
   101     /**
   102      * default constructor.  Create a RegexPattern object that refers to no actual
   103      *   pattern.  Not normally needed; RegexPattern objects are usually
   104      *   created using the factory method <code>compile()</code>.
   105      *
   106      * @stable ICU 2.4
   107      */
   108     RegexPattern();
   110     /**
   111      * Copy Constructor.  Create a new RegexPattern object that is equivalent
   112      *                    to the source object.
   113      * @param source the pattern object to be copied.
   114      * @stable ICU 2.4
   115      */
   116     RegexPattern(const RegexPattern &source);
   118     /**
   119      * Destructor.  Note that a RegexPattern object must persist so long as any
   120      *  RegexMatcher objects that were created from the RegexPattern are active.
   121      * @stable ICU 2.4
   122      */
   123     virtual ~RegexPattern();
   125     /**
   126      * Comparison operator.  Two RegexPattern objects are considered equal if they
   127      * were constructed from identical source patterns using the same match flag
   128      * settings.
   129      * @param that a RegexPattern object to compare with "this".
   130      * @return TRUE if the objects are equivalent.
   131      * @stable ICU 2.4
   132      */
   133     UBool           operator==(const RegexPattern& that) const;
   135     /**
   136      * Inequality operator.  Two RegexPattern objects are considered different
   137      * unless they were constructed from identical source patterns using the
   138      * same match flag settings; this is the exact negation of operator==.
   139      * @param that a RegexPattern object to compare with "this".
   140      * @return TRUE if the objects are different.
   141      * @stable ICU 2.4
   142      */
   143     inline UBool    operator!=(const RegexPattern& that) const {return !this->operator==(that);}
   145     /**
   146      * Assignment operator.  After assignment, this RegexPattern will behave identically
   147      *     to the source object.
   148      * @stable ICU 2.4
   149      */
   150     RegexPattern  &operator =(const RegexPattern &source);
   152     /**
   153      * Create an exact copy of this RegexPattern object.  Since RegexPattern is not
   154      * intended to be subclassed, <code>clone()</code> and the copy construction are
   155      * equivalent operations.
   156      * @return the copy of this RegexPattern
   157      * @stable ICU 2.4
   158      */
   159     virtual RegexPattern  *clone() const;
   162    /**
   163     * Compiles the regular expression in string form into a RegexPattern
   164     * object.  These compile methods, rather than the constructors, are the usual
   165     * way that RegexPattern objects are created.
   166     *
   167     * <p>Note that RegexPattern objects must not be deleted while RegexMatcher
   168     * objects created from the pattern are active.  RegexMatchers keep a pointer
   169     * back to their pattern, so premature deletion of the pattern is a
   170     * catastrophic error.</p>
   171     *
   172     * <p>All pattern match mode flags are set to their default values.</p>
   173     *
   174     * <p>Note that it is often more convenient to construct a RegexMatcher directly
   175     *    from a pattern string rather than separately compiling the pattern and
   176     *    then creating a RegexMatcher object from the pattern.</p>
   177     *
   178     * @param regex The regular expression to be compiled.
   179     * @param pe    Receives the position (line and column numbers) of any error
   180     *              within the regular expression.
   181     * @param status A reference to a UErrorCode to receive any errors.
   182     * @return      A RegexPattern object for the compiled pattern.
   183     *
   184     * @stable ICU 2.4
   185     */
   186     static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
   187         UParseError          &pe,
   188         UErrorCode           &status);
   190    /**
   191     * Compiles the regular expression in string form into a RegexPattern
   192     * object.  These compile methods, rather than the constructors, are the usual
   193     * way that RegexPattern objects are created.
   194     *
   195     * <p>Note that RegexPattern objects must not be deleted while RegexMatcher
   196     * objects created from the pattern are active.  RegexMatchers keep a pointer
   197     * back to their pattern, so premature deletion of the pattern is a
   198     * catastrophic error.</p>
   199     *
   200     * <p>All pattern match mode flags are set to their default values.</p>
   201     *
   202     * <p>Note that it is often more convenient to construct a RegexMatcher directly
   203     *    from a pattern string rather than separately compiling the pattern and
   204     *    then creating a RegexMatcher object from the pattern.</p>
   205     *
   206     * @param regex The regular expression to be compiled. Note, the text referred
   207     *              to by this UText must not be deleted during the lifetime of the
   208     *              RegexPattern object or any RegexMatcher object created from it.
   209     * @param pe    Receives the position (line and column numbers) of any error
   210     *              within the regular expression.
   211     * @param status A reference to a UErrorCode to receive any errors.
   212     * @return      A RegexPattern object for the compiled pattern.
   213     *
   214     * @stable ICU 4.6
   215     */
   216     static RegexPattern * U_EXPORT2 compile( UText *regex,
   217         UParseError          &pe,
   218         UErrorCode           &status);
   220    /**
   221     * Compiles the regular expression in string form into a RegexPattern
   222     * object using the specified match mode flags.  These compile methods,
   223     * rather than the constructors, are the usual way that RegexPattern objects
   224     * are created.
   225     *
   226     * <p>Note that RegexPattern objects must not be deleted while RegexMatcher
   227     * objects created from the pattern are active.  RegexMatchers keep a pointer
   228     * back to their pattern, so premature deletion of the pattern is a
   229     * catastrophic error.</p>
   230     *
   231     * <p>Note that it is often more convenient to construct a RegexMatcher directly
   232     *    from a pattern string rather than separately compiling the pattern and
   233     *    then creating a RegexMatcher object from the pattern.</p>
   234     *
   235     * @param regex The regular expression to be compiled.
   236     * @param flags The match mode flags to be used.
   237     * @param pe    Receives the position (line and column numbers) of any error
   238     *              within the regular expression.
   239     * @param status   A reference to a UErrorCode to receive any errors.
   240     * @return      A RegexPattern object for the compiled pattern.
   241     *
   242     * @stable ICU 2.4
   243     */
   244     static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
   245         uint32_t             flags,
   246         UParseError          &pe,
   247         UErrorCode           &status);
   249    /**
   250     * Compiles the regular expression in string form into a RegexPattern
   251     * object using the specified match mode flags.  These compile methods,
   252     * rather than the constructors, are the usual way that RegexPattern objects
   253     * are created.
   254     *
   255     * <p>Note that RegexPattern objects must not be deleted while RegexMatcher
   256     * objects created from the pattern are active.  RegexMatchers keep a pointer
   257     * back to their pattern, so premature deletion of the pattern is a
   258     * catastrophic error.</p>
   259     *
   260     * <p>Note that it is often more convenient to construct a RegexMatcher directly
   261     *    from a pattern string rather than separately compiling the pattern and
   262     *    then creating a RegexMatcher object from the pattern.</p>
   263     *
   264     * @param regex The regular expression to be compiled. Note, the text referred
   265     *              to by this UText must not be deleted during the lifetime of the
   266     *              RegexPattern object or any RegexMatcher object created from it.
   267     * @param flags The match mode flags to be used.
   268     * @param pe    Receives the position (line and column numbers) of any error
   269     *              within the regular expression.
   270     * @param status   A reference to a UErrorCode to receive any errors.
   271     * @return      A RegexPattern object for the compiled pattern.
   272     *
   273     * @stable ICU 4.6
   274     */
   275     static RegexPattern * U_EXPORT2 compile( UText *regex,
   276         uint32_t             flags,
   277         UParseError          &pe,
   278         UErrorCode           &status);
   280    /**
   281     * Compiles the regular expression in string form into a RegexPattern
   282     * object using the specified match mode flags.  These compile methods,
   283     * rather than the constructors, are the usual way that RegexPattern objects
   284     * are created.
   285     *
   286     * <p>Note that RegexPattern objects must not be deleted while RegexMatcher
   287     * objects created from the pattern are active.  RegexMatchers keep a pointer
   288     * back to their pattern, so premature deletion of the pattern is a
   289     * catastrophic error.</p>
   290     *
   291     * <p>Note that it is often more convenient to construct a RegexMatcher directly
   292     *    from a pattern string rather than separately compiling the pattern and
   293     *    then creating a RegexMatcher object from the pattern.</p>
   294     *
   295     * @param regex The regular expression to be compiled.
   296     * @param flags The match mode flags to be used.
   297     * @param status   A reference to a UErrorCode to receive any errors.
   298     * @return      A RegexPattern object for the compiled pattern.
   299     *
   300     * @stable ICU 2.6
   301     */
   302     static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
   303         uint32_t             flags,
   304         UErrorCode           &status);
   306    /**
   307     * Compiles the regular expression in string form into a RegexPattern
   308     * object using the specified match mode flags.  These compile methods,
   309     * rather than the constructors, are the usual way that RegexPattern objects
   310     * are created.
   311     *
   312     * <p>Note that RegexPattern objects must not be deleted while RegexMatcher
   313     * objects created from the pattern are active.  RegexMatchers keep a pointer
   314     * back to their pattern, so premature deletion of the pattern is a
   315     * catastrophic error.</p>
   316     *
   317     * <p>Note that it is often more convenient to construct a RegexMatcher directly
   318     *    from a pattern string rather than separately compiling the pattern and
   319     *    then creating a RegexMatcher object from the pattern.</p>
   320     *
   321     * @param regex The regular expression to be compiled. Note, the text referred
   322     *              to by this UText must not be deleted during the lifetime of the
   323     *              RegexPattern object or any RegexMatcher object created from it.
   324     * @param flags The match mode flags to be used.
   325     * @param status   A reference to a UErrorCode to receive any errors.
   326     * @return      A RegexPattern object for the compiled pattern.
   327     *
   328     * @stable ICU 4.6
   329     */
   330     static RegexPattern * U_EXPORT2 compile( UText *regex,
   331         uint32_t             flags,
   332         UErrorCode           &status);
   334    /**
   335     * Get the match mode flags that were used when compiling this pattern.
   336     * @return  the match mode flags
   337     * @stable ICU 2.4
   338     */
   339     virtual uint32_t flags() const;
   341    /**
   342     * Creates a RegexMatcher that will match the given input against this pattern.  The
   343     * RegexMatcher can then be used to perform match, find or replace operations
   344     * on the input.  Note that a RegexPattern object must not be deleted while
   345     * RegexMatchers created from it still exist and might possibly be used again.
   346     * <p>
   347     * The matcher will retain a reference to the supplied input string, and all regexp
   348     * pattern matching operations happen directly on this original string.  It is
   349     * critical that the string not be altered or deleted before use by the regular
   350     * expression operations is complete.
   351     *
   352     * @param input    The input string to which the regular expression will be applied.
   353     * @param status   A reference to a UErrorCode to receive any errors.
   354     * @return         A RegexMatcher object for this pattern and input.
   355     *
   356     * @stable ICU 2.4
   357     */
   358     virtual RegexMatcher *matcher(const UnicodeString &input,
   359         UErrorCode          &status) const;
   361 private:
   362     /**
   363      * Cause a compilation error if an application accidentally attempts to
   364      *   create a matcher with a (UChar *) string as input rather than
   365      *   a UnicodeString.  Avoids a dangling reference to a temporary string.
   366      * <p>
   367      * To efficiently work with UChar *strings, wrap the data in a UnicodeString
   368      * using one of the aliasing constructors, such as
   369      * <code>UnicodeString(UBool isTerminated, const UChar *text, int32_t textLength);</code>
   370      * or in a UText, using
   371      * <code>utext_openUChars(UText *ut, const UChar *text, int64_t textLength, UErrorCode *status);</code>
   372      *
   373      */
   374     RegexMatcher *matcher(const UChar *input,
   375         UErrorCode          &status) const;
   376 public:
   379    /**
   380     * Creates a RegexMatcher that will match against this pattern.  The
   381     * RegexMatcher can be used to perform match, find or replace operations.
   382     * Note that a RegexPattern object must not be deleted while
   383     * RegexMatchers created from it still exist and might possibly be used again.
   384     *
   385     * @param status   A reference to a UErrorCode to receive any errors.
   386     * @return      A RegexMatcher object for this pattern; this overload takes no input.
   387     *
   388     * @stable ICU 2.6
   389     */
   390     virtual RegexMatcher *matcher(UErrorCode  &status) const;
   393    /**
   394     * Test whether a string matches a regular expression.  This convenience function
   395     * both compiles the regular expression and applies it in a single operation.
   396     * Note that if the same pattern needs to be applied repeatedly, this method will be
   397     * less efficient than creating and reusing a RegexMatcher object.
   398     *
   399     * @param regex The regular expression
   400     * @param input The string data to be matched
   401     * @param pe Receives the position of any syntax errors within the regular expression
   402     * @param status A reference to a UErrorCode to receive any errors.
   403     * @return True if the regular expression exactly matches the full input string.
   404     *
   405     * @stable ICU 2.4
   406     */
   407     static UBool U_EXPORT2 matches(const UnicodeString   &regex,
   408         const UnicodeString   &input,
   409               UParseError     &pe,
   410               UErrorCode      &status);
   412    /**
   413     * Test whether a string matches a regular expression.  This convenience function
   414     * both compiles the regular expression and applies it in a single operation.
   415     * Note that if the same pattern needs to be applied repeatedly, this method will be
   416     * less efficient than creating and reusing a RegexMatcher object.
   417     *
   418     * @param regex The regular expression
   419     * @param input The string data to be matched
   420     * @param pe Receives the position of any syntax errors within the regular expression
   421     * @param status A reference to a UErrorCode to receive any errors.
   422     * @return True if the regular expression exactly matches the full input string.
   423     *
   424     * @stable ICU 4.6
   425     */
   426     static UBool U_EXPORT2 matches(UText *regex,
   427         UText           *input,
   428         UParseError     &pe,
   429         UErrorCode      &status);
   431    /**
   432     * Returns the regular expression from which this pattern was compiled. This method will work
   433     * even if the pattern was compiled from a UText.
   434     *
   435     * Note: If the pattern was originally compiled from a UText, and that UText was modified,
   436     * the returned string may no longer reflect the RegexPattern object.
   437     * @stable ICU 2.4
   438     */
   439     virtual UnicodeString pattern() const;
   442    /**
   443     * Returns the regular expression from which this pattern was compiled. This method will work
   444     * even if the pattern was compiled from a UnicodeString.
   445     *
   446     * Note: This is the original input, not a clone. If the pattern was originally compiled from a
   447     * UText, and that UText was modified, the returned UText may no longer reflect the RegexPattern
   448     * object.
   449     * @param status   A reference to a UErrorCode to receive any errors.
   450     * @stable ICU 4.6
   451     */
   452     virtual UText *patternText(UErrorCode      &status) const;
   455     /**
   456      * Split a string into fields.  Somewhat like split() from Perl or Java.
   457      * Pattern matches identify delimiters that separate the input
   458      * into fields.  The input data between the delimiters becomes the
   459      * fields themselves.
   460      *
   461      * If the delimiter pattern includes capture groups, the captured text will
   462      * also appear in the destination array of output strings, interspersed
   463      * with the fields.  This is similar to Perl, but differs from Java, 
   464      * which ignores the presence of capture groups in the pattern.
   465      * 
   466      * Trailing empty fields will always be returned, assuming sufficient
   467      * destination capacity.  This differs from the default behavior for Java
   468      * and Perl where trailing empty fields are not returned.
   469      *
   470      * The number of strings produced by the split operation is returned.
   471      * This count includes the strings from capture groups in the delimiter pattern.
   472      * This behavior differs from Java, which ignores capture groups.
   473      *
   474      * For the best performance on split() operations,
   475      * <code>RegexMatcher::split</code> is preferable to this function
   476      *
   477      * @param input   The string to be split into fields.  The field delimiters
   478      *                match the pattern (in the "this" object)
   479      * @param dest    An array of UnicodeStrings to receive the results of the split.
   480      *                This is an array of actual UnicodeString objects, not an
   481      *                array of pointers to strings.  Local (stack based) arrays can
   482      *                work well here.
   483      * @param destCapacity  The number of elements in the destination array.
   484      *                If the number of fields found is less than destCapacity, the
   485      *                extra strings in the destination array are not altered.
   486      *                If the number of destination strings is less than the number
   487      *                of fields, the trailing part of the input string, including any
   488      *                field delimiters, is placed in the last destination string.
   489      * @param status  A reference to a UErrorCode to receive any errors.
   490      * @return        The number of fields into which the input string was split.
   491      * @stable ICU 2.4
   492      */
   493     virtual int32_t  split(const UnicodeString &input,
   494         UnicodeString    dest[],
   495         int32_t          destCapacity,
   496         UErrorCode       &status) const;
   499     /**
   500      * Split a string into fields.  Somewhat like split() from Perl or Java.
   501      * Pattern matches identify delimiters that separate the input
   502      * into fields.  The input data between the delimiters becomes the
   503      * fields themselves.
   504      *
   505      * If the delimiter pattern includes capture groups, the captured text will
   506      * also appear in the destination array of output strings, interspersed
   507      * with the fields.  This is similar to Perl, but differs from Java, 
   508      * which ignores the presence of capture groups in the pattern.
   509      * 
   510      * Trailing empty fields will always be returned, assuming sufficient
   511      * destination capacity.  This differs from the default behavior for Java
   512      * and Perl where trailing empty fields are not returned.
   513      *
   514      * The number of strings produced by the split operation is returned.
   515      * This count includes the strings from capture groups in the delimiter pattern.
   516      * This behavior differs from Java, which ignores capture groups.
   517      *
   518      *  For the best performance on split() operations,
   519      *  <code>RegexMatcher::split</code> is preferable to this function
   520      *
   521      * @param input   The string to be split into fields.  The field delimiters
   522      *                match the pattern (in the "this" object)
   523      * @param dest    An array of mutable UText structs to receive the results of the split.
   524      *                If a field is NULL, a new UText is allocated to contain the results for
   525      *                that field. This new UText is not guaranteed to be mutable.
   526      * @param destCapacity  The number of elements in the destination array.
   527      *                If the number of fields found is less than destCapacity, the
   528      *                extra strings in the destination array are not altered.
   529      *                If the number of destination strings is less than the number
   530      *                of fields, the trailing part of the input string, including any
   531      *                field delimiters, is placed in the last destination string.
   532      * @param status  A reference to a UErrorCode to receive any errors.
   533      * @return        The number of destination strings used.  
   534      *
   535      * @stable ICU 4.6
   536      */
   537     virtual int32_t  split(UText *input,
   538         UText            *dest[],
   539         int32_t          destCapacity,
   540         UErrorCode       &status) const;
   543     /**
   544      * ICU "poor man's RTTI", returns a UClassID for the actual class.
   545      *
   546      * @stable ICU 2.4
   547      */
   548     virtual UClassID getDynamicClassID() const;
   550     /**
   551      * ICU "poor man's RTTI", returns a UClassID for this class.
   552      *
   553      * @stable ICU 2.4
   554      */
   555     static UClassID U_EXPORT2 getStaticClassID();
   557 private:
   558     //
   559     //  Implementation Data
   560     //
   561     UText          *fPattern;      // The original pattern string.
   562     UnicodeString  *fPatternString; // The original pattern UnicodeString if relevant
   563     uint32_t        fFlags;        // The flags used when compiling the pattern.
   564                                    //
   565     UVector64       *fCompiledPat; // The compiled pattern p-code.
   566     UnicodeString   fLiteralText;  // Any literal string data from the pattern,
   567                                    //   after un-escaping, for use during the match.
   569     UVector         *fSets;        // Any UnicodeSets referenced from the pattern.
   570     Regex8BitSet    *fSets8;       //      (and fast sets for latin-1 range.)
   573     UErrorCode      fDeferredStatus; // status if some prior error has left this
   574                                    //  RegexPattern in an unusable state.
   576     int32_t         fMinMatchLen;  // Minimum Match Length.  All matches will have length
   577                                    //   >= this value.  For some patterns, this calculated
   578                                    //   value may be less than the true shortest
   579                                    //   possible match.
   581     int32_t         fFrameSize;    // Size of a state stack frame in the
   582                                    //   execution engine.
   584     int32_t         fDataSize;     // The size of the data needed by the pattern that
   585                                    //   does not go on the state stack, but has just
   586                                    //   a single copy per matcher.
   588     UVector32       *fGroupMap;    // Map from capture group number to position of
   589                                    //   the group's variables in the matcher stack frame.
   591     int32_t         fMaxCaptureDigits;
   593     UnicodeSet     **fStaticSets;  // Ptr to static (shared) sets for predefined
   594                                    //   regex character classes, e.g. Word.
   596     Regex8BitSet   *fStaticSets8;  // Ptr to the static (shared) latin-1 only
   597                                    //  sets for predefined regex classes.
   599     int32_t         fStartType;    // Info on how a match must start.
   600     int32_t         fInitialStringIdx;     //
   601     int32_t         fInitialStringLen;
   602     UnicodeSet     *fInitialChars;
   603     UChar32         fInitialChar;
   604     Regex8BitSet   *fInitialChars8;
   605     UBool           fNeedsAltInput;
   607     friend class RegexCompile;
   608     friend class RegexMatcher;
   609     friend class RegexCImpl;
   611     //
   612     //  Implementation Methods
   613     //
   614     void        init();            // Common initialization, for use by constructors.
   615     void        zap();             // Common cleanup
   616 #ifdef REGEX_DEBUG
   617     void        dumpOp(int32_t index) const;
   618     friend     void U_EXPORT2 RegexPatternDump(const RegexPattern *);
   619 #endif
   621 };
   625 /**
   626  *  class RegexMatcher bundles together a regular expression pattern and
   627  *  input text to which the expression can be applied.  It includes methods
   628  *  for testing for matches, and for find and replace operations.
   629  *
   630  * <p>Class RegexMatcher is not intended to be subclassed.</p>
   631  *
   632  * @stable ICU 2.4
   633  */
   634 class U_I18N_API RegexMatcher: public UObject {
   635 public:
   637     /**
   638       * Construct a RegexMatcher for a regular expression.
   639       * This is a convenience method that avoids the need to explicitly create
   640       * a RegexPattern object.  Note that if several RegexMatchers need to be
   641       * created for the same expression, it will be more efficient to
   642       * separately create and cache a RegexPattern object, and use
   643       * its matcher() method to create the RegexMatcher objects.
   644       *
   645       *  @param regexp The Regular Expression to be compiled.
   646       *  @param flags  Regular expression options, such as case insensitive matching.
   647       *                @see UREGEX_CASE_INSENSITIVE
   648       *  @param status Any errors are reported by setting this UErrorCode variable.
   649       *  @stable ICU 2.6
   650       */
   651     RegexMatcher(const UnicodeString &regexp, uint32_t flags, UErrorCode &status);
   653     /**
   654       * Construct a RegexMatcher for a regular expression.
   655       * This is a convenience method that avoids the need to explicitly create
   656       * a RegexPattern object.  Note that if several RegexMatchers need to be
   657       * created for the same expression, it will be more efficient to
   658       * separately create and cache a RegexPattern object, and use
   659       * its matcher() method to create the RegexMatcher objects.
   660       *
   661       *  @param regexp The regular expression to be compiled.
   662       *  @param flags  Regular expression options, such as case insensitive matching.
   663       *                @see UREGEX_CASE_INSENSITIVE
   664       *  @param status Any errors are reported by setting this UErrorCode variable.
   665       *
   666       *  @stable ICU 4.6
   667       */
   668     RegexMatcher(UText *regexp, uint32_t flags, UErrorCode &status);
   670     /**
   671       * Construct a RegexMatcher for a regular expression.
   672       * This is a convenience method that avoids the need to explicitly create
   673       * a RegexPattern object.  Note that if several RegexMatchers need to be
   674       * created for the same expression, it will be more efficient to
   675       * separately create and cache a RegexPattern object, and use
   676       * its matcher() method to create the RegexMatcher objects.
   677       * <p>
   678       * The matcher will retain a reference to the supplied input string, and all regexp
   679       * pattern matching operations happen directly on the original string.  It is
   680       * critical that the string not be altered or deleted before use by the regular
   681       * expression operations is complete.
   682       *
   683       *  @param regexp The Regular Expression to be compiled.
   684       *  @param input  The string to match.  The matcher retains a reference to the
   685       *                caller's string; no copy is made.
   686       *  @param flags  Regular expression options, such as case insensitive matching.
   687       *                @see UREGEX_CASE_INSENSITIVE
   688       *  @param status Any errors are reported by setting this UErrorCode variable.
   689       *  @stable ICU 2.6
   690       */
   691     RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
   692         uint32_t flags, UErrorCode &status);
   694     /**
   695       * Construct a RegexMatcher for a regular expression.
   696       * This is a convenience method that avoids the need to explicitly create
   697       * a RegexPattern object.  Note that if several RegexMatchers need to be
   698       * created for the same expression, it will be more efficient to
   699       * separately create and cache a RegexPattern object, and use
   700       * its matcher() method to create the RegexMatcher objects.
   701       * <p>
   702       * The matcher will make a shallow clone of the supplied input text, and all regexp
   703       * pattern matching operations happen on this clone.  While read-only operations on
   704       * the supplied text are permitted, it is critical that the underlying string not be
   705       * altered or deleted before use by the regular expression operations is complete.
   706       *
   707       *  @param regexp The Regular Expression to be compiled.
   708       *  @param input  The string to match.  The matcher retains a shallow clone of the text.
   709       *  @param flags  Regular expression options, such as case insensitive matching.
   710       *                @see UREGEX_CASE_INSENSITIVE
   711       *  @param status Any errors are reported by setting this UErrorCode variable.
   712       *
   713       *  @stable ICU 4.6
   714       */
   715     RegexMatcher(UText *regexp, UText *input,
   716         uint32_t flags, UErrorCode &status);
   718 private:
   719     /**
   720      * Cause a compilation error if an application accidentally attempts to
   721      *   create a matcher with a (UChar *) string as input rather than
   722      *   a UnicodeString.    Avoids a dangling reference to a temporary string.
   723      * <p>
   724      * To efficiently work with UChar *strings, wrap the data in a UnicodeString
   725      * using one of the aliasing constructors, such as
   726      * <code>UnicodeString(UBool isTerminated, const UChar *text, int32_t textLength);</code>
   727      * or in a UText, using
   728      * <code>utext_openUChars(UText *ut, const UChar *text, int64_t textLength, UErrorCode *status);</code>
   729      *
   730      */
   731     RegexMatcher(const UnicodeString &regexp, const UChar *input,
   732         uint32_t flags, UErrorCode &status);
   733 public:
   736    /**
   737     *   Destructor.
   738     *
   739     *  @stable ICU 2.4
   740     */
   741     virtual ~RegexMatcher();
   744    /**
   745     *   Attempts to match the entire input region against the pattern.
   746     *    @param   status     A reference to a UErrorCode to receive any errors.
   747     *    @return TRUE if there is a match
   748     *    @stable ICU 2.4
   749     */
   750     virtual UBool matches(UErrorCode &status);
   753    /**
   754     *   Resets the matcher, then attempts to match the input beginning 
   755     *   at the specified startIndex, and extending to the end of the input.
   756     *   The input region is reset to include the entire input string.
   757     *   A successful match must extend to the end of the input.
   758     *    @param   startIndex The input string (native) index at which to begin matching.
   759     *    @param   status     A reference to a UErrorCode to receive any errors.
   760     *    @return TRUE if there is a match
   761     *    @stable ICU 2.8
   762     */
   763     virtual UBool matches(int64_t startIndex, UErrorCode &status);
   766    /**
   767     *   Attempts to match the input string, starting from the beginning of the region,
   768     *   against the pattern.  Like the matches() method, this function 
   769     *   always starts at the beginning of the input region;
   770     *   unlike that function, it does not require that the entire region be matched.
   771     *
   772     *   <p>If the match succeeds then more information can be obtained via the <code>start()</code>,
   773     *     <code>end()</code>, and <code>group()</code> functions.</p>
   774     *
   775     *    @param   status     A reference to a UErrorCode to receive any errors.
   776     *    @return  TRUE if there is a match at the start of the input region.
   777     *    @stable ICU 2.4
   778     */
   779     virtual UBool lookingAt(UErrorCode &status);
   782   /**
   783     *   Attempts to match the input string, starting from the specified index, against the pattern.
   784     *   The match may be of any length, and is not required to extend to the end
   785     *   of the input string.  Contrast with matches().
   786     *
   787     *   <p>If the match succeeds then more information can be obtained via the <code>start()</code>,
   788     *     <code>end()</code>, and <code>group()</code> functions.</p>
   789     *
   790     *    @param   startIndex The input string (native) index at which to begin matching.
   791     *    @param   status     A reference to a UErrorCode to receive any errors.
   792     *    @return  TRUE if there is a match.
   793     *    @stable ICU 2.8
   794     */
   795     virtual UBool lookingAt(int64_t startIndex, UErrorCode &status);
   798    /**
   799     *  Find the next pattern match in the input string.
   800     *  The find begins searching the input at the location following the end of
   801     *  the previous match, or at the start of the string if there is no previous match.
   802     *  If a match is found, <code>start(), end()</code> and <code>group()</code>
   803     *  will provide more information regarding the match.
   804     *  <p>Note that if the input string is changed by the application,
   805     *     use find(startPos, status) instead of find(), because the saved starting
   806     *     position may not be valid with the altered input string.</p>
   807     *  @return  TRUE if a match is found.
   808     *  @stable ICU 2.4
   809     */
   810     virtual UBool find();
   813    /**
   814     *   Resets this RegexMatcher and then attempts to find the next substring of the
   815     *   input string that matches the pattern, starting at the specified index.
   816     *
   817     *   @param   start     The (native) index in the input string to begin the search.
   818     *   @param   status    A reference to a UErrorCode to receive any errors.
   819     *   @return  TRUE if a match is found.
   820     *   @stable ICU 2.4
   821     */
   822     virtual UBool find(int64_t start, UErrorCode &status);
   825    /**
   826     *   Returns a string containing the text matched by the previous match.
   827     *   If the pattern can match an empty string, an empty string may be returned.
   828     *   @param   status      A reference to a UErrorCode to receive any errors.
   829     *                        Possible errors are  U_REGEX_INVALID_STATE if no match
   830     *                        has been attempted or the last match failed.
   831     *   @return  a string containing the matched input text.
   832     *   @stable ICU 2.4
   833     */
   834     virtual UnicodeString group(UErrorCode &status) const;
   837    /**
   838     *    Returns a string containing the text captured by the given group
   839     *    during the previous match operation.  Group(0) is the entire match.
   840     *
   841     *    @param groupNum the capture group number
   842     *    @param   status     A reference to a UErrorCode to receive any errors.
   843     *                        Possible errors are  U_REGEX_INVALID_STATE if no match
   844     *                        has been attempted or the last match failed and
   845     *                        U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number.
   846     *    @return the captured text
   847     *    @stable ICU 2.4
   848     */
   849     virtual UnicodeString group(int32_t groupNum, UErrorCode &status) const;
   852    /**
   853     *   Returns the number of capturing groups in this matcher's pattern.
   854     *   @return the number of capture groups
   855     *   @stable ICU 2.4
   856     */
   857     virtual int32_t groupCount() const;
   860    /**
   861     *   Returns a shallow clone of the entire live input string with the UText current native index
   862     *   set to the beginning of the requested group.
   863     *
   864     *   @param   dest        The UText into which the input should be cloned, or NULL to create a new UText
   865     *   @param   group_len   A reference to receive the length of the desired capture group
   866     *   @param   status      A reference to a UErrorCode to receive any errors.
   867     *                        Possible errors are  U_REGEX_INVALID_STATE if no match
   868     *                        has been attempted or the last match failed and
   869     *                        U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number.
   870     *   @return dest if non-NULL, a shallow copy of the input text otherwise
   871     *
   872     *   @stable ICU 4.6
   873     */
   874     virtual UText *group(UText *dest, int64_t &group_len, UErrorCode &status) const; 
   876    /**
   877     *   Returns a shallow clone of the entire live input string with the UText current native index
   878     *   set to the beginning of the requested group.
   879     *
   880     *   @param   groupNum   The capture group number.
   881     *   @param   dest        The UText into which the input should be cloned, or NULL to create a new UText.
   882     *   @param   group_len   A reference to receive the length of the desired capture group
   883     *   @param   status      A reference to a UErrorCode to receive any errors.
   884     *                        Possible errors are  U_REGEX_INVALID_STATE if no match
   885     *                        has been attempted or the last match failed and
   886     *                        U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number.
   887     *   @return dest if non-NULL, a shallow copy of the input text otherwise
   888     *
   889     *   @stable ICU 4.6
   890     */
   891     virtual UText *group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const;
   893    /**
   894     *   Returns a string containing the text captured by the given group
   895     *   during the previous match operation.  Group(0) is the entire match.
   896     *
   897     *   @param   groupNum    the capture group number
   898     *   @param   dest        A mutable UText in which the matching text is placed.
   899     *                        If NULL, a new UText will be created (which may not be mutable).
   900     *   @param   status      A reference to a UErrorCode to receive any errors.
   901     *                        Possible errors are  U_REGEX_INVALID_STATE if no match
   902     *                        has been attempted or the last match failed.
   903     *   @return  A string containing the matched input text. If a pre-allocated UText
   904     *            was provided, it will always be used and returned.
   905     *
   906     *   @internal ICU 4.4 technology preview
   907     */
   908     virtual UText *group(int32_t groupNum, UText *dest, UErrorCode &status) const;
   911    /**
   912     *   Returns the index in the input string of the start of the text matched
   913     *   during the previous match operation.
   914     *    @param   status      a reference to a UErrorCode to receive any errors.
   915     *    @return              The (native) position in the input string of the start of the last match.
   916     *    @stable ICU 2.4
   917     */
   918     virtual int32_t start(UErrorCode &status) const;
   920    /**
   921     *   Returns the index in the input string of the start of the text matched
   922     *   during the previous match operation.
   923     *    @param   status      a reference to a UErrorCode to receive any errors.
   924     *    @return              The (native) position in the input string of the start of the last match.
   925     *   @stable ICU 4.6
   926     */
   927     virtual int64_t start64(UErrorCode &status) const;
   930    /**
   931     *   Returns the index in the input string of the start of the text matched by the
   932     *    specified capture group during the previous match operation.  Return -1 if
   933     *    the capture group exists in the pattern, but was not part of the last match.
   934     *
   935     *    @param  group       the capture group number
   936     *    @param  status      A reference to a UErrorCode to receive any errors.  Possible
   937     *                        errors are  U_REGEX_INVALID_STATE if no match has been
   938     *                        attempted or the last match failed, and
   939     *                        U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number
   940     *    @return the (native) start position of substring matched by the specified group.
   941     *    @stable ICU 2.4
   942     */
   943     virtual int32_t start(int32_t group, UErrorCode &status) const;
   945    /**
   946     *   Returns the index in the input string of the start of the text matched by the
   947     *    specified capture group during the previous match operation.  Return -1 if
   948     *    the capture group exists in the pattern, but was not part of the last match.
   949     *
   950     *    @param  group       the capture group number.
   951     *    @param  status      A reference to a UErrorCode to receive any errors.  Possible
   952     *                        errors are  U_REGEX_INVALID_STATE if no match has been
   953     *                        attempted or the last match failed, and
   954     *                        U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number.
   955     *    @return the (native) start position of substring matched by the specified group.
   956     *    @stable ICU 4.6
   957     */
   958     virtual int64_t start64(int32_t group, UErrorCode &status) const;
   961    /**
   962     *    Returns the index in the input string of the first character following the
   963     *    text matched during the previous match operation.
   964     *
   965     *   @param   status      A reference to a UErrorCode to receive any errors.  Possible
   966     *                        errors are  U_REGEX_INVALID_STATE if no match has been
   967     *                        attempted or the last match failed.
   968     *    @return the index of the last character matched, plus one.
   969     *                        The index value returned is a native index, corresponding to
   970     *                        code units for the underlying encoding type, for example,
   971     *                        a byte index for UTF-8.
   972     *   @stable ICU 2.4
   973     */
   974     virtual int32_t end(UErrorCode &status) const;
   976    /**
   977     *    Returns the index in the input string of the first character following the
   978     *    text matched during the previous match operation.
   979     *
   980     *   @param   status      A reference to a UErrorCode to receive any errors.  Possible
   981     *                        errors are  U_REGEX_INVALID_STATE if no match has been
   982     *                        attempted or the last match failed.
   983     *    @return the index of the last character matched, plus one.
   984     *                        The index value returned is a native index, corresponding to
   985     *                        code units for the underlying encoding type, for example,
   986     *                        a byte index for UTF-8.
   987     *   @stable ICU 4.6
   988     */
   989     virtual int64_t end64(UErrorCode &status) const;
   992    /**
   993     *    Returns the index in the input string of the character following the
   994     *    text matched by the specified capture group during the previous match operation.
   995     *
   996     *    @param group  the capture group number
   997     *    @param   status      A reference to a UErrorCode to receive any errors.  Possible
   998     *                        errors are  U_REGEX_INVALID_STATE if no match has been
   999     *                        attempted or the last match failed and
  1000     *                        U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number
  1001     *    @return  the index of the first character following the text
  1002     *              captured by the specified group during the previous match operation.
  1003     *              Return -1 if the capture group exists in the pattern but was not part of the match.
  1004     *              The index value returned is a native index, corresponding to
  1005     *              code units for the underlying encoding type, for example,
  1006     *              a byte index for UTF-8.
  1007     *    @stable ICU 2.4
  1008     */
  1009     virtual int32_t end(int32_t group, UErrorCode &status) const;
  1011    /**
  1012     *    Returns the index in the input string of the character following the
  1013     *    text matched by the specified capture group during the previous match operation.
  1015     *    @param group  the capture group number
  1016     *    @param   status      A reference to a UErrorCode to receive any errors.  Possible
  1017     *                        errors are  U_REGEX_INVALID_STATE if no match has been
  1018     *                        attempted or the last match failed and
  1019     *                        U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number
  1020     *    @return  the index of the first character following the text
  1021     *              captured by the specified group during the previous match operation.
  1022     *              Return -1 if the capture group exists in the pattern but was not part of the match.
  1023     *              The index value returned is a native index, corresponding to
  1024     *              code units for the underlying encoding type, for example,
  1025     *              a byte index for UTF-8.
  1026     *   @stable ICU 4.6
  1027     */
  1028     virtual int64_t end64(int32_t group, UErrorCode &status) const;
  1031    /**
  1032     *   Resets this matcher.  The effect is to remove any memory of previous matches,
  1033     *       and to cause subsequent find() operations to begin at the beginning of
  1034     *       the input string.
  1036     *   @return this RegexMatcher.
  1037     *   @stable ICU 2.4
  1038     */
  1039     virtual RegexMatcher &reset();
  1042    /**
  1043     *   Resets this matcher, and sets the current input position.
  1044     *   The effect is to remove any memory of previous matches,
  1045     *       and to cause subsequent find() operations to begin at
  1046     *       the specified (native) position in the input string.
  1047     * <p>
  1048     *   The matcher's region is reset to its default, which is the entire
  1049     *   input string.
  1050     * <p>
  1051     *   An alternative to this function is to set a match region
  1052     *   beginning at the desired index.
  1054     *   @return this RegexMatcher.
  1055     *   @stable ICU 2.8
  1056     */
  1057     virtual RegexMatcher &reset(int64_t index, UErrorCode &status);
  1060    /**
  1061     *   Resets this matcher with a new input string.  This allows instances of RegexMatcher
  1062     *     to be reused, which is more efficient than creating a new RegexMatcher for
  1063     *     each input string to be processed.
  1064     *   @param input The new string on which subsequent pattern matches will operate.
  1065     *                The matcher retains a reference to the caller's string, and operates
  1066     *                directly on that.  Ownership of the string remains with the caller.
  1067     *                Because no copy of the string is made, it is essential that the
  1068     *                caller not delete the string until after regexp operations on it
  1069     *                are done.
  1070     *                Note that while a reset on the matcher with an input string that is then
  1071     *                modified across/during matcher operations may be supported currently for UnicodeString,
  1072     *                this was not originally intended behavior, and support for this is not guaranteed
  1073     *                in upcoming versions of ICU.
  1074     *   @return this RegexMatcher.
  1075     *   @stable ICU 2.4
  1076     */
  1077     virtual RegexMatcher &reset(const UnicodeString &input);
  1080    /**
  1081     *   Resets this matcher with a new input string.  This allows instances of RegexMatcher
  1082     *     to be reused, which is more efficient than creating a new RegexMatcher for
  1083     *     each input string to be processed.
  1084     *   @param input The new string on which subsequent pattern matches will operate.
  1085     *                The matcher makes a shallow clone of the given text; ownership of the
  1086     *                original string remains with the caller. Because no deep copy of the
  1087     *                text is made, it is essential that the caller not modify the string
  1088     *                until after regexp operations on it are done.
  1089     *   @return this RegexMatcher.
  1091     *   @stable ICU 4.6
  1092     */
  1093     virtual RegexMatcher &reset(UText *input);
  1096   /**
  1097     *  Set the subject text string upon which the regular expression is looking for matches
  1098     *  without changing any other aspect of the matching state.
  1099     *  The new and previous text strings must have the same content.
  1101     *  This function is intended for use in environments where ICU is operating on 
  1102     *  strings that may move around in memory.  It provides a mechanism for notifying
  1103     *  ICU that the string has been relocated, and providing a new UText to access the
  1104     *  string in its new position.
  1106     *  Note that the regular expression implementation never copies the underlying text
  1107     *  of a string being matched, but always operates directly on the original text 
  1108     *  provided by the user. Refreshing simply drops the references to the old text 
  1109     *  and replaces them with references to the new.
  1111     *  Caution:  this function is normally used only by very specialized,
  1112     *  system-level code.  One example use case is with garbage collection that moves
  1113     *  the text in memory.
  1115     * @param input      The new (moved) text string.
  1116     * @param status     Receives errors detected by this function.
  1118     * @stable ICU 4.8 
  1119     */
  1120     virtual RegexMatcher &refreshInputText(UText *input, UErrorCode &status);
  1122 private:
  1123     /**
  1124      * Cause a compilation error if an application accidentally attempts to
  1125      *   reset a matcher with a (UChar *) string as input rather than
  1126      *   a UnicodeString.    Avoids a dangling reference to a temporary string.
  1127      * <p>
  1128      * To efficiently work with UChar *strings, wrap the data in a UnicodeString
  1129      * using one of the aliasing constructors, such as
  1130      * <code>UnicodeString(UBool isTerminated, const UChar *text, int32_t textLength);</code>
  1131      * or in a UText, using
  1132      * <code>utext_openUChars(UText *ut, const UChar *text, int64_t textLength, UErrorCode *status);</code>
  1134      */
  1135     RegexMatcher &reset(const UChar *input);
  1136 public:
  1138    /**
  1139     *   Returns the input string being matched.  Ownership of the string belongs to
  1140     *   the matcher; it should not be altered or deleted. This method will work even if the input
  1141     *   was originally supplied as a UText.
  1142     *   @return the input string
  1143     *   @stable ICU 2.4
  1144     */
  1145     virtual const UnicodeString &input() const;
  1147    /**
  1148     *   Returns the input string being matched.  This is the live input text; it should not be
  1149     *   altered or deleted. This method will work even if the input was originally supplied as
  1150     *   a UnicodeString.
  1151     *   @return the input text
  1153     *   @stable ICU 4.6
  1154     */
  1155     virtual UText *inputText() const;
  1157    /**
  1158     *   Returns the input string being matched, either by copying it into the provided
  1159     *   UText parameter or by returning a shallow clone of the live input. Note that copying
  1160     *   the entire input may cause significant performance and memory issues.
  1161     *   @param dest The UText into which the input should be copied, or NULL to create a new UText
  1162     *   @param status error code
  1163     *   @return dest if non-NULL, a shallow copy of the input text otherwise
  1165     *   @stable ICU 4.6
  1166     */
  1167     virtual UText *getInput(UText *dest, UErrorCode &status) const;
  1170    /** Sets the limits of this matcher's region.
  1171      * The region is the part of the input string that will be searched to find a match.
  1172      * Invoking this method resets the matcher, and then sets the region to start
  1173      * at the index specified by the start parameter and end at the index specified
  1174      * by the end parameter.
  1176      * Depending on the transparency and anchoring being used (see useTransparentBounds
  1177      * and useAnchoringBounds), certain constructs such as anchors may behave differently
  1178      * at or around the boundaries of the region.
  1180      * The function will fail if start is greater than limit, or if either index
  1181      *  is less than zero or greater than the length of the string being matched.
  1183      * @param start  The (native) index to begin searches at.
  1184      * @param limit  The index to end searches at (exclusive).
  1185      * @param status A reference to a UErrorCode to receive any errors.
  1186      * @stable ICU 4.0
  1187      */
  1188      virtual RegexMatcher &region(int64_t start, int64_t limit, UErrorCode &status);
  1190    /** 
  1191      * Identical to region(start, limit, status) but also allows a start position without
  1192      *  resetting the region state.
  1193      * @param regionStart The region start
  1194      * @param regionLimit the limit of the region
  1195      * @param startIndex  The (native) index within the region bounds at which to begin searches.
  1196      * @param status A reference to a UErrorCode to receive any errors.
  1197      *                If startIndex is not within the specified region bounds, 
  1198      *                U_INDEX_OUTOFBOUNDS_ERROR is returned.
  1199      * @stable ICU 4.6
  1200      */
  1201      virtual RegexMatcher &region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status);
  1203    /**
  1204      * Reports the start index of this matcher's region. The searches this matcher
  1205      * conducts are limited to finding matches within regionStart (inclusive) and
  1206      * regionEnd (exclusive).
  1208      * @return The starting (native) index of this matcher's region.
  1209      * @stable ICU 4.0
  1210      */
  1211      virtual int32_t regionStart() const;
  1213    /**
  1214      * Reports the start index of this matcher's region. The searches this matcher
  1215      * conducts are limited to finding matches within regionStart (inclusive) and
  1216      * regionEnd (exclusive).
  1218      * @return The starting (native) index of this matcher's region.
  1219      * @stable ICU 4.6
  1220      */
  1221      virtual int64_t regionStart64() const;
  1224     /**
  1225       * Reports the end (limit) index (exclusive) of this matcher's region. The searches
  1226       * this matcher conducts are limited to finding matches within regionStart
  1227       * (inclusive) and regionEnd (exclusive).
  1229       * @return The ending point (native) of this matcher's region.
  1230       * @stable ICU 4.0
  1231       */
  1232       virtual int32_t regionEnd() const;
  1234    /**
  1235      * Reports the end (limit) index (exclusive) of this matcher's region. The searches
  1236      * this matcher conducts are limited to finding matches within regionStart
  1237      * (inclusive) and regionEnd (exclusive).
  1239      * @return The ending point (native) of this matcher's region.
  1240      * @stable ICU 4.6
  1241      */
  1242       virtual int64_t regionEnd64() const;
  1244     /**
  1245       * Queries the transparency of region bounds for this matcher.
  1246       * See useTransparentBounds for a description of transparent and opaque bounds.
  1247       * By default, a matcher uses opaque region boundaries.
  1249       * @return TRUE if this matcher is using transparent bounds, FALSE if it is not.
  1250       * @stable ICU 4.0
  1251       */
  1252       virtual UBool hasTransparentBounds() const;
  1254     /**
  1255       * Sets the transparency of region bounds for this matcher.
  1256       * Invoking this function with an argument of true will set this matcher to use transparent bounds.
  1257       * If the boolean argument is false, then opaque bounds will be used.
  1259       * Using transparent bounds, the boundaries of this matcher's region are transparent
  1260       * to lookahead, lookbehind, and boundary matching constructs. Those constructs can
  1261       * see text beyond the boundaries of the region while checking for a match.
  1263       * With opaque bounds, no text outside of the matcher's region is visible to lookahead,
  1264       * lookbehind, and boundary matching constructs.
  1266       * By default, a matcher uses opaque bounds.
  1268       * @param   b TRUE for transparent bounds; FALSE for opaque bounds
  1269       * @return  This Matcher
  1270       * @stable ICU 4.0
  1271       **/
  1272       virtual RegexMatcher &useTransparentBounds(UBool b);
  1275     /**
  1276       * Return true if this matcher is using anchoring bounds.
  1277       * By default, matchers use anchoring region bounds.
  1279       * @return TRUE if this matcher is using anchoring bounds.
  1280       * @stable ICU 4.0
  1281       */    
  1282       virtual UBool hasAnchoringBounds() const;
  1285     /**
  1286       * Set whether this matcher is using Anchoring Bounds for its region.
  1287       * With anchoring bounds, pattern anchors such as ^ and $ will match at the start
  1288       * and end of the region.  Without Anchoring Bounds, anchors will only match at
  1289       * the positions they would in the complete text.
  1291       * Anchoring Bounds are the default for regions.
  1293       * @param b TRUE to enable anchoring bounds; FALSE to disable them.
  1294       * @return  This Matcher
  1295       * @stable ICU 4.0
  1296       */
  1297       virtual RegexMatcher &useAnchoringBounds(UBool b);
  1300     /**
  1301       * Return TRUE if the most recent matching operation attempted to access
  1302       *  additional input beyond the available input text.
  1303       *  In this case, additional input text could change the results of the match.
  1305       *  hitEnd() is defined for both successful and unsuccessful matches.
  1306       *  In either case hitEnd() will return TRUE if the end of the text was
  1307       *  reached at any point during the matching process.
  1309       *  @return  TRUE if the most recent match hit the end of input
  1310       *  @stable ICU 4.0
  1311       */
  1312       virtual UBool hitEnd() const;
  1314     /**
  1315       * Return TRUE if the most recent match succeeded and additional input could cause
  1316       * it to fail. If this method returns false and a match was found, then more input
  1317       * might change the match but the match won't be lost. If a match was not found,
  1318       * then requireEnd has no meaning.
  1320       * @return TRUE if more input could cause the most recent match to no longer match.
  1321       * @stable ICU 4.0
  1322       */
  1323       virtual UBool requireEnd() const;
  1326    /**
  1327     *    Returns the pattern that is interpreted by this matcher.
  1328     *    @return  the RegexPattern for this RegexMatcher
  1329     *    @stable ICU 2.4
  1330     */
  1331     virtual const RegexPattern &pattern() const;
  1334    /**
  1335     *    Replaces every substring of the input that matches the pattern
  1336     *    with the given replacement string.  This is a convenience function that
  1337     *    provides a complete find-and-replace-all operation.
  1339     *    This method first resets this matcher. It then scans the input string
  1340     *    looking for matches of the pattern. Input that is not part of any
  1341     *    match is left unchanged; each match is replaced in the result by the
  1342     *    replacement string. The replacement string may contain references to
  1343     *    capture groups.
  1345     *    @param   replacement a string containing the replacement text.
  1346     *    @param   status      a reference to a UErrorCode to receive any errors.
  1347     *    @return              a string containing the results of the find and replace.
  1348     *    @stable ICU 2.4
  1349     */
  1350     virtual UnicodeString replaceAll(const UnicodeString &replacement, UErrorCode &status);
  1353    /**
  1354     *    Replaces every substring of the input that matches the pattern
  1355     *    with the given replacement string.  This is a convenience function that
  1356     *    provides a complete find-and-replace-all operation.
  1358     *    This method first resets this matcher. It then scans the input string
  1359     *    looking for matches of the pattern. Input that is not part of any
  1360     *    match is left unchanged; each match is replaced in the result by the
  1361     *    replacement string. The replacement string may contain references to
  1362     *    capture groups.
  1364     *    @param   replacement a string containing the replacement text.
  1365     *    @param   dest        a mutable UText in which the results are placed.
  1366     *                          If NULL, a new UText will be created (which may not be mutable).
  1367     *    @param   status      a reference to a UErrorCode to receive any errors.
  1368     *    @return              a UText containing the results of the find and replace.
  1369     *                          If a pre-allocated UText was provided, it will always be used and returned.
  1371     *    @stable ICU 4.6
  1372     */
  1373     virtual UText *replaceAll(UText *replacement, UText *dest, UErrorCode &status);
  1376    /**
  1377     * Replaces the first substring of the input that matches
  1378     * the pattern with the replacement string.   This is a convenience
  1379     * function that provides a complete find-and-replace operation.
  1381     * <p>This function first resets this RegexMatcher. It then scans the input string
  1382     * looking for a match of the pattern. Input that is not part
  1383     * of the match is appended directly to the result string; the match is replaced
  1384     * in the result by the replacement string. The replacement string may contain
  1385     * references to captured groups.</p>
  1387     * <p>The state of the matcher (the position at which a subsequent find()
  1388     *    would begin) after completing a replaceFirst() is not specified.  The
  1389     *    RegexMatcher should be reset before doing additional find() operations.</p>
  1391     *    @param   replacement a string containing the replacement text.
  1392     *    @param   status      a reference to a UErrorCode to receive any errors.
  1393     *    @return              a string containing the results of the find and replace.
  1394     *    @stable ICU 2.4
  1395     */
  1396     virtual UnicodeString replaceFirst(const UnicodeString &replacement, UErrorCode &status);
  1399    /**
  1400     * Replaces the first substring of the input that matches
  1401     * the pattern with the replacement string.   This is a convenience
  1402     * function that provides a complete find-and-replace operation.
  1404     * <p>This function first resets this RegexMatcher. It then scans the input string
  1405     * looking for a match of the pattern. Input that is not part
  1406     * of the match is appended directly to the result string; the match is replaced
  1407     * in the result by the replacement string. The replacement string may contain
  1408     * references to captured groups.</p>
  1410     * <p>The state of the matcher (the position at which a subsequent find()
  1411     *    would begin) after completing a replaceFirst() is not specified.  The
  1412     *    RegexMatcher should be reset before doing additional find() operations.</p>
  1414     *    @param   replacement a string containing the replacement text.
  1415     *    @param   dest        a mutable UText in which the results are placed.
  1416     *                          If NULL, a new UText will be created (which may not be mutable).
  1417     *    @param   status      a reference to a UErrorCode to receive any errors.
  1418     *    @return              a UText containing the results of the find and replace.
  1419     *                          If a pre-allocated UText was provided, it will always be used and returned.
  1421     *    @stable ICU 4.6
  1422     */
  1423     virtual UText *replaceFirst(UText *replacement, UText *dest, UErrorCode &status);
  1426    /**
  1427     *   Implements a replace operation intended to be used as part of an
  1428     *   incremental find-and-replace.
  1430     *   <p>The input string, starting from the end of the previous replacement and ending at
  1431     *   the start of the current match, is appended to the destination string.  Then the
  1432     *   replacement string is appended to the output string,
  1433     *   including handling any substitutions of captured text.</p>
  1435     *   <p>For simple, prepackaged, non-incremental find-and-replace
  1436     *   operations, see replaceFirst() or replaceAll().</p>
  1438     *   @param   dest        A UnicodeString to which the results of the find-and-replace are appended.
  1439     *   @param   replacement A UnicodeString that provides the text to be substituted for
  1440     *                        the input text that matched the regexp pattern.  The replacement
  1441     *                        text may contain references to captured text from the
  1442     *                        input.
  1443     *   @param   status      A reference to a UErrorCode to receive any errors.  Possible
  1444     *                        errors are  U_REGEX_INVALID_STATE if no match has been
  1445     *                        attempted or the last match failed, and U_INDEX_OUTOFBOUNDS_ERROR
  1446     *                        if the replacement text specifies a capture group that
  1447     *                        does not exist in the pattern.
  1449     *   @return  this  RegexMatcher
  1450     *   @stable ICU 2.4
  1452     */
  1453     virtual RegexMatcher &appendReplacement(UnicodeString &dest,
  1454         const UnicodeString &replacement, UErrorCode &status);
  1457    /**
  1458     *   Implements a replace operation intended to be used as part of an
  1459     *   incremental find-and-replace.
  1461     *   <p>The input string, starting from the end of the previous replacement and ending at
  1462     *   the start of the current match, is appended to the destination string.  Then the
  1463     *   replacement string is appended to the output string,
  1464     *   including handling any substitutions of captured text.</p>
  1466     *   <p>For simple, prepackaged, non-incremental find-and-replace
  1467     *   operations, see replaceFirst() or replaceAll().</p>
  1469     *   @param   dest        A mutable UText to which the results of the find-and-replace are appended.
  1470     *                         Must not be NULL.
  1471     *   @param   replacement A UText that provides the text to be substituted for
  1472     *                        the input text that matched the regexp pattern.  The replacement
  1473     *                        text may contain references to captured text from the input.
  1474     *   @param   status      A reference to a UErrorCode to receive any errors.  Possible
  1475     *                        errors are  U_REGEX_INVALID_STATE if no match has been
  1476     *                        attempted or the last match failed, and U_INDEX_OUTOFBOUNDS_ERROR
  1477     *                        if the replacement text specifies a capture group that
  1478     *                        does not exist in the pattern.
  1480     *   @return  this  RegexMatcher
  1482     *   @stable ICU 4.6
  1483     */
  1484     virtual RegexMatcher &appendReplacement(UText *dest,
  1485         UText *replacement, UErrorCode &status);
  1488    /**
  1489     * As the final step in a find-and-replace operation, append the remainder
  1490     * of the input string, starting at the position following the last appendReplacement(),
  1491     * to the destination string. <code>appendTail()</code> is intended to be invoked after one
  1492     * or more invocations of the <code>RegexMatcher::appendReplacement()</code>.
  1494     *  @param dest A UnicodeString to which the results of the find-and-replace are appended.
  1495     *  @return  the destination string.
  1496     *  @stable ICU 2.4
  1497     */
  1498     virtual UnicodeString &appendTail(UnicodeString &dest);
  1501    /**
  1502     * As the final step in a find-and-replace operation, append the remainder
  1503     * of the input string, starting at the position following the last appendReplacement(),
  1504     * to the destination string. <code>appendTail()</code> is intended to be invoked after one
  1505     * or more invocations of the <code>RegexMatcher::appendReplacement()</code>.
  1507     *  @param dest A mutable UText to which the results of the find-and-replace are appended.
  1508     *               Must not be NULL.
  1509     *  @param status A reference to a UErrorCode to receive any errors.
  1510     *  @return  the destination string.
  1512     *  @stable ICU 4.6
  1513     */
  1514     virtual UText *appendTail(UText *dest, UErrorCode &status);
  1517     /**
  1518      * Split a string into fields.  Somewhat like split() from Perl.
  1519      * The pattern matches identify delimiters that separate the input
  1520      *  into fields.  The input data between the matches becomes the
  1521      *  fields themselves.
  1523      * @param input   The string to be split into fields.  The field delimiters
  1524      *                match the pattern (in the "this" object).  This matcher
  1525      *                will be reset to this input string.
  1526      * @param dest    An array of UnicodeStrings to receive the results of the split.
  1527      *                This is an array of actual UnicodeString objects, not an
  1528      *                array of pointers to strings.  Local (stack based) arrays can
  1529      *                work well here.
  1530      * @param destCapacity  The number of elements in the destination array.
  1531      *                If the number of fields found is less than destCapacity, the
  1532      *                extra strings in the destination array are not altered.
  1533      *                If the number of destination strings is less than the number
  1534      *                of fields, the trailing part of the input string, including any
  1535      *                field delimiters, is placed in the last destination string.
  1536      * @param status  A reference to a UErrorCode to receive any errors.
  1537      * @return        The number of fields into which the input string was split.
  1538      * @stable ICU 2.6
  1539      */
  1540     virtual int32_t  split(const UnicodeString &input,
  1541         UnicodeString    dest[],
  1542         int32_t          destCapacity,
  1543         UErrorCode       &status);
  1546     /**
  1547      * Split a string into fields.  Somewhat like split() from Perl.
  1548      * The pattern matches identify delimiters that separate the input
  1549      *  into fields.  The input data between the matches becomes the
  1550      *  fields themselves.
  1552      * @param input   The string to be split into fields.  The field delimiters
  1553      *                match the pattern (in the "this" object).  This matcher
  1554      *                will be reset to this input string.
  1555      * @param dest    An array of mutable UText structs to receive the results of the split.
  1556      *                If a field is NULL, a new UText is allocated to contain the results for
  1557      *                that field. This new UText is not guaranteed to be mutable.
  1558      * @param destCapacity  The number of elements in the destination array.
  1559      *                If the number of fields found is less than destCapacity, the
  1560      *                extra strings in the destination array are not altered.
  1561      *                If the number of destination strings is less than the number
  1562      *                of fields, the trailing part of the input string, including any
  1563      *                field delimiters, is placed in the last destination string.
  1564      * @param status  A reference to a UErrorCode to receive any errors.
  1565      * @return        The number of fields into which the input string was split.
  1567      * @stable ICU 4.6
  1568      */
  1569     virtual int32_t  split(UText *input,
  1570         UText           *dest[],
  1571         int32_t          destCapacity,
  1572         UErrorCode       &status);
  1574   /**
  1575     *   Set a processing time limit for match operations with this Matcher.
  1577     *   Some patterns, when matching certain strings, can run in exponential time.
  1578     *   For practical purposes, the match operation may appear to be in an
  1579     *   infinite loop.
  1580     *   When a limit is set a match operation will fail with an error if the
  1581     *   limit is exceeded.
  1582     *   <p>
  1583     *   The units of the limit are steps of the match engine.
  1584     *   Correspondence with actual processor time will depend on the speed
  1585     *   of the processor and the details of the specific pattern, but will
  1586     *   typically be on the order of milliseconds.
  1587     *   <p>
  1588     *   By default, the matching time is not limited.
  1589     *   <p>
  1591     *   @param   limit       The limit value, or 0 for no limit.
  1592     *   @param   status      A reference to a UErrorCode to receive any errors.
  1593     *   @stable ICU 4.0
  1594     */
  1595     virtual void setTimeLimit(int32_t limit, UErrorCode &status);
  1597   /**
  1598     * Get the time limit, if any, for match operations made with this Matcher.
  1600     *   @return the maximum allowed time for a match, in units of processing steps.
  1601     *   @stable ICU 4.0
  1602     */
  1603     virtual int32_t getTimeLimit() const;
  1605   /**
  1606     *  Set the amount of heap storage available for use by the match backtracking stack.
  1607     *  The matcher is also reset, discarding any results from previous matches.
  1608     *  <p>
  1609     *  ICU uses a backtracking regular expression engine, with the backtrack stack
  1610     *  maintained on the heap.  This function sets the limit to the amount of memory
  1611     *  that can be used  for this purpose.  A backtracking stack overflow will
  1612     *  result in an error from the match operation that caused it.
  1613     *  <p>
  1614     *  A limit is desirable because a malicious or poorly designed pattern can use
  1615     *  excessive memory, potentially crashing the process.  A limit is enabled
  1616     *  by default.
  1617     *  <p>
  1618     *  @param limit  The maximum size, in bytes, of the matching backtrack stack.
  1619     *                A value of zero means no limit.
  1620     *                The limit must be greater than or equal to zero.
  1622     *  @param status   A reference to a UErrorCode to receive any errors.
  1624     *  @stable ICU 4.0
  1625     */
  1626     virtual void setStackLimit(int32_t  limit, UErrorCode &status);
  1628   /**
  1629     *  Get the size of the heap storage available for use by the backtracking stack.
  1631     *  @return  the maximum backtracking stack size, in bytes, or zero if the
  1632     *           stack size is unlimited.
  1633     *  @stable ICU 4.0
  1634     */
  1635     virtual int32_t  getStackLimit() const;
  1638   /**
  1639     * Set a callback function for use with this Matcher.
  1640     * During matching operations the function will be called periodically,
  1641     * giving the application the opportunity to terminate a long-running
  1642     * match.
  1644     *    @param   callback    A pointer to the user-supplied callback function.
  1645     *    @param   context     User context pointer.  The value supplied at the
  1646     *                         time the callback function is set will be saved
  1647     *                         and passed to the callback each time that it is called.
  1648     *    @param   status      A reference to a UErrorCode to receive any errors.
  1649     *  @stable ICU 4.0
  1650     */
  1651     virtual void setMatchCallback(URegexMatchCallback     *callback,
  1652                                   const void              *context,
  1653                                   UErrorCode              &status);
  1656   /**
  1657     *  Get the callback function for this Matcher.
  1659     *    @param   callback    Out parameter, receives a pointer to the user-supplied 
  1660     *                         callback function.
  1661     *    @param   context     Out parameter, receives the user context pointer that
  1662     *                         was set when setMatchCallback() was called.
  1663     *    @param   status      A reference to a UErrorCode to receive any errors.
  1664     *    @stable ICU 4.0
  1665     */
  1666     virtual void getMatchCallback(URegexMatchCallback     *&callback,
  1667                                   const void              *&context,
  1668                                   UErrorCode              &status);
  1671   /**
  1672     * Set a progress callback function for use with find operations on this Matcher.
  1673     * During find operations, the callback will be invoked after each return from a
  1674     * match attempt, giving the application the opportunity to terminate a long-running
  1675     * find operation.
  1677     *    @param   callback    A pointer to the user-supplied callback function.
  1678     *    @param   context     User context pointer.  The value supplied at the
  1679     *                         time the callback function is set will be saved
  1680     *                         and passed to the callback each time that it is called.
  1681     *    @param   status      A reference to a UErrorCode to receive any errors.
  1682     *    @stable ICU 4.6
  1683     */
  1684     virtual void setFindProgressCallback(URegexFindProgressCallback      *callback,
  1685                                               const void                              *context,
  1686                                               UErrorCode                              &status);
  1689   /**
  1690     *  Get the find progress callback function for this Matcher.
  1692     *    @param   callback    Out parameter, receives a pointer to the user-supplied 
  1693     *                         callback function.
  1694     *    @param   context     Out parameter, receives the user context pointer that
  1695     *                         was set when setFindProgressCallback() was called.
  1696     *    @param   status      A reference to a UErrorCode to receive any errors.
  1697     *    @stable ICU 4.6
  1698     */
  1699     virtual void getFindProgressCallback(URegexFindProgressCallback      *&callback,
  1700                                               const void                      *&context,
  1701                                               UErrorCode                      &status);
  1703 #ifndef U_HIDE_INTERNAL_API
  1704    /**
  1705      *   setTrace   Debug function, enable/disable tracing of the matching engine.
  1706      *              For internal ICU development use only.  DO NOT USE!!!!
  1707      *   @internal
  1708      */
  1709     void setTrace(UBool state);
  1710 #endif  /* U_HIDE_INTERNAL_API */
  1712     /**
  1713     * ICU "poor man's RTTI", returns a UClassID for this class.
  1715     * @stable ICU 2.2
  1716     */
  1717     static UClassID U_EXPORT2 getStaticClassID();
  1719     /**
  1720      * ICU "poor man's RTTI", returns a UClassID for the actual class.
  1722      * @stable ICU 2.2
  1723      */
  1724     virtual UClassID getDynamicClassID() const;
  1726 private:
  1727     // Constructors and other object boilerplate are private.
  1728     // Instances of RegexMatcher can not be assigned, copied, cloned, etc.
  1729     RegexMatcher();                  // default constructor not implemented
  1730     RegexMatcher(const RegexPattern *pat);   // private; NOTE(review): presumably for use by the friend classes declared below - confirm
  1731     RegexMatcher(const RegexMatcher &other); // copy constructor; private so that matchers cannot be copied
  1732     RegexMatcher &operator =(const RegexMatcher &rhs); // assignment; private so that matchers cannot be assigned
  1733     void init(UErrorCode &status);                      // Common initialization
  1734     void init2(UText *t, UErrorCode &e);  // Common initialization, part 2.
  1736     friend class RegexPattern;
  1737     friend class RegexCImpl;
  1738 public:
  1739 #ifndef U_HIDE_INTERNAL_API
  1740     /** @internal  */
  1741     void resetPreserveRegion();  // Reset matcher state, but preserve any region.
  1742 #endif  /* U_HIDE_INTERNAL_API */
  1743 private:
  1745     //
  1746     //  MatchAt   This is the internal interface to the match engine itself.
  1747     //            Match status comes back in matcher member variables.
  1748     //
  1749     void                 MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status);
  1750     inline void          backTrack(int64_t &inputIdx, int32_t &patIdx);   // NOTE(review): takes both indexes by non-const reference; presumably reloads them from the backtrack stack - confirm
  1751     UBool                isWordBoundary(int64_t pos);         // perform Perl-like  \b test
  1752     UBool                isUWordBoundary(int64_t pos);        // perform RBBI based \b test
  1753     REStackFrame        *resetStack();                        // NOTE(review): presumably reinitializes fStack and returns its first frame - confirm
  1754     inline REStackFrame *StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status);   // NOTE(review): presumably saves a backtrack frame; see the fTickCounter comment - confirm
  1755     void                 IncrementTime(UErrorCode &status);   // NOTE(review): presumably advances fTime and enforces fTimeLimit - confirm
  1756     UBool                ReportFindProgress(int64_t matchIndex, UErrorCode &status);   // NOTE(review): presumably invokes fFindProgressCallbackFn - confirm
  1758     int64_t              appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const;
  1760     UBool                findUsingChunk();                    // NOTE(review): chunk-based variant; conditions for its use are not visible here - confirm
  1761     void                 MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status);   // NOTE(review): MatchAt counterpart taking a 32-bit start index - confirm
  1762     UBool                isChunkWordBoundary(int32_t pos);    // NOTE(review): isWordBoundary counterpart taking a 32-bit position - confirm
  1764     const RegexPattern  *fPattern;
  1765     RegexPattern        *fPatternOwned;    // Non-NULL if this matcher owns the pattern, and
  1766                                            //   should delete it when through.
  1768     const UnicodeString *fInput;           // The string being matched. Only used for input()
  1769     UText               *fInputText;       // The text being matched. Is never NULL.
  1770     UText               *fAltInputText;    // A shallow copy of the text being matched.
  1771                                            //   Only created if the pattern contains backreferences.
  1772     int64_t              fInputLength;     // Full length of the input text.
  1773     int32_t              fFrameSize;       // The size of a frame in the backtrack stack.
  1775     int64_t              fRegionStart;     // Start of the input region, default = 0.
  1776     int64_t              fRegionLimit;     // End of input region, default to input.length.
  1778     int64_t              fAnchorStart;     // Region bounds for anchoring operations (^ or $).
  1779     int64_t              fAnchorLimit;     //   See useAnchoringBounds
  1781     int64_t              fLookStart;       // Region bounds for look-ahead/behind and
  1782     int64_t              fLookLimit;       //   other boundary tests.  See
  1783                                            //   useTransparentBounds
  1785     int64_t              fActiveStart;     // Currently active bounds for matching.
  1786     int64_t              fActiveLimit;     //   Usually is the same as region, but
  1787                                            //   is changed to fLookStart/Limit when
  1788                                            //   entering look around regions.
  1790     UBool                fTransparentBounds;  // True if using transparent bounds.
  1791     UBool                fAnchoringBounds; // True if using anchoring bounds.
  1793     UBool                fMatch;           // True if the last attempted match was successful.
  1794     int64_t              fMatchStart;      // Position of the start of the most recent match
  1795     int64_t              fMatchEnd;        // First position after the end of the most recent match
  1796                                            //   Zero if no previous match, even when a region
  1797                                            //   is active.
  1798     int64_t              fLastMatchEnd;    // First position after the end of the previous match,
  1799                                            //   or -1 if there was no previous match.
  1800     int64_t              fAppendPosition;  // First position after the end of the previous
  1801                                            //   appendReplacement().  As described by the
  1802                                            //   JavaDoc for Java Matcher, where it is called 
  1803                                            //   "append position"
  1804     UBool                fHitEnd;          // True if the last match touched the end of input.
  1805     UBool                fRequireEnd;      // True if the last match required end-of-input
  1806                                            //    (matched $ or Z)
  1808     UVector64           *fStack;           // NOTE(review): presumably the backtrack stack bounded by fStackLimit - confirm.
  1809     REStackFrame        *fFrame;           // After finding a match, the last active stack frame,
  1810                                            //   which will contain the capture group results.
  1811                                            //   NOT valid while match engine is running.
  1813     int64_t             *fData;            // Data area for use by the compiled pattern.
  1814     int64_t             fSmallData[8];     //   Use this for data if it's enough.
  1816     int32_t             fTimeLimit;        // Max time (in arbitrary steps) to let the
  1817                                            //   match engine run.  Zero for unlimited.
  1819     int32_t             fTime;             // Match time, accumulates while matching.
  1820     int32_t             fTickCounter;      // Low bits counter for time.  Counts down StateSaves.
  1821                                            //   Kept separately from fTime to keep as much
  1822                                            //   code as possible out of the inline
  1823                                            //   StateSave function.
  1825     int32_t             fStackLimit;       // Maximum memory size to use for the backtrack
  1826                                            //   stack, in bytes.  Zero for unlimited.
  1828     URegexMatchCallback *fCallbackFn;       // Pointer to match progress callback function.
  1829                                            //   NULL if there is no callback.
  1830     const void         *fCallbackContext;  // User Context ptr for match callback function.
  1832     URegexFindProgressCallback  *fFindProgressCallbackFn;  // Pointer to find progress callback function.
  1833                                                            //   NULL if there is no callback.
  1834     const void         *fFindProgressCallbackContext;      // User Context ptr for find progress callback function.
  1837     UBool               fInputUniStrMaybeMutable;  // Set when fInputText wraps a UnicodeString that may be mutable - compatibility.
  1839     UBool               fTraceDebug;       // Set true for debug tracing of match engine.
  1841     UErrorCode          fDeferredStatus;   // Save error state that cannot be immediately
  1842                                            //   reported, or that permanently disables this matcher.
  1844     RuleBasedBreakIterator  *fWordBreakItr;   // NOTE(review): presumably the break iterator behind isUWordBoundary() - confirm.
  1845 };
  1847 U_NAMESPACE_END
  1848 #endif  // UCONFIG_NO_REGULAR_EXPRESSIONS
  1849 #endif

mercurial