intl/icu/source/i18n/rematch.cpp

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

     1 /*
     2 **************************************************************************
     3 *   Copyright (C) 2002-2013 International Business Machines Corporation  *
     4 *   and others. All rights reserved.                                     *
     5 **************************************************************************
     6 */
     7 //
     8 //  file:  rematch.cpp
     9 //
    10 //         Contains the implementation of class RegexMatcher,
    11 //         which is one of the main API classes for the ICU regular expression package.
    12 //
    14 #include "unicode/utypes.h"
    15 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
    17 #include "unicode/regex.h"
    18 #include "unicode/uniset.h"
    19 #include "unicode/uchar.h"
    20 #include "unicode/ustring.h"
    21 #include "unicode/rbbi.h"
    22 #include "unicode/utf.h"
    23 #include "unicode/utf16.h"
    24 #include "uassert.h"
    25 #include "cmemory.h"
    26 #include "uvector.h"
    27 #include "uvectr32.h"
    28 #include "uvectr64.h"
    29 #include "regeximp.h"
    30 #include "regexst.h"
    31 #include "regextxt.h"
    32 #include "ucase.h"
    34 // #include <malloc.h>        // Needed for heapcheck testing
    37 // Find progress callback
    38 // ----------------------
    39 // Macro to inline test & call to ReportFindProgress().  Eliminates unnecessary function call.
    40 //
    41 #define REGEXFINDPROGRESS_INTERRUPT(pos, status)     \
    42     (fFindProgressCallbackFn != NULL) && (ReportFindProgress(pos, status) == FALSE)
    45 // Smart Backtracking
    46 // ------------------
    47 // When a failure would go back to a LOOP_C instruction,
    48 // strings, characters, and setrefs scan backwards for a valid start
    49 // character themselves, pop the stack, and save state, emulating the
    50 // LOOP_C's effect but assured that the next character of input is a
    51 // possible matching character.
    52 //
    53 // Good idea in theory; unfortunately it only helps out a few specific
    54 // cases and slows the engine down a little in the rest.
    56 U_NAMESPACE_BEGIN
    58 // Default limit for the size of the back track stack, to avoid system
    59 //    failures causedby heap exhaustion.  Units are in 32 bit words, not bytes.
    60 // This value puts ICU's limits higher than most other regexp implementations,
    61 //    which use recursion rather than the heap, and take more storage per
    62 //    backtrack point.
    63 //
    64 static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000;
    66 // Time limit counter constant.
    67 //   Time limits for expression evaluation are in terms of quanta of work by
    68 //   the engine, each of which is 10,000 state saves.
    69 //   This constant determines that state saves per tick number.
    70 static const int32_t TIMER_INITIAL_VALUE = 10000;
    72 //-----------------------------------------------------------------------------
    73 //
    74 //   Constructor and Destructor
    75 //
    76 //-----------------------------------------------------------------------------
    77 RegexMatcher::RegexMatcher(const RegexPattern *pat)  { 
    78     fDeferredStatus = U_ZERO_ERROR;
    79     init(fDeferredStatus);
    80     if (U_FAILURE(fDeferredStatus)) {
    81         return;
    82     }
    83     if (pat==NULL) {
    84         fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR;
    85         return;
    86     }
    87     fPattern = pat;
    88     init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus);
    89 }
    93 RegexMatcher::RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
    94                            uint32_t flags, UErrorCode &status) {
    95     init(status);
    96     if (U_FAILURE(status)) {
    97         return;
    98     }
    99     UParseError    pe;
   100     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
   101     fPattern           = fPatternOwned;
   103     UText inputText = UTEXT_INITIALIZER;
   104     utext_openConstUnicodeString(&inputText, &input, &status);
   105     init2(&inputText, status);
   106     utext_close(&inputText);
   108     fInputUniStrMaybeMutable = TRUE;    
   109 }
   112 RegexMatcher::RegexMatcher(UText *regexp, UText *input,
   113                            uint32_t flags, UErrorCode &status) {
   114     init(status);
   115     if (U_FAILURE(status)) {
   116         return;
   117     }
   118     UParseError    pe;
   119     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
   120     if (U_FAILURE(status)) {
   121         return;
   122     }
   124     fPattern           = fPatternOwned;
   125     init2(input, status);
   126 }
   129 RegexMatcher::RegexMatcher(const UnicodeString &regexp, 
   130                            uint32_t flags, UErrorCode &status) {
   131     init(status);
   132     if (U_FAILURE(status)) {
   133         return;
   134     }
   135     UParseError    pe;
   136     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
   137     if (U_FAILURE(status)) {
   138         return;
   139     }
   140     fPattern           = fPatternOwned;
   141     init2(RegexStaticSets::gStaticSets->fEmptyText, status);
   142 }
   144 RegexMatcher::RegexMatcher(UText *regexp, 
   145                            uint32_t flags, UErrorCode &status) {
   146     init(status);
   147     if (U_FAILURE(status)) {
   148         return;
   149     }
   150     UParseError    pe;
   151     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
   152         if (U_FAILURE(status)) {
   153         return;
   154     }
   156     fPattern           = fPatternOwned;
   157     init2(RegexStaticSets::gStaticSets->fEmptyText, status);
   158 }
   163 RegexMatcher::~RegexMatcher() {
   164     delete fStack;
   165     if (fData != fSmallData) {
   166         uprv_free(fData);
   167         fData = NULL;
   168     }
   169     if (fPatternOwned) {
   170         delete fPatternOwned;
   171         fPatternOwned = NULL;
   172         fPattern = NULL;
   173     }
   175     if (fInput) {
   176         delete fInput;
   177     }
   178     if (fInputText) {
   179         utext_close(fInputText);
   180     }
   181     if (fAltInputText) {
   182         utext_close(fAltInputText);
   183     }
   185     #if UCONFIG_NO_BREAK_ITERATION==0
   186     delete fWordBreakItr;
   187     #endif
   188 }
   190 //
   191 //   init()   common initialization for use by all constructors.
   192 //            Initialize all fields, get the object into a consistent state.
   193 //            This must be done even when the initial status shows an error,
   194 //            so that the object is initialized sufficiently well for the destructor
   195 //            to run safely.
   196 //
   197 void RegexMatcher::init(UErrorCode &status) {
   198     fPattern           = NULL;
   199     fPatternOwned      = NULL;
   200     fFrameSize         = 0;
   201     fRegionStart       = 0;
   202     fRegionLimit       = 0;
   203     fAnchorStart       = 0;
   204     fAnchorLimit       = 0;
   205     fLookStart         = 0;
   206     fLookLimit         = 0;
   207     fActiveStart       = 0;
   208     fActiveLimit       = 0;
   209     fTransparentBounds = FALSE;
   210     fAnchoringBounds   = TRUE;
   211     fMatch             = FALSE;
   212     fMatchStart        = 0;
   213     fMatchEnd          = 0;
   214     fLastMatchEnd      = -1;
   215     fAppendPosition    = 0;
   216     fHitEnd            = FALSE;
   217     fRequireEnd        = FALSE;
   218     fStack             = NULL;
   219     fFrame             = NULL;
   220     fTimeLimit         = 0;
   221     fTime              = 0;
   222     fTickCounter       = 0;
   223     fStackLimit        = DEFAULT_BACKTRACK_STACK_CAPACITY;
   224     fCallbackFn        = NULL;
   225     fCallbackContext   = NULL;
   226     fFindProgressCallbackFn      = NULL;
   227     fFindProgressCallbackContext = NULL;
   228     fTraceDebug        = FALSE;
   229     fDeferredStatus    = status;
   230     fData              = fSmallData;
   231     fWordBreakItr      = NULL;
   233     fStack             = NULL;
   234     fInputText         = NULL;
   235     fAltInputText      = NULL;
   236     fInput             = NULL;
   237     fInputLength       = 0;
   238     fInputUniStrMaybeMutable = FALSE;
   240     if (U_FAILURE(status)) {
   241         fDeferredStatus = status;
   242     }
   243 }
   245 //
   246 //  init2()   Common initialization for use by RegexMatcher constructors, part 2.
   247 //            This handles the common setup to be done after the Pattern is available.
   248 //
   249 void RegexMatcher::init2(UText *input, UErrorCode &status) {
   250     if (U_FAILURE(status)) {
   251         fDeferredStatus = status;
   252         return;
   253     }
   255     if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(fSmallData[0]))) {
   256         fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t)); 
   257         if (fData == NULL) {
   258             status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
   259             return;
   260         }
   261     }
   263     fStack = new UVector64(status);
   264     if (fStack == NULL) {
   265         status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
   266         return;
   267     }
   269     reset(input);
   270     setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status);
   271     if (U_FAILURE(status)) {
   272         fDeferredStatus = status;
   273         return;
   274     }
   275 }
   278 static const UChar BACKSLASH  = 0x5c;
   279 static const UChar DOLLARSIGN = 0x24;
   280 //--------------------------------------------------------------------------------
   281 //
   282 //    appendReplacement
   283 //
   284 //--------------------------------------------------------------------------------
   285 RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest,
   286                                               const UnicodeString &replacement,
   287                                               UErrorCode &status) {
   288     UText replacementText = UTEXT_INITIALIZER;
   290     utext_openConstUnicodeString(&replacementText, &replacement, &status);
   291     if (U_SUCCESS(status)) {        
   292         UText resultText = UTEXT_INITIALIZER;
   293         utext_openUnicodeString(&resultText, &dest, &status);
   295         if (U_SUCCESS(status)) {
   296             appendReplacement(&resultText, &replacementText, status);
   297             utext_close(&resultText);
   298         }
   299         utext_close(&replacementText);
   300     }
   302     return *this;
   303 }
   305 //
   306 //    appendReplacement, UText mode
   307 //
   308 RegexMatcher &RegexMatcher::appendReplacement(UText *dest,
   309                                               UText *replacement,
   310                                               UErrorCode &status) {
   311     if (U_FAILURE(status)) {
   312         return *this;
   313     }
   314     if (U_FAILURE(fDeferredStatus)) {
   315         status = fDeferredStatus;
   316         return *this;
   317     }
   318     if (fMatch == FALSE) {
   319         status = U_REGEX_INVALID_STATE;
   320         return *this;
   321     }
   323     // Copy input string from the end of previous match to start of current match
   324     int64_t  destLen = utext_nativeLength(dest);
   325     if (fMatchStart > fAppendPosition) {
   326         if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
   327             destLen += utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition, 
   328                                      (int32_t)(fMatchStart-fAppendPosition), &status);
   329         } else {
   330             int32_t len16;
   331             if (UTEXT_USES_U16(fInputText)) {
   332                 len16 = (int32_t)(fMatchStart-fAppendPosition);
   333             } else {
   334                 UErrorCode lengthStatus = U_ZERO_ERROR;
   335                 len16 = utext_extract(fInputText, fAppendPosition, fMatchStart, NULL, 0, &lengthStatus);
   336             }
   337             UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
   338             if (inputChars == NULL) {
   339                 status = U_MEMORY_ALLOCATION_ERROR;
   340                 return *this;
   341             }
   342             utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars, len16+1, &status);
   343             destLen += utext_replace(dest, destLen, destLen, inputChars, len16, &status);
   344             uprv_free(inputChars);
   345         }
   346     }
   347     fAppendPosition = fMatchEnd;
   350     // scan the replacement text, looking for substitutions ($n) and \escapes.
   351     //  TODO:  optimize this loop by efficiently scanning for '$' or '\',
   352     //         move entire ranges not containing substitutions.
   353     UTEXT_SETNATIVEINDEX(replacement, 0);
   354     UChar32 c = UTEXT_NEXT32(replacement);
   355     while (c != U_SENTINEL) {
   356         if (c == BACKSLASH) {
   357             // Backslash Escape.  Copy the following char out without further checks.
   358             //                    Note:  Surrogate pairs don't need any special handling
   359             //                           The second half wont be a '$' or a '\', and
   360             //                           will move to the dest normally on the next
   361             //                           loop iteration.
   362             c = UTEXT_CURRENT32(replacement);
   363             if (c == U_SENTINEL) {
   364                 break;
   365             }
   367             if (c==0x55/*U*/ || c==0x75/*u*/) {
   368                 // We have a \udddd or \Udddddddd escape sequence.
   369                 int32_t offset = 0;
   370                 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement);
   371                 UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
   372                 if (escapedChar != (UChar32)0xFFFFFFFF) {
   373                     if (U_IS_BMP(escapedChar)) {
   374                         UChar c16 = (UChar)escapedChar;
   375                         destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
   376                     } else {
   377                         UChar surrogate[2];
   378                         surrogate[0] = U16_LEAD(escapedChar);
   379                         surrogate[1] = U16_TRAIL(escapedChar);
   380                         if (U_SUCCESS(status)) {
   381                             destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
   382                         }
   383                     }
   384                     // TODO:  Report errors for mal-formed \u escapes?
   385                     //        As this is, the original sequence is output, which may be OK.
   386                     if (context.lastOffset == offset) {
   387                         (void)UTEXT_PREVIOUS32(replacement);
   388                     } else if (context.lastOffset != offset-1) {
   389                         utext_moveIndex32(replacement, offset - context.lastOffset - 1);
   390                     }
   391                 }
   392             } else {
   393                 (void)UTEXT_NEXT32(replacement);
   394                 // Plain backslash escape.  Just put out the escaped character.
   395                 if (U_IS_BMP(c)) {
   396                     UChar c16 = (UChar)c;
   397                     destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
   398                 } else {
   399                     UChar surrogate[2];
   400                     surrogate[0] = U16_LEAD(c);
   401                     surrogate[1] = U16_TRAIL(c);
   402                     if (U_SUCCESS(status)) {
   403                         destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
   404                     }
   405                 }
   406             }
   407         } else if (c != DOLLARSIGN) {
   408             // Normal char, not a $.  Copy it out without further checks.
   409             if (U_IS_BMP(c)) {
   410                 UChar c16 = (UChar)c;
   411                 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
   412             } else {
   413                 UChar surrogate[2];
   414                 surrogate[0] = U16_LEAD(c);
   415                 surrogate[1] = U16_TRAIL(c);
   416                 if (U_SUCCESS(status)) {
   417                     destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
   418                 }
   419             }
   420         } else {
   421             // We've got a $.  Pick up a capture group number if one follows.
   422             // Consume at most the number of digits necessary for the largest capture
   423             // number that is valid for this pattern.
   425             int32_t numDigits = 0;
   426             int32_t groupNum  = 0;
   427             UChar32 digitC;
   428             for (;;) {
   429                 digitC = UTEXT_CURRENT32(replacement);
   430                 if (digitC == U_SENTINEL) {
   431                     break;
   432                 }
   433                 if (u_isdigit(digitC) == FALSE) {
   434                     break;
   435                 }
   436                 (void)UTEXT_NEXT32(replacement);
   437                 groupNum=groupNum*10 + u_charDigitValue(digitC);
   438                 numDigits++;
   439                 if (numDigits >= fPattern->fMaxCaptureDigits) {
   440                     break;
   441                 }
   442             }
   445             if (numDigits == 0) {
   446                 // The $ didn't introduce a group number at all.
   447                 // Treat it as just part of the substitution text.
   448                 UChar c16 = DOLLARSIGN;
   449                 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
   450             } else {
   451                 // Finally, append the capture group data to the destination.
   452                 destLen += appendGroup(groupNum, dest, status);
   453                 if (U_FAILURE(status)) {
   454                     // Can fail if group number is out of range.
   455                     break;
   456                 }
   457             }
   458         }
   460         if (U_FAILURE(status)) {
   461             break;
   462         } else {
   463             c = UTEXT_NEXT32(replacement);
   464         }
   465     }
   467     return *this;
   468 }
   472 //--------------------------------------------------------------------------------
   473 //
   474 //    appendTail     Intended to be used in conjunction with appendReplacement()
   475 //                   To the destination string, append everything following
   476 //                   the last match position from the input string.
   477 //
   478 //                   Note:  Match ranges do not affect appendTail or appendReplacement
   479 //
   480 //--------------------------------------------------------------------------------
   481 UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) {
   482     UErrorCode status = U_ZERO_ERROR;
   483     UText resultText = UTEXT_INITIALIZER;
   484     utext_openUnicodeString(&resultText, &dest, &status);
   486     if (U_SUCCESS(status)) {
   487         appendTail(&resultText, status);
   488         utext_close(&resultText);
   489     }
   491     return dest;
   492 }
   494 //
   495 //   appendTail, UText mode
   496 //
   497 UText *RegexMatcher::appendTail(UText *dest, UErrorCode &status) {
   498     UBool bailOut = FALSE;
   499     if (U_FAILURE(status)) {
   500         bailOut = TRUE;
   501     }
   502     if (U_FAILURE(fDeferredStatus)) {
   503         status = fDeferredStatus;
   504         bailOut = TRUE;
   505     }
   507     if (bailOut) {
   508         //  dest must not be NULL
   509         if (dest) {
   510             utext_replace(dest, utext_nativeLength(dest), utext_nativeLength(dest), NULL, 0, &status);
   511             return dest;
   512         }
   513     }
   515     if (fInputLength > fAppendPosition) {
   516         if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
   517             int64_t destLen = utext_nativeLength(dest);
   518             utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition, 
   519                           (int32_t)(fInputLength-fAppendPosition), &status);
   520         } else {
   521             int32_t len16;
   522             if (UTEXT_USES_U16(fInputText)) {
   523                 len16 = (int32_t)(fInputLength-fAppendPosition);
   524             } else {
   525                 len16 = utext_extract(fInputText, fAppendPosition, fInputLength, NULL, 0, &status);
   526                 status = U_ZERO_ERROR; // buffer overflow
   527             }
   529             UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16));
   530             if (inputChars == NULL) {
   531                 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
   532             } else {
   533                 utext_extract(fInputText, fAppendPosition, fInputLength, inputChars, len16, &status); // unterminated 
   534                 int64_t destLen = utext_nativeLength(dest);
   535                 utext_replace(dest, destLen, destLen, inputChars, len16, &status);
   536                 uprv_free(inputChars);
   537             }
   538         }
   539     }
   540     return dest;
   541 }
   545 //--------------------------------------------------------------------------------
   546 //
   547 //   end
   548 //
   549 //--------------------------------------------------------------------------------
   550 int32_t RegexMatcher::end(UErrorCode &err) const {
   551     return end(0, err);
   552 }
   554 int64_t RegexMatcher::end64(UErrorCode &err) const {
   555     return end64(0, err);
   556 }
   558 int64_t RegexMatcher::end64(int32_t group, UErrorCode &err) const {
   559     if (U_FAILURE(err)) {
   560         return -1;
   561     }
   562     if (fMatch == FALSE) {
   563         err = U_REGEX_INVALID_STATE;
   564         return -1;
   565     }
   566     if (group < 0 || group > fPattern->fGroupMap->size()) {
   567         err = U_INDEX_OUTOFBOUNDS_ERROR;
   568         return -1;
   569     }
   570     int64_t e = -1;
   571     if (group == 0) {
   572         e = fMatchEnd; 
   573     } else {
   574         // Get the position within the stack frame of the variables for
   575         //    this capture group.
   576         int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
   577         U_ASSERT(groupOffset < fPattern->fFrameSize);
   578         U_ASSERT(groupOffset >= 0);
   579         e = fFrame->fExtra[groupOffset + 1];
   580     }
   582         return e;
   583 }
   585 int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const {
   586     return (int32_t)end64(group, err);
   587 }
   590 //--------------------------------------------------------------------------------
   591 //
   592 //   find()
   593 //
   594 //--------------------------------------------------------------------------------
   595 UBool RegexMatcher::find() {
   596     // Start at the position of the last match end.  (Will be zero if the
   597     //   matcher has been reset.)
   598     //
   599     if (U_FAILURE(fDeferredStatus)) {
   600         return FALSE;
   601     }
   603     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
   604         return findUsingChunk();
   605     }
   607     int64_t startPos = fMatchEnd;
   608     if (startPos==0) {
   609         startPos = fActiveStart;
   610     }
   612     if (fMatch) {
   613         // Save the position of any previous successful match.
   614         fLastMatchEnd = fMatchEnd;
   616         if (fMatchStart == fMatchEnd) {
   617             // Previous match had zero length.  Move start position up one position
   618             //  to avoid sending find() into a loop on zero-length matches.
   619             if (startPos >= fActiveLimit) {
   620                 fMatch = FALSE;
   621                 fHitEnd = TRUE;
   622                 return FALSE;
   623             }
   624             UTEXT_SETNATIVEINDEX(fInputText, startPos);
   625             (void)UTEXT_NEXT32(fInputText);
   626             startPos = UTEXT_GETNATIVEINDEX(fInputText);
   627         }
   628     } else {
   629         if (fLastMatchEnd >= 0) {
   630             // A previous find() failed to match.  Don't try again.
   631             //   (without this test, a pattern with a zero-length match
   632             //    could match again at the end of an input string.)
   633             fHitEnd = TRUE;
   634             return FALSE;
   635         }
   636     }
   639     // Compute the position in the input string beyond which a match can not begin, because
   640     //   the minimum length match would extend past the end of the input.
   641     //   Note:  some patterns that cannot match anything will have fMinMatchLength==Max Int.
   642     //          Be aware of possible overflows if making changes here.
   643     int64_t testStartLimit;
   644     if (UTEXT_USES_U16(fInputText)) {
   645         testStartLimit = fActiveLimit - fPattern->fMinMatchLen;
   646         if (startPos > testStartLimit) {
   647             fMatch = FALSE;
   648             fHitEnd = TRUE;
   649             return FALSE;
   650         }
   651     } else {
   652         // For now, let the matcher discover that it can't match on its own
   653         // We don't know how long the match len is in native characters
   654         testStartLimit = fActiveLimit;
   655     }
   657     UChar32  c;
   658     U_ASSERT(startPos >= 0);
   660     switch (fPattern->fStartType) {
   661     case START_NO_INFO:
   662         // No optimization was found. 
   663         //  Try a match at each input position.
   664         for (;;) {
   665             MatchAt(startPos, FALSE, fDeferredStatus);
   666             if (U_FAILURE(fDeferredStatus)) {
   667                 return FALSE;
   668             }
   669             if (fMatch) {
   670                 return TRUE;
   671             }
   672             if (startPos >= testStartLimit) {
   673                 fHitEnd = TRUE;
   674                 return FALSE;
   675             }
   676             UTEXT_SETNATIVEINDEX(fInputText, startPos);
   677             (void)UTEXT_NEXT32(fInputText);
   678             startPos = UTEXT_GETNATIVEINDEX(fInputText);
   679             // Note that it's perfectly OK for a pattern to have a zero-length
   680             //   match at the end of a string, so we must make sure that the loop
   681             //   runs with startPos == testStartLimit the last time through.
   682             if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
   683                 return FALSE;
   684         }
   685         U_ASSERT(FALSE);
   687     case START_START:
   688         // Matches are only possible at the start of the input string
   689         //   (pattern begins with ^ or \A)
   690         if (startPos > fActiveStart) {
   691             fMatch = FALSE;
   692             return FALSE;
   693         }
   694         MatchAt(startPos, FALSE, fDeferredStatus);
   695         if (U_FAILURE(fDeferredStatus)) {
   696             return FALSE;
   697         }
   698         return fMatch;
   701     case START_SET:
   702         {
   703             // Match may start on any char from a pre-computed set.
   704             U_ASSERT(fPattern->fMinMatchLen > 0);
   705             int64_t pos;
   706             UTEXT_SETNATIVEINDEX(fInputText, startPos);
   707             for (;;) {
   708                 c = UTEXT_NEXT32(fInputText);
   709                 pos = UTEXT_GETNATIVEINDEX(fInputText);
   710                 // c will be -1 (U_SENTINEL) at end of text, in which case we
   711                 // skip this next block (so we don't have a negative array index)
   712                 // and handle end of text in the following block.
   713                 if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c)) ||
   714                               (c>=256 && fPattern->fInitialChars->contains(c)))) {
   715                     MatchAt(startPos, FALSE, fDeferredStatus);
   716                     if (U_FAILURE(fDeferredStatus)) {
   717                         return FALSE;
   718                     }
   719                     if (fMatch) {
   720                         return TRUE;
   721                     }
   722                     UTEXT_SETNATIVEINDEX(fInputText, pos);
   723                 }
   724                 if (startPos >= testStartLimit) {
   725                     fMatch = FALSE;
   726                     fHitEnd = TRUE;
   727                     return FALSE;
   728                 }
   729                 startPos = pos;
   730 	            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
   731                     return FALSE;
   732             }
   733         }
   734         U_ASSERT(FALSE);
   736     case START_STRING:
   737     case START_CHAR:
   738         {
   739             // Match starts on exactly one char.
   740             U_ASSERT(fPattern->fMinMatchLen > 0);
   741             UChar32 theChar = fPattern->fInitialChar;
   742             int64_t pos;
   743             UTEXT_SETNATIVEINDEX(fInputText, startPos);
   744             for (;;) {
   745                 c = UTEXT_NEXT32(fInputText);
   746                 pos = UTEXT_GETNATIVEINDEX(fInputText);
   747                 if (c == theChar) {
   748                     MatchAt(startPos, FALSE, fDeferredStatus);
   749                     if (U_FAILURE(fDeferredStatus)) {
   750                         return FALSE;
   751                     }
   752                     if (fMatch) {
   753                         return TRUE;
   754                     }
   755                     UTEXT_SETNATIVEINDEX(fInputText, pos);
   756                 }
   757                 if (startPos >= testStartLimit) {
   758                     fMatch = FALSE;
   759                     fHitEnd = TRUE;
   760                     return FALSE;
   761                 }
   762                 startPos = pos;
   763 	            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
   764                     return FALSE;
   765            }
   766         }
   767         U_ASSERT(FALSE);
   769     case START_LINE:
   770         {
   771             UChar32  c;
   772             if (startPos == fAnchorStart) {
   773                 MatchAt(startPos, FALSE, fDeferredStatus);
   774                 if (U_FAILURE(fDeferredStatus)) {
   775                     return FALSE;
   776                 }
   777                 if (fMatch) {
   778                     return TRUE;
   779                 }
   780                 UTEXT_SETNATIVEINDEX(fInputText, startPos);
   781                 c = UTEXT_NEXT32(fInputText);
   782                 startPos = UTEXT_GETNATIVEINDEX(fInputText);
   783             } else {
   784                 UTEXT_SETNATIVEINDEX(fInputText, startPos);
   785                 c = UTEXT_PREVIOUS32(fInputText);
   786                 UTEXT_SETNATIVEINDEX(fInputText, startPos);
   787             }
   789             if (fPattern->fFlags & UREGEX_UNIX_LINES) {
   790                 for (;;) {
   791                     if (c == 0x0a) {
   792                             MatchAt(startPos, FALSE, fDeferredStatus);
   793                             if (U_FAILURE(fDeferredStatus)) {
   794                                 return FALSE;
   795                             }
   796                             if (fMatch) {
   797                                 return TRUE;
   798                             }
   799                             UTEXT_SETNATIVEINDEX(fInputText, startPos);
   800                     }
   801                     if (startPos >= testStartLimit) {
   802                         fMatch = FALSE;
   803                         fHitEnd = TRUE;
   804                         return FALSE;
   805                     }
   806                     c = UTEXT_NEXT32(fInputText);
   807                     startPos = UTEXT_GETNATIVEINDEX(fInputText);
   808                     // Note that it's perfectly OK for a pattern to have a zero-length
   809                     //   match at the end of a string, so we must make sure that the loop
   810                     //   runs with startPos == testStartLimit the last time through.
   811 		            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
   812                         return FALSE;
   813                 }
   814             } else {
   815                 for (;;) {
   816                     if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
   817                         ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) {
   818                             if (c == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
   819                                 (void)UTEXT_NEXT32(fInputText);
   820                                 startPos = UTEXT_GETNATIVEINDEX(fInputText);
   821                             }
   822                             MatchAt(startPos, FALSE, fDeferredStatus);
   823                             if (U_FAILURE(fDeferredStatus)) {
   824                                 return FALSE;
   825                             }
   826                             if (fMatch) {
   827                                 return TRUE;
   828                             }
   829                             UTEXT_SETNATIVEINDEX(fInputText, startPos);
   830                     }
   831                     if (startPos >= testStartLimit) {
   832                         fMatch = FALSE;
   833                         fHitEnd = TRUE;
   834                         return FALSE;
   835                     }
   836                     c = UTEXT_NEXT32(fInputText);
   837                     startPos = UTEXT_GETNATIVEINDEX(fInputText);
   838                     // Note that it's perfectly OK for a pattern to have a zero-length
   839                     //   match at the end of a string, so we must make sure that the loop
   840                     //   runs with startPos == testStartLimit the last time through.
   841 		            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
   842                         return FALSE;
   843                 }
   844             }
   845         }
   847     default:
   848         U_ASSERT(FALSE);
   849     }
   851     U_ASSERT(FALSE);
   852     return FALSE;
   853 }
   857 UBool RegexMatcher::find(int64_t start, UErrorCode &status) {
   858     if (U_FAILURE(status)) {
   859         return FALSE;
   860     }
   861     if (U_FAILURE(fDeferredStatus)) {
   862         status = fDeferredStatus;
   863         return FALSE;
   864     }
   865     this->reset();                        // Note:  Reset() is specified by Java Matcher documentation.
   866                                           //        This will reset the region to be the full input length.
   867     if (start < 0) {
   868         status = U_INDEX_OUTOFBOUNDS_ERROR;
   869         return FALSE;
   870     }
   872     int64_t nativeStart = start;
   873     if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
   874         status = U_INDEX_OUTOFBOUNDS_ERROR;
   875         return FALSE;
   876     }
   877     fMatchEnd = nativeStart;  
   878     return find();
   879 }
   882 //--------------------------------------------------------------------------------
   883 //
   884 //   findUsingChunk() -- like find(), but with the advance knowledge that the
   885 //                       entire string is available in the UText's chunk buffer.
   886 //
   887 //--------------------------------------------------------------------------------
   888 UBool RegexMatcher::findUsingChunk() {
   889     // Start at the position of the last match end.  (Will be zero if the
   890     //   matcher has been reset.
   891     //
   893     int32_t startPos = (int32_t)fMatchEnd;
   894     if (startPos==0) {
   895         startPos = (int32_t)fActiveStart;
   896     }
   898     const UChar *inputBuf = fInputText->chunkContents;
   900     if (fMatch) {
   901         // Save the position of any previous successful match.
   902         fLastMatchEnd = fMatchEnd;
   904         if (fMatchStart == fMatchEnd) {
   905             // Previous match had zero length.  Move start position up one position
   906             //  to avoid sending find() into a loop on zero-length matches.
   907             if (startPos >= fActiveLimit) {
   908                 fMatch = FALSE;
   909                 fHitEnd = TRUE;
   910                 return FALSE;
   911             }
   912             U16_FWD_1(inputBuf, startPos, fInputLength);
   913         }
   914     } else {
   915         if (fLastMatchEnd >= 0) {
   916             // A previous find() failed to match.  Don't try again.
   917             //   (without this test, a pattern with a zero-length match
   918             //    could match again at the end of an input string.)
   919             fHitEnd = TRUE;
   920             return FALSE;
   921         }
   922     }
   925     // Compute the position in the input string beyond which a match can not begin, because
   926     //   the minimum length match would extend past the end of the input.
   927     //   Note:  some patterns that cannot match anything will have fMinMatchLength==Max Int.
   928     //          Be aware of possible overflows if making changes here.
   929     int32_t testLen  = (int32_t)(fActiveLimit - fPattern->fMinMatchLen);
   930     if (startPos > testLen) {
   931         fMatch = FALSE;
   932         fHitEnd = TRUE;
   933         return FALSE;
   934     }
   936     UChar32  c;
   937     U_ASSERT(startPos >= 0);
   939     switch (fPattern->fStartType) {
   940     case START_NO_INFO:
   941         // No optimization was found. 
   942         //  Try a match at each input position.
   943         for (;;) {
   944             MatchChunkAt(startPos, FALSE, fDeferredStatus);
   945             if (U_FAILURE(fDeferredStatus)) {
   946                 return FALSE;
   947             }
   948             if (fMatch) {
   949                 return TRUE;
   950             }
   951             if (startPos >= testLen) {
   952                 fHitEnd = TRUE;
   953                 return FALSE;
   954             }
   955             U16_FWD_1(inputBuf, startPos, fActiveLimit);
   956             // Note that it's perfectly OK for a pattern to have a zero-length
   957             //   match at the end of a string, so we must make sure that the loop
   958             //   runs with startPos == testLen the last time through.
   959             if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
   960                 return FALSE;
   961         }
   962         U_ASSERT(FALSE);
   964     case START_START:
   965         // Matches are only possible at the start of the input string
   966         //   (pattern begins with ^ or \A)
   967         if (startPos > fActiveStart) {
   968             fMatch = FALSE;
   969             return FALSE;
   970         }
   971         MatchChunkAt(startPos, FALSE, fDeferredStatus);
   972         if (U_FAILURE(fDeferredStatus)) {
   973             return FALSE;
   974         }
   975         return fMatch;
   978     case START_SET:
   979     {
   980         // Match may start on any char from a pre-computed set.
   981         U_ASSERT(fPattern->fMinMatchLen > 0);
   982         for (;;) {
   983             int32_t pos = startPos;
   984             U16_NEXT(inputBuf, startPos, fActiveLimit, c);  // like c = inputBuf[startPos++];
   985             if ((c<256 && fPattern->fInitialChars8->contains(c)) ||
   986                 (c>=256 && fPattern->fInitialChars->contains(c))) {
   987                 MatchChunkAt(pos, FALSE, fDeferredStatus);
   988                 if (U_FAILURE(fDeferredStatus)) {
   989                     return FALSE;
   990                 }
   991                 if (fMatch) {
   992                     return TRUE;
   993                 }
   994             }
   995             if (pos >= testLen) {
   996                 fMatch = FALSE;
   997                 fHitEnd = TRUE;
   998                 return FALSE;
   999             }
  1000             if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
  1001                 return FALSE;
  1004         U_ASSERT(FALSE);
  1006     case START_STRING:
  1007     case START_CHAR:
  1009         // Match starts on exactly one char.
  1010         U_ASSERT(fPattern->fMinMatchLen > 0);
  1011         UChar32 theChar = fPattern->fInitialChar;
  1012         for (;;) {
  1013             int32_t pos = startPos;
  1014             U16_NEXT(inputBuf, startPos, fActiveLimit, c);  // like c = inputBuf[startPos++];
  1015             if (c == theChar) {
  1016                 MatchChunkAt(pos, FALSE, fDeferredStatus);
  1017                 if (U_FAILURE(fDeferredStatus)) {
  1018                     return FALSE;
  1020                 if (fMatch) {
  1021                     return TRUE;
  1024             if (pos >= testLen) {
  1025                 fMatch = FALSE;
  1026                 fHitEnd = TRUE;
  1027                 return FALSE;
  1029             if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
  1030                 return FALSE;
  1033         U_ASSERT(FALSE);
  1035     case START_LINE:
  1037         UChar32  c;
  1038         if (startPos == fAnchorStart) {
  1039             MatchChunkAt(startPos, FALSE, fDeferredStatus);
  1040             if (U_FAILURE(fDeferredStatus)) {
  1041                 return FALSE;
  1043             if (fMatch) {
  1044                 return TRUE;
  1046             U16_FWD_1(inputBuf, startPos, fActiveLimit);
  1049         if (fPattern->fFlags & UREGEX_UNIX_LINES) {
  1050             for (;;) {
  1051                 c = inputBuf[startPos-1];
  1052                 if (c == 0x0a) {
  1053                     MatchChunkAt(startPos, FALSE, fDeferredStatus);
  1054                     if (U_FAILURE(fDeferredStatus)) {
  1055                         return FALSE;
  1057                     if (fMatch) {
  1058                         return TRUE;
  1061                 if (startPos >= testLen) {
  1062                     fMatch = FALSE;
  1063                     fHitEnd = TRUE;
  1064                     return FALSE;
  1066                 U16_FWD_1(inputBuf, startPos, fActiveLimit);
  1067                 // Note that it's perfectly OK for a pattern to have a zero-length
  1068                 //   match at the end of a string, so we must make sure that the loop
  1069                 //   runs with startPos == testLen the last time through.
  1070 	            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
  1071                     return FALSE;
  1073         } else {
  1074             for (;;) {
  1075                 c = inputBuf[startPos-1];
  1076                 if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
  1077                     ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) {
  1078                     if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) {
  1079                         startPos++;
  1081                     MatchChunkAt(startPos, FALSE, fDeferredStatus);
  1082                     if (U_FAILURE(fDeferredStatus)) {
  1083                         return FALSE;
  1085                     if (fMatch) {
  1086                         return TRUE;
  1089                 if (startPos >= testLen) {
  1090                     fMatch = FALSE;
  1091                     fHitEnd = TRUE;
  1092                     return FALSE;
  1094                 U16_FWD_1(inputBuf, startPos, fActiveLimit);
  1095                 // Note that it's perfectly OK for a pattern to have a zero-length
  1096                 //   match at the end of a string, so we must make sure that the loop
  1097                 //   runs with startPos == testLen the last time through.
  1098 	            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
  1099                     return FALSE;
  1104     default:
  1105         U_ASSERT(FALSE);
  1108     U_ASSERT(FALSE);
  1109     return FALSE;
  1114 //--------------------------------------------------------------------------------
  1115 //
  1116 //  group()
  1117 //
  1118 //--------------------------------------------------------------------------------
  1119 UnicodeString RegexMatcher::group(UErrorCode &status) const {
  1120     return group(0, status);
  1123 //  Return immutable shallow clone
  1124 UText *RegexMatcher::group(UText *dest, int64_t &group_len, UErrorCode &status) const {
  1125     return group(0, dest, group_len, status);
  1128 //  Return immutable shallow clone
  1129 UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const {
  1130     group_len = 0;
  1131     UBool bailOut = FALSE;
  1132     if (U_FAILURE(status)) {
  1133         return dest;
  1135     if (U_FAILURE(fDeferredStatus)) {
  1136         status = fDeferredStatus;
  1137         bailOut = TRUE;
  1139     if (fMatch == FALSE) {
  1140         status = U_REGEX_INVALID_STATE;
  1141         bailOut = TRUE;
  1143     if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
  1144         status = U_INDEX_OUTOFBOUNDS_ERROR;
  1145         bailOut = TRUE;
  1148     if (bailOut) {
  1149         return (dest) ? dest : utext_openUChars(NULL, NULL, 0, &status);
  1152     int64_t s, e;
  1153     if (groupNum == 0) {
  1154         s = fMatchStart;
  1155         e = fMatchEnd;
  1156     } else {
  1157         int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
  1158         U_ASSERT(groupOffset < fPattern->fFrameSize);
  1159         U_ASSERT(groupOffset >= 0);
  1160         s = fFrame->fExtra[groupOffset];
  1161         e = fFrame->fExtra[groupOffset+1];
  1164     if (s < 0) {
  1165         // A capture group wasn't part of the match
  1166         return utext_clone(dest, fInputText, FALSE, TRUE, &status);
  1168     U_ASSERT(s <= e);
  1169     group_len = e - s;
  1171     dest = utext_clone(dest, fInputText, FALSE, TRUE, &status);
  1172     if (dest)
  1173         UTEXT_SETNATIVEINDEX(dest, s);
  1174     return dest;
  1177 UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
  1178     UnicodeString result;
  1179     if (U_FAILURE(status)) {
  1180         return result;
  1182     UText resultText = UTEXT_INITIALIZER;
  1183     utext_openUnicodeString(&resultText, &result, &status);
  1184     group(groupNum, &resultText, status);
  1185     utext_close(&resultText);
  1186     return result;
  1190 //  Return deep (mutable) clone
  1191 //		Technology Preview (as an API), but note that the UnicodeString API is implemented
  1192 //		using this function.
  1193 UText *RegexMatcher::group(int32_t groupNum, UText *dest, UErrorCode &status) const {
  1194     UBool bailOut = FALSE;
  1195     if (U_FAILURE(status)) {
  1196         return dest;
  1198     if (U_FAILURE(fDeferredStatus)) {
  1199         status = fDeferredStatus;
  1200         bailOut = TRUE;
  1203     if (fMatch == FALSE) {
  1204         status = U_REGEX_INVALID_STATE;
  1205         bailOut = TRUE;
  1207     if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
  1208         status = U_INDEX_OUTOFBOUNDS_ERROR;
  1209         bailOut = TRUE;
  1212     if (bailOut) {
  1213         if (dest) {
  1214             utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
  1215             return dest;
  1216         } else {
  1217             return utext_openUChars(NULL, NULL, 0, &status);
  1221     int64_t s, e;
  1222     if (groupNum == 0) {
  1223         s = fMatchStart;
  1224         e = fMatchEnd;
  1225     } else {
  1226         int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
  1227         U_ASSERT(groupOffset < fPattern->fFrameSize);
  1228         U_ASSERT(groupOffset >= 0);
  1229         s = fFrame->fExtra[groupOffset];
  1230         e = fFrame->fExtra[groupOffset+1];
  1233     if (s < 0) {
  1234         // A capture group wasn't part of the match 
  1235         if (dest) {
  1236             utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
  1237             return dest;
  1238         } else {
  1239             return utext_openUChars(NULL, NULL, 0, &status);
  1242     U_ASSERT(s <= e);
  1244     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
  1245         U_ASSERT(e <= fInputLength);
  1246         if (dest) {
  1247             utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents+s, (int32_t)(e-s), &status);
  1248         } else {
  1249             UText groupText = UTEXT_INITIALIZER;
  1250             utext_openUChars(&groupText, fInputText->chunkContents+s, e-s, &status);
  1251             dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status);
  1252             utext_close(&groupText);
  1254     } else {
  1255         int32_t len16;
  1256         if (UTEXT_USES_U16(fInputText)) {
  1257             len16 = (int32_t)(e-s);
  1258         } else {
  1259             UErrorCode lengthStatus = U_ZERO_ERROR;
  1260             len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
  1262         UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
  1263         if (groupChars == NULL) {
  1264             status = U_MEMORY_ALLOCATION_ERROR;
  1265             return dest;
  1267         utext_extract(fInputText, s, e, groupChars, len16+1, &status);
  1269         if (dest) {
  1270             utext_replace(dest, 0, utext_nativeLength(dest), groupChars, len16, &status);
  1271         } else {
  1272             UText groupText = UTEXT_INITIALIZER;
  1273             utext_openUChars(&groupText, groupChars, len16, &status);
  1274             dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status);
  1275             utext_close(&groupText);
  1278         uprv_free(groupChars);
  1280     return dest;
  1283 //--------------------------------------------------------------------------------
  1284 //
  1285 //  appendGroup() -- currently internal only, appends a group to a UText rather
  1286 //                   than replacing its contents
  1287 //
  1288 //--------------------------------------------------------------------------------
  1290 int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const {
  1291     if (U_FAILURE(status)) {
  1292         return 0;
  1294     if (U_FAILURE(fDeferredStatus)) {
  1295         status = fDeferredStatus;
  1296         return 0;
  1298     int64_t destLen = utext_nativeLength(dest);
  1300     if (fMatch == FALSE) {
  1301         status = U_REGEX_INVALID_STATE;
  1302         return utext_replace(dest, destLen, destLen, NULL, 0, &status);
  1304     if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
  1305         status = U_INDEX_OUTOFBOUNDS_ERROR;
  1306         return utext_replace(dest, destLen, destLen, NULL, 0, &status);
  1309     int64_t s, e;
  1310     if (groupNum == 0) {
  1311         s = fMatchStart;
  1312         e = fMatchEnd;
  1313     } else {
  1314         int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
  1315         U_ASSERT(groupOffset < fPattern->fFrameSize);
  1316         U_ASSERT(groupOffset >= 0);
  1317         s = fFrame->fExtra[groupOffset];
  1318         e = fFrame->fExtra[groupOffset+1];
  1321     if (s < 0) {
  1322         // A capture group wasn't part of the match 
  1323         return utext_replace(dest, destLen, destLen, NULL, 0, &status);
  1325     U_ASSERT(s <= e);
  1327     int64_t deltaLen;
  1328     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
  1329         U_ASSERT(e <= fInputLength);
  1330         deltaLen = utext_replace(dest, destLen, destLen, fInputText->chunkContents+s, (int32_t)(e-s), &status);
  1331     } else {
  1332         int32_t len16;
  1333         if (UTEXT_USES_U16(fInputText)) {
  1334             len16 = (int32_t)(e-s);
  1335         } else {
  1336             UErrorCode lengthStatus = U_ZERO_ERROR;
  1337             len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
  1339         UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
  1340         if (groupChars == NULL) {
  1341             status = U_MEMORY_ALLOCATION_ERROR;
  1342             return 0;
  1344         utext_extract(fInputText, s, e, groupChars, len16+1, &status);
  1346         deltaLen = utext_replace(dest, destLen, destLen, groupChars, len16, &status);
  1347         uprv_free(groupChars);
  1349     return deltaLen;
  1354 //--------------------------------------------------------------------------------
  1355 //
  1356 //  groupCount()
  1357 //
  1358 //--------------------------------------------------------------------------------
  1359 int32_t RegexMatcher::groupCount() const {
  1360     return fPattern->fGroupMap->size();
  1365 //--------------------------------------------------------------------------------
  1366 //
  1367 //  hasAnchoringBounds()
  1368 //
  1369 //--------------------------------------------------------------------------------
  1370 UBool RegexMatcher::hasAnchoringBounds() const {
  1371     return fAnchoringBounds;
  1375 //--------------------------------------------------------------------------------
  1376 //
  1377 //  hasTransparentBounds()
  1378 //
  1379 //--------------------------------------------------------------------------------
  1380 UBool RegexMatcher::hasTransparentBounds() const {
  1381     return fTransparentBounds;
  1386 //--------------------------------------------------------------------------------
  1387 //
  1388 //  hitEnd()
  1389 //
  1390 //--------------------------------------------------------------------------------
  1391 UBool RegexMatcher::hitEnd() const {
  1392     return fHitEnd;
  1396 //--------------------------------------------------------------------------------
  1397 //
  1398 //  input()
  1399 //
  1400 //--------------------------------------------------------------------------------
  1401 const UnicodeString &RegexMatcher::input() const {
  1402     if (!fInput) {
  1403         UErrorCode status = U_ZERO_ERROR;
  1404         int32_t len16;
  1405         if (UTEXT_USES_U16(fInputText)) {
  1406             len16 = (int32_t)fInputLength;
  1407         } else {
  1408             len16 = utext_extract(fInputText, 0, fInputLength, NULL, 0, &status);
  1409             status = U_ZERO_ERROR; // overflow, length status
  1411         UnicodeString *result = new UnicodeString(len16, 0, 0);
  1413         UChar *inputChars = result->getBuffer(len16);
  1414         utext_extract(fInputText, 0, fInputLength, inputChars, len16, &status); // unterminated warning
  1415         result->releaseBuffer(len16);
  1417         (*(const UnicodeString **)&fInput) = result; // pointer assignment, rather than operator=
  1420     return *fInput;
  1423 //--------------------------------------------------------------------------------
  1424 //
  1425 //  inputText()
  1426 //
  1427 //--------------------------------------------------------------------------------
  1428 UText *RegexMatcher::inputText() const {
  1429     return fInputText;
  1433 //--------------------------------------------------------------------------------
  1434 //
  1435 //  getInput() -- like inputText(), but makes a clone or copies into another UText
  1436 //
  1437 //--------------------------------------------------------------------------------
  1438 UText *RegexMatcher::getInput (UText *dest, UErrorCode &status) const {
  1439     UBool bailOut = FALSE;
  1440     if (U_FAILURE(status)) {
  1441         return dest;
  1443     if (U_FAILURE(fDeferredStatus)) {
  1444         status = fDeferredStatus;
  1445         bailOut = TRUE;
  1448     if (bailOut) {
  1449         if (dest) {
  1450             utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
  1451             return dest;
  1452         } else {
  1453             return utext_clone(NULL, fInputText, FALSE, TRUE, &status);
  1457     if (dest) {
  1458         if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
  1459             utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents, (int32_t)fInputLength, &status);
  1460         } else {
  1461             int32_t input16Len;
  1462             if (UTEXT_USES_U16(fInputText)) {
  1463                 input16Len = (int32_t)fInputLength;
  1464             } else {
  1465                 UErrorCode lengthStatus = U_ZERO_ERROR;
  1466                 input16Len = utext_extract(fInputText, 0, fInputLength, NULL, 0, &lengthStatus); // buffer overflow error
  1468             UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(input16Len));
  1469             if (inputChars == NULL) {
  1470                 return dest;
  1473             status = U_ZERO_ERROR;
  1474             utext_extract(fInputText, 0, fInputLength, inputChars, input16Len, &status); // not terminated warning
  1475             status = U_ZERO_ERROR;
  1476             utext_replace(dest, 0, utext_nativeLength(dest), inputChars, input16Len, &status);
  1478             uprv_free(inputChars);
  1480         return dest;
  1481     } else {
  1482         return utext_clone(NULL, fInputText, FALSE, TRUE, &status);
  1487 static UBool compat_SyncMutableUTextContents(UText *ut);
  1488 static UBool compat_SyncMutableUTextContents(UText *ut) {
  1489     UBool retVal = FALSE;
  1491     //  In the following test, we're really only interested in whether the UText should switch
  1492     //  between heap and stack allocation.  If length hasn't changed, we won't, so the chunkContents
  1493     //  will still point to the correct data.
  1494     if (utext_nativeLength(ut) != ut->nativeIndexingLimit) {
  1495         UnicodeString *us=(UnicodeString *)ut->context;
  1497         // Update to the latest length.
  1498         // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit).
  1499         int32_t newLength = us->length();
  1501         // Update the chunk description.
  1502         // The buffer may have switched between stack- and heap-based.
  1503         ut->chunkContents    = us->getBuffer();
  1504         ut->chunkLength      = newLength;
  1505         ut->chunkNativeLimit = newLength;
  1506         ut->nativeIndexingLimit = newLength;
  1507         retVal = TRUE;
  1510     return retVal;
  1513 //--------------------------------------------------------------------------------
  1514 //
  1515 //  lookingAt()
  1516 //
  1517 //--------------------------------------------------------------------------------
  1518 UBool RegexMatcher::lookingAt(UErrorCode &status) {
  1519     if (U_FAILURE(status)) {
  1520         return FALSE;
  1522     if (U_FAILURE(fDeferredStatus)) {
  1523         status = fDeferredStatus;
  1524         return FALSE;
  1527     if (fInputUniStrMaybeMutable) {
  1528         if (compat_SyncMutableUTextContents(fInputText)) {
  1529         fInputLength = utext_nativeLength(fInputText);
  1530         reset();
  1533     else {
  1534         resetPreserveRegion();
  1536     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
  1537         MatchChunkAt((int32_t)fActiveStart, FALSE, status);
  1538     } else {
  1539         MatchAt(fActiveStart, FALSE, status);
  1541     return fMatch;
  1545 UBool RegexMatcher::lookingAt(int64_t start, UErrorCode &status) {
  1546     if (U_FAILURE(status)) {
  1547         return FALSE;
  1549     if (U_FAILURE(fDeferredStatus)) {
  1550         status = fDeferredStatus;
  1551         return FALSE;
  1553     reset();
  1555     if (start < 0) {
  1556         status = U_INDEX_OUTOFBOUNDS_ERROR;
  1557         return FALSE;
  1560     if (fInputUniStrMaybeMutable) {
  1561         if (compat_SyncMutableUTextContents(fInputText)) {
  1562         fInputLength = utext_nativeLength(fInputText);
  1563         reset();
  1567     int64_t nativeStart;
  1568     nativeStart = start;
  1569     if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
  1570         status = U_INDEX_OUTOFBOUNDS_ERROR;
  1571         return FALSE;
  1574     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
  1575         MatchChunkAt((int32_t)nativeStart, FALSE, status);
  1576     } else {
  1577         MatchAt(nativeStart, FALSE, status);
  1579     return fMatch;
  1584 //--------------------------------------------------------------------------------
  1585 //
  1586 //  matches()
  1587 //
  1588 //--------------------------------------------------------------------------------
  1589 UBool RegexMatcher::matches(UErrorCode &status) {
  1590     if (U_FAILURE(status)) {
  1591         return FALSE;
  1593     if (U_FAILURE(fDeferredStatus)) {
  1594         status = fDeferredStatus;
  1595         return FALSE;
  1598     if (fInputUniStrMaybeMutable) {
  1599         if (compat_SyncMutableUTextContents(fInputText)) {
  1600         fInputLength = utext_nativeLength(fInputText);
  1601         reset();
  1604     else {
  1605         resetPreserveRegion();
  1608     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
  1609         MatchChunkAt((int32_t)fActiveStart, TRUE, status);
  1610     } else {
  1611         MatchAt(fActiveStart, TRUE, status);
  1613     return fMatch;
  1617 UBool RegexMatcher::matches(int64_t start, UErrorCode &status) {
  1618     if (U_FAILURE(status)) {
  1619         return FALSE;
  1621     if (U_FAILURE(fDeferredStatus)) {
  1622         status = fDeferredStatus;
  1623         return FALSE;
  1625     reset();
  1627     if (start < 0) {
  1628         status = U_INDEX_OUTOFBOUNDS_ERROR;
  1629         return FALSE;
  1632     if (fInputUniStrMaybeMutable) {
  1633         if (compat_SyncMutableUTextContents(fInputText)) {
  1634         fInputLength = utext_nativeLength(fInputText);
  1635         reset();
  1639     int64_t nativeStart;
  1640     nativeStart = start;
  1641     if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
  1642         status = U_INDEX_OUTOFBOUNDS_ERROR;
  1643         return FALSE;
  1646     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
  1647         MatchChunkAt((int32_t)nativeStart, TRUE, status);
  1648     } else {
  1649         MatchAt(nativeStart, TRUE, status);
  1651     return fMatch;
  1656 //--------------------------------------------------------------------------------
  1657 //
  1658 //    pattern
  1659 //
  1660 //--------------------------------------------------------------------------------
  1661 const RegexPattern &RegexMatcher::pattern() const {
  1662     return *fPattern;
  1667 //--------------------------------------------------------------------------------
  1668 //
  1669 //    region
  1670 //
  1671 //--------------------------------------------------------------------------------
  1672 RegexMatcher &RegexMatcher::region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status) {
  1673     if (U_FAILURE(status)) {
  1674         return *this;
  1677     if (regionStart>regionLimit || regionStart<0 || regionLimit<0) {
  1678         status = U_ILLEGAL_ARGUMENT_ERROR;
  1681     int64_t nativeStart = regionStart;
  1682     int64_t nativeLimit = regionLimit;
  1683     if (nativeStart > fInputLength || nativeLimit > fInputLength) {
  1684       status = U_ILLEGAL_ARGUMENT_ERROR;
  1687     if (startIndex == -1)
  1688       this->reset();
  1689     else
  1690       resetPreserveRegion();    
  1692     fRegionStart = nativeStart;
  1693     fRegionLimit = nativeLimit;
  1694     fActiveStart = nativeStart;
  1695     fActiveLimit = nativeLimit;
  1697     if (startIndex != -1) {
  1698       if (startIndex < fActiveStart || startIndex > fActiveLimit) {
  1699           status = U_INDEX_OUTOFBOUNDS_ERROR;
  1701       fMatchEnd = startIndex;  
  1704     if (!fTransparentBounds) {
  1705         fLookStart = nativeStart;
  1706         fLookLimit = nativeLimit;
  1708     if (fAnchoringBounds) {
  1709         fAnchorStart = nativeStart;
  1710         fAnchorLimit = nativeLimit;
  1712     return *this;
  1715 RegexMatcher &RegexMatcher::region(int64_t start, int64_t limit, UErrorCode &status) {
  1716   return region(start, limit, -1, status);
  1719 //--------------------------------------------------------------------------------
  1720 //
  1721 //    regionEnd
  1722 //
  1723 //--------------------------------------------------------------------------------
  1724 int32_t RegexMatcher::regionEnd() const {
  1725     return (int32_t)fRegionLimit;
  1728 int64_t RegexMatcher::regionEnd64() const {
  1729     return fRegionLimit;
  1732 //--------------------------------------------------------------------------------
  1733 //
  1734 //    regionStart
  1735 //
  1736 //--------------------------------------------------------------------------------
  1737 int32_t RegexMatcher::regionStart() const {
  1738     return (int32_t)fRegionStart;
  1741 int64_t RegexMatcher::regionStart64() const {
  1742     return fRegionStart;
  1746 //--------------------------------------------------------------------------------
  1747 //
  1748 //    replaceAll
  1749 //
  1750 //--------------------------------------------------------------------------------
  1751 UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) {
  1752     UText replacementText = UTEXT_INITIALIZER;
  1753     UText resultText = UTEXT_INITIALIZER;
  1754     UnicodeString resultString;
  1755     if (U_FAILURE(status)) {
  1756         return resultString;
  1759     utext_openConstUnicodeString(&replacementText, &replacement, &status);
  1760     utext_openUnicodeString(&resultText, &resultString, &status);
  1762     replaceAll(&replacementText, &resultText, status);
  1764     utext_close(&resultText);
  1765     utext_close(&replacementText);
  1767     return resultString;
  1771 //
  1772 //    replaceAll, UText mode
  1773 //
  1774 UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &status) {
  1775     if (U_FAILURE(status)) {
  1776         return dest;
  1778     if (U_FAILURE(fDeferredStatus)) {
  1779         status = fDeferredStatus;
  1780         return dest;
  1783     if (dest == NULL) {
  1784         UnicodeString emptyString;
  1785         UText empty = UTEXT_INITIALIZER;
  1787         utext_openUnicodeString(&empty, &emptyString, &status);
  1788         dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
  1789         utext_close(&empty);
  1792     if (U_SUCCESS(status)) {
  1793         reset();
  1794         while (find()) {
  1795             appendReplacement(dest, replacement, status);
  1796             if (U_FAILURE(status)) {
  1797                 break;
  1800         appendTail(dest, status);
  1803     return dest;
  1807 //--------------------------------------------------------------------------------
  1808 //
  1809 //    replaceFirst
  1810 //
  1811 //--------------------------------------------------------------------------------
  1812 UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) {
  1813     UText replacementText = UTEXT_INITIALIZER;
  1814     UText resultText = UTEXT_INITIALIZER;
  1815     UnicodeString resultString;
  1817     utext_openConstUnicodeString(&replacementText, &replacement, &status);
  1818     utext_openUnicodeString(&resultText, &resultString, &status);
  1820     replaceFirst(&replacementText, &resultText, status);
  1822     utext_close(&resultText);
  1823     utext_close(&replacementText);
  1825     return resultString;
  1828 //
  1829 //    replaceFirst, UText mode
  1830 //
  1831 UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &status) {
  1832     if (U_FAILURE(status)) {
  1833         return dest;
  1835     if (U_FAILURE(fDeferredStatus)) {
  1836         status = fDeferredStatus;
  1837         return dest;
  1840     reset();
  1841     if (!find()) {
  1842         return getInput(dest, status);
  1845     if (dest == NULL) {
  1846         UnicodeString emptyString;
  1847         UText empty = UTEXT_INITIALIZER;
  1849         utext_openUnicodeString(&empty, &emptyString, &status);
  1850         dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
  1851         utext_close(&empty);
  1854     appendReplacement(dest, replacement, status);
  1855     appendTail(dest, status);
  1857     return dest;
  1861 //--------------------------------------------------------------------------------
  1862 //
  1863 //     requireEnd
  1864 //
  1865 //--------------------------------------------------------------------------------
  1866 UBool RegexMatcher::requireEnd() const {
  1867     return fRequireEnd;
  1871 //--------------------------------------------------------------------------------
  1872 //
  1873 //     reset
  1874 //
  1875 //--------------------------------------------------------------------------------
  1876 RegexMatcher &RegexMatcher::reset() {
  1877     fRegionStart    = 0;
  1878     fRegionLimit    = fInputLength;
  1879     fActiveStart    = 0;
  1880     fActiveLimit    = fInputLength;
  1881     fAnchorStart    = 0;
  1882     fAnchorLimit    = fInputLength;
  1883     fLookStart      = 0;
  1884     fLookLimit      = fInputLength;
  1885     resetPreserveRegion();
  1886     return *this;
  1891 void RegexMatcher::resetPreserveRegion() {
  1892     fMatchStart     = 0;
  1893     fMatchEnd       = 0;
  1894     fLastMatchEnd   = -1;
  1895     fAppendPosition = 0;
  1896     fMatch          = FALSE;
  1897     fHitEnd         = FALSE;
  1898     fRequireEnd     = FALSE;
  1899     fTime           = 0;
  1900     fTickCounter    = TIMER_INITIAL_VALUE;
  1901     //resetStack(); // more expensive than it looks...
  1905 RegexMatcher &RegexMatcher::reset(const UnicodeString &input) {
  1906     fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStatus);
  1907     if (fPattern->fNeedsAltInput) {
  1908         fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
  1910     fInputLength = utext_nativeLength(fInputText);
  1912     reset();
  1913     delete fInput;
  1914     fInput = NULL;
  1916     //  Do the following for any UnicodeString.
  1917     //  This is for compatibility for those clients who modify the input string "live" during regex operations.
  1918     fInputUniStrMaybeMutable = TRUE;    
  1920     if (fWordBreakItr != NULL) {
  1921 #if UCONFIG_NO_BREAK_ITERATION==0
  1922         UErrorCode status = U_ZERO_ERROR;
  1923         fWordBreakItr->setText(fInputText, status);
  1924 #endif
  1926     return *this;
  1930 RegexMatcher &RegexMatcher::reset(UText *input) {
  1931     if (fInputText != input) {
  1932         fInputText = utext_clone(fInputText, input, FALSE, TRUE, &fDeferredStatus);
  1933         if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
  1934         fInputLength = utext_nativeLength(fInputText);
  1936         delete fInput;
  1937         fInput = NULL;
  1939         if (fWordBreakItr != NULL) {
  1940 #if UCONFIG_NO_BREAK_ITERATION==0
  1941             UErrorCode status = U_ZERO_ERROR;
  1942             fWordBreakItr->setText(input, status);
  1943 #endif
  1946     reset();
  1947     fInputUniStrMaybeMutable = FALSE;
  1949     return *this;
  1952 /*RegexMatcher &RegexMatcher::reset(const UChar *) {
  1953     fDeferredStatus = U_INTERNAL_PROGRAM_ERROR;
  1954     return *this;
  1955 }*/
  1957 RegexMatcher &RegexMatcher::reset(int64_t position, UErrorCode &status) {
  1958     if (U_FAILURE(status)) {
  1959         return *this;
  1961     reset();       // Reset also resets the region to be the entire string.
  1963     if (position < 0 || position > fActiveLimit) {
  1964         status = U_INDEX_OUTOFBOUNDS_ERROR;
  1965         return *this;
  1967     fMatchEnd = position;
  1968     return *this;
  1972 //--------------------------------------------------------------------------------
  1973 //
  1974 //    refresh
  1975 //
  1976 //--------------------------------------------------------------------------------
  1977 RegexMatcher &RegexMatcher::refreshInputText(UText *input, UErrorCode &status) {
  1978     if (U_FAILURE(status)) {
  1979         return *this;
  1981     if (input == NULL) {
  1982         status = U_ILLEGAL_ARGUMENT_ERROR;
  1983         return *this;
  1985     if (utext_nativeLength(fInputText) != utext_nativeLength(input)) {
  1986         status = U_ILLEGAL_ARGUMENT_ERROR;
  1987         return *this;
  1989     int64_t  pos = utext_getNativeIndex(fInputText);
  1990     //  Shallow read-only clone of the new UText into the existing input UText
  1991     fInputText = utext_clone(fInputText, input, FALSE, TRUE, &status);
  1992     if (U_FAILURE(status)) {
  1993         return *this;
  1995     utext_setNativeIndex(fInputText, pos);
  1997     if (fAltInputText != NULL) {
  1998         pos = utext_getNativeIndex(fAltInputText);
  1999         fAltInputText = utext_clone(fAltInputText, input, FALSE, TRUE, &status);
  2000         if (U_FAILURE(status)) {
  2001             return *this;
  2003         utext_setNativeIndex(fAltInputText, pos);
  2005     return *this;
  2010 //--------------------------------------------------------------------------------
  2011 //
  2012 //    setTrace
  2013 //
  2014 //--------------------------------------------------------------------------------
  2015 void RegexMatcher::setTrace(UBool state) {
  2016     fTraceDebug = state;
  2021 //---------------------------------------------------------------------
  2022 //
  2023 //   split
  2024 //
  2025 //---------------------------------------------------------------------
  2026 int32_t  RegexMatcher::split(const UnicodeString &input,
  2027         UnicodeString    dest[],
  2028         int32_t          destCapacity,
  2029         UErrorCode      &status)
  2031     UText inputText = UTEXT_INITIALIZER;
  2032     utext_openConstUnicodeString(&inputText, &input, &status);
  2033     if (U_FAILURE(status)) {
  2034         return 0;
  2037     UText **destText = (UText **)uprv_malloc(sizeof(UText*)*destCapacity);
  2038     if (destText == NULL) {
  2039         status = U_MEMORY_ALLOCATION_ERROR;
  2040         return 0;
  2042     int32_t i;
  2043     for (i = 0; i < destCapacity; i++) {
  2044         destText[i] = utext_openUnicodeString(NULL, &dest[i], &status);
  2047     int32_t fieldCount = split(&inputText, destText, destCapacity, status);
  2049     for (i = 0; i < destCapacity; i++) {
  2050         utext_close(destText[i]);
  2053     uprv_free(destText);
  2054     utext_close(&inputText);
  2055     return fieldCount;
  2058 //
  2059 //   split, UText mode
  2060 //
  2061 int32_t  RegexMatcher::split(UText *input,
  2062         UText           *dest[],
  2063         int32_t          destCapacity,
  2064         UErrorCode      &status)
  2066     //
  2067     // Check arguements for validity
  2068     //
  2069     if (U_FAILURE(status)) {
  2070         return 0;
  2071     };
  2073     if (destCapacity < 1) {
  2074         status = U_ILLEGAL_ARGUMENT_ERROR;
  2075         return 0;
  2078     //
  2079     // Reset for the input text
  2080     //
  2081     reset(input);
  2082     int64_t   nextOutputStringStart = 0;
  2083     if (fActiveLimit == 0) {
  2084         return 0;
  2087     //
  2088     // Loop through the input text, searching for the delimiter pattern
  2089     //
  2090     int32_t i;
  2091     int32_t numCaptureGroups = fPattern->fGroupMap->size();
  2092     for (i=0; ; i++) {
  2093         if (i>=destCapacity-1) {
  2094             // There is one or zero output string left.
  2095             // Fill the last output string with whatever is left from the input, then exit the loop.
  2096             //  ( i will be == destCapacity if we filled the output array while processing
  2097             //    capture groups of the delimiter expression, in which case we will discard the
  2098             //    last capture group saved in favor of the unprocessed remainder of the
  2099             //    input string.)
  2100             i = destCapacity-1;
  2101             if (fActiveLimit > nextOutputStringStart) {
  2102                 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
  2103                     if (dest[i]) {
  2104                         utext_replace(dest[i], 0, utext_nativeLength(dest[i]), 
  2105                                       input->chunkContents+nextOutputStringStart, 
  2106                                       (int32_t)(fActiveLimit-nextOutputStringStart), &status);
  2107                     } else {
  2108                         UText remainingText = UTEXT_INITIALIZER;
  2109                         utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart, 
  2110                                          fActiveLimit-nextOutputStringStart, &status);
  2111                         dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
  2112                         utext_close(&remainingText);
  2114                 } else {
  2115                     UErrorCode lengthStatus = U_ZERO_ERROR;
  2116                     int32_t remaining16Length = 
  2117                         utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
  2118                     UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
  2119                     if (remainingChars == NULL) {
  2120                         status = U_MEMORY_ALLOCATION_ERROR;
  2121                         break;
  2124                     utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
  2125                     if (dest[i]) {
  2126                         utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
  2127                     } else {
  2128                         UText remainingText = UTEXT_INITIALIZER;
  2129                         utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
  2130                         dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
  2131                         utext_close(&remainingText);
  2134                     uprv_free(remainingChars);
  2137             break;
  2139         if (find()) {
  2140             // We found another delimiter.  Move everything from where we started looking
  2141             //  up until the start of the delimiter into the next output string.
  2142             if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
  2143                 if (dest[i]) {
  2144                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]), 
  2145                                   input->chunkContents+nextOutputStringStart, 
  2146                                   (int32_t)(fMatchStart-nextOutputStringStart), &status);
  2147                 } else {
  2148                     UText remainingText = UTEXT_INITIALIZER;
  2149                     utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart, 
  2150                                       fMatchStart-nextOutputStringStart, &status);
  2151                     dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
  2152                     utext_close(&remainingText);
  2154             } else {
  2155                 UErrorCode lengthStatus = U_ZERO_ERROR;
  2156                 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fMatchStart, NULL, 0, &lengthStatus);
  2157                 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
  2158                 if (remainingChars == NULL) {
  2159                     status = U_MEMORY_ALLOCATION_ERROR;
  2160                     break;
  2162                 utext_extract(input, nextOutputStringStart, fMatchStart, remainingChars, remaining16Length+1, &status);
  2163                 if (dest[i]) {
  2164                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
  2165                 } else {
  2166                     UText remainingText = UTEXT_INITIALIZER;
  2167                     utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
  2168                     dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
  2169                     utext_close(&remainingText);
  2172                 uprv_free(remainingChars);
  2174             nextOutputStringStart = fMatchEnd;
  2176             // If the delimiter pattern has capturing parentheses, the captured
  2177             //  text goes out into the next n destination strings.
  2178             int32_t groupNum;
  2179             for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
  2180                 if (i >= destCapacity-2) {
  2181                     // Never fill the last available output string with capture group text.
  2182                     // It will filled with the last field, the remainder of the
  2183                     //  unsplit input text.
  2184                     break;
  2186                 i++;
  2187                 dest[i] = group(groupNum, dest[i], status);
  2190             if (nextOutputStringStart == fActiveLimit) {
  2191                 // The delimiter was at the end of the string.  We're done, but first
  2192                 // we output one last empty string, for the empty field following
  2193                 //   the delimiter at the end of input.
  2194                 if (i+1 < destCapacity) {
  2195                     ++i;
  2196                     if (dest[i] == NULL) {
  2197                         dest[i] = utext_openUChars(NULL, NULL, 0, &status);
  2198                     } else {
  2199                         static UChar emptyString[] = {(UChar)0};
  2200                         utext_replace(dest[i], 0, utext_nativeLength(dest[i]), emptyString, 0, &status);
  2203                 break;
  2207         else
  2209             // We ran off the end of the input while looking for the next delimiter.
  2210             // All the remaining text goes into the current output string.
  2211             if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
  2212                 if (dest[i]) {
  2213                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]), 
  2214                                   input->chunkContents+nextOutputStringStart, 
  2215                                   (int32_t)(fActiveLimit-nextOutputStringStart), &status);
  2216                 } else {
  2217                     UText remainingText = UTEXT_INITIALIZER;
  2218                     utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart, 
  2219                                      fActiveLimit-nextOutputStringStart, &status);
  2220                     dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
  2221                     utext_close(&remainingText);
  2223             } else {
  2224                 UErrorCode lengthStatus = U_ZERO_ERROR;
  2225                 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
  2226                 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
  2227                 if (remainingChars == NULL) {
  2228                     status = U_MEMORY_ALLOCATION_ERROR;
  2229                     break;
  2232                 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
  2233                 if (dest[i]) {
  2234                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
  2235                 } else {
  2236                     UText remainingText = UTEXT_INITIALIZER;
  2237                     utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
  2238                     dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
  2239                     utext_close(&remainingText);
  2242                 uprv_free(remainingChars);
  2244             break;
  2246         if (U_FAILURE(status)) {
  2247             break;
  2249     }   // end of for loop
  2250     return i+1;
  2254 //--------------------------------------------------------------------------------
  2255 //
  2256 //     start
  2257 //
  2258 //--------------------------------------------------------------------------------
  2259 int32_t RegexMatcher::start(UErrorCode &status) const {
  2260     return start(0, status);
  2263 int64_t RegexMatcher::start64(UErrorCode &status) const {
  2264     return start64(0, status);
  2267 //--------------------------------------------------------------------------------
  2268 //
  2269 //     start(int32_t group, UErrorCode &status)
  2270 //
  2271 //--------------------------------------------------------------------------------
  2273 int64_t RegexMatcher::start64(int32_t group, UErrorCode &status) const {
  2274     if (U_FAILURE(status)) {
  2275         return -1;
  2277     if (U_FAILURE(fDeferredStatus)) {
  2278         status = fDeferredStatus;
  2279         return -1;
  2281     if (fMatch == FALSE) {
  2282         status = U_REGEX_INVALID_STATE;
  2283         return -1;
  2285     if (group < 0 || group > fPattern->fGroupMap->size()) {
  2286         status = U_INDEX_OUTOFBOUNDS_ERROR;
  2287         return -1;
  2289     int64_t s;
  2290     if (group == 0) {
  2291         s = fMatchStart; 
  2292     } else {
  2293         int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
  2294         U_ASSERT(groupOffset < fPattern->fFrameSize);
  2295         U_ASSERT(groupOffset >= 0);
  2296         s = fFrame->fExtra[groupOffset];
  2299     return s;
  2303 int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const {
  2304     return (int32_t)start64(group, status);
  2307 //--------------------------------------------------------------------------------
  2308 //
  2309 //     useAnchoringBounds
  2310 //
  2311 //--------------------------------------------------------------------------------
  2312 RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) {
  2313     fAnchoringBounds = b;
  2314     fAnchorStart = (fAnchoringBounds ? fRegionStart : 0);
  2315     fAnchorLimit = (fAnchoringBounds ? fRegionLimit : fInputLength);
  2316     return *this;
  2320 //--------------------------------------------------------------------------------
  2321 //
  2322 //     useTransparentBounds
  2323 //
  2324 //--------------------------------------------------------------------------------
  2325 RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) {
  2326     fTransparentBounds = b;
  2327     fLookStart = (fTransparentBounds ? 0 : fRegionStart);
  2328     fLookLimit = (fTransparentBounds ? fInputLength : fRegionLimit);
  2329     return *this;
  2332 //--------------------------------------------------------------------------------
  2333 //
  2334 //     setTimeLimit
  2335 //
  2336 //--------------------------------------------------------------------------------
  2337 void RegexMatcher::setTimeLimit(int32_t limit, UErrorCode &status) {
  2338     if (U_FAILURE(status)) {
  2339         return;
  2341     if (U_FAILURE(fDeferredStatus)) {
  2342         status = fDeferredStatus;
  2343         return;
  2345     if (limit < 0) {
  2346         status = U_ILLEGAL_ARGUMENT_ERROR;
  2347         return;
  2349     fTimeLimit = limit;
  2353 //--------------------------------------------------------------------------------
  2354 //
  2355 //     getTimeLimit
  2356 //
  2357 //--------------------------------------------------------------------------------
  2358 int32_t RegexMatcher::getTimeLimit() const {
  2359     return fTimeLimit;
  2363 //--------------------------------------------------------------------------------
  2364 //
  2365 //     setStackLimit
  2366 //
  2367 //--------------------------------------------------------------------------------
  2368 void RegexMatcher::setStackLimit(int32_t limit, UErrorCode &status) {
  2369     if (U_FAILURE(status)) {
  2370         return;
  2372     if (U_FAILURE(fDeferredStatus)) {
  2373         status = fDeferredStatus;
  2374         return;
  2376     if (limit < 0) {
  2377         status = U_ILLEGAL_ARGUMENT_ERROR;
  2378         return;
  2381     // Reset the matcher.  This is needed here in case there is a current match
  2382     //    whose final stack frame (containing the match results, pointed to by fFrame) 
  2383     //    would be lost by resizing to a smaller stack size.
  2384     reset();
  2386     if (limit == 0) {
  2387         // Unlimited stack expansion
  2388         fStack->setMaxCapacity(0);
  2389     } else {
  2390         // Change the units of the limit  from bytes to ints, and bump the size up
  2391         //   to be big enough to hold at least one stack frame for the pattern, 
  2392         //   if it isn't there already.
  2393         int32_t adjustedLimit = limit / sizeof(int32_t);
  2394         if (adjustedLimit < fPattern->fFrameSize) {
  2395             adjustedLimit = fPattern->fFrameSize;
  2397         fStack->setMaxCapacity(adjustedLimit);
  2399     fStackLimit = limit;
  2403 //--------------------------------------------------------------------------------
  2404 //
  2405 //     getStackLimit
  2406 //
  2407 //--------------------------------------------------------------------------------
  2408 int32_t RegexMatcher::getStackLimit() const {
  2409     return fStackLimit;
  2413 //--------------------------------------------------------------------------------
  2414 //
  2415 //     setMatchCallback
  2416 //
  2417 //--------------------------------------------------------------------------------
  2418 void RegexMatcher::setMatchCallback(URegexMatchCallback     *callback,
  2419                                     const void              *context,
  2420                                     UErrorCode              &status) {
  2421     if (U_FAILURE(status)) {
  2422         return;
  2424     fCallbackFn = callback;
  2425     fCallbackContext = context;
  2429 //--------------------------------------------------------------------------------
  2430 //
  2431 //     getMatchCallback
  2432 //
  2433 //--------------------------------------------------------------------------------
  2434 void RegexMatcher::getMatchCallback(URegexMatchCallback   *&callback,
  2435                                   const void              *&context,
  2436                                   UErrorCode              &status) {
  2437     if (U_FAILURE(status)) {
  2438        return;
  2440     callback = fCallbackFn;
  2441     context  = fCallbackContext;
  2445 //--------------------------------------------------------------------------------
  2446 //
  2447 //     setMatchCallback
  2448 //
  2449 //--------------------------------------------------------------------------------
  2450 void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback      *callback,
  2451                                                 const void                      *context,
  2452                                                 UErrorCode                      &status) {
  2453     if (U_FAILURE(status)) {
  2454         return;
  2456     fFindProgressCallbackFn = callback;
  2457     fFindProgressCallbackContext = context;
  2461 //--------------------------------------------------------------------------------
  2462 //
  2463 //     getMatchCallback
  2464 //
  2465 //--------------------------------------------------------------------------------
  2466 void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback    *&callback,
  2467                                                 const void                    *&context,
  2468                                                 UErrorCode                    &status) {
  2469     if (U_FAILURE(status)) {
  2470        return;
  2472     callback = fFindProgressCallbackFn;
  2473     context  = fFindProgressCallbackContext;
  2477 //================================================================================
  2478 //
  2479 //    Code following this point in this file is the internal
  2480 //    Match Engine Implementation.
  2481 //
  2482 //================================================================================
  2485 //--------------------------------------------------------------------------------
  2486 //
  2487 //   resetStack
  2488 //           Discard any previous contents of the state save stack, and initialize a
  2489 //           new stack frame to all -1.  The -1s are needed for capture group limits, 
  2490 //           where they indicate that a group has not yet matched anything.
  2491 //--------------------------------------------------------------------------------
  2492 REStackFrame *RegexMatcher::resetStack() {
  2493     // Discard any previous contents of the state save stack, and initialize a
  2494     //  new stack frame with all -1 data.  The -1s are needed for capture group limits,
  2495     //  where they indicate that a group has not yet matched anything.
  2496     fStack->removeAllElements();
  2498     REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus);
  2499     int32_t i;
  2500     for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) {
  2501         iFrame->fExtra[i] = -1;
  2503     return iFrame;
  2508 //--------------------------------------------------------------------------------
  2509 //
  2510 //   isWordBoundary 
  2511 //                     in perl, "xab..cd..", \b is true at positions 0,3,5,7
  2512 //                     For us,
  2513 //                       If the current char is a combining mark,
  2514 //                          \b is FALSE.
  2515 //                       Else Scan backwards to the first non-combining char.
  2516 //                            We are at a boundary if the this char and the original chars are
  2517 //                               opposite in membership in \w set
  2518 //
  2519 //          parameters:   pos   - the current position in the input buffer
  2520 //
  2521 //              TODO:  double-check edge cases at region boundaries.
  2522 //
  2523 //--------------------------------------------------------------------------------
  2524 UBool RegexMatcher::isWordBoundary(int64_t pos) {
  2525     UBool isBoundary = FALSE;
  2526     UBool cIsWord    = FALSE;
  2528     if (pos >= fLookLimit) {
  2529         fHitEnd = TRUE;
  2530     } else {
  2531         // Determine whether char c at current position is a member of the word set of chars.
  2532         // If we're off the end of the string, behave as though we're not at a word char.
  2533         UTEXT_SETNATIVEINDEX(fInputText, pos);
  2534         UChar32  c = UTEXT_CURRENT32(fInputText);
  2535         if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
  2536             // Current char is a combining one.  Not a boundary.
  2537             return FALSE;
  2539         cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
  2542     // Back up until we come to a non-combining char, determine whether
  2543     //  that char is a word char.
  2544     UBool prevCIsWord = FALSE;
  2545     for (;;) {
  2546         if (UTEXT_GETNATIVEINDEX(fInputText) <= fLookStart) {
  2547             break;
  2549         UChar32 prevChar = UTEXT_PREVIOUS32(fInputText);
  2550         if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
  2551               || u_charType(prevChar) == U_FORMAT_CHAR)) {
  2552             prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
  2553             break;
  2556     isBoundary = cIsWord ^ prevCIsWord;
  2557     return isBoundary;
  2560 UBool RegexMatcher::isChunkWordBoundary(int32_t pos) {
  2561     UBool isBoundary = FALSE;
  2562     UBool cIsWord    = FALSE;
  2564     const UChar *inputBuf = fInputText->chunkContents;
  2566     if (pos >= fLookLimit) {
  2567         fHitEnd = TRUE;
  2568     } else {
  2569         // Determine whether char c at current position is a member of the word set of chars.
  2570         // If we're off the end of the string, behave as though we're not at a word char.
  2571         UChar32 c;
  2572         U16_GET(inputBuf, fLookStart, pos, fLookLimit, c);
  2573         if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
  2574             // Current char is a combining one.  Not a boundary.
  2575             return FALSE;
  2577         cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
  2580     // Back up until we come to a non-combining char, determine whether
  2581     //  that char is a word char.
  2582     UBool prevCIsWord = FALSE;
  2583     for (;;) {
  2584         if (pos <= fLookStart) {
  2585             break;
  2587         UChar32 prevChar;
  2588         U16_PREV(inputBuf, fLookStart, pos, prevChar);
  2589         if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
  2590               || u_charType(prevChar) == U_FORMAT_CHAR)) {
  2591             prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
  2592             break;
  2595     isBoundary = cIsWord ^ prevCIsWord;
  2596     return isBoundary;
  2599 //--------------------------------------------------------------------------------
  2600 //
  2601 //   isUWordBoundary 
  2602 //
  2603 //         Test for a word boundary using RBBI word break.
  2604 //
  2605 //          parameters:   pos   - the current position in the input buffer
  2606 //
  2607 //--------------------------------------------------------------------------------
  2608 UBool RegexMatcher::isUWordBoundary(int64_t pos) {
  2609     UBool       returnVal = FALSE;
  2610 #if UCONFIG_NO_BREAK_ITERATION==0
  2612     // If we haven't yet created a break iterator for this matcher, do it now.
  2613     if (fWordBreakItr == NULL) {
  2614         fWordBreakItr = 
  2615             (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), fDeferredStatus);
  2616         if (U_FAILURE(fDeferredStatus)) {
  2617             return FALSE;
  2619         fWordBreakItr->setText(fInputText, fDeferredStatus);
  2622     if (pos >= fLookLimit) {
  2623         fHitEnd = TRUE;
  2624         returnVal = TRUE;   // With Unicode word rules, only positions within the interior of "real"
  2625                             //    words are not boundaries.  All non-word chars stand by themselves,
  2626                             //    with word boundaries on both sides.
  2627     } else {
  2628         if (!UTEXT_USES_U16(fInputText)) {
  2629             // !!!: Would like a better way to do this!
  2630             UErrorCode status = U_ZERO_ERROR;
  2631             pos = utext_extract(fInputText, 0, pos, NULL, 0, &status);
  2633         returnVal = fWordBreakItr->isBoundary((int32_t)pos);
  2635 #endif
  2636     return   returnVal;
  2639 //--------------------------------------------------------------------------------
  2640 //
  2641 //   IncrementTime     This function is called once each TIMER_INITIAL_VALUE state
  2642 //                     saves. Increment the "time" counter, and call the
  2643 //                     user callback function if there is one installed.
  2644 //
  2645 //                     If the match operation needs to be aborted, either for a time-out
  2646 //                     or because the user callback asked for it, just set an error status.
  2647 //                     The engine will pick that up and stop in its outer loop.
  2648 //
  2649 //--------------------------------------------------------------------------------
  2650 void RegexMatcher::IncrementTime(UErrorCode &status) {
  2651     fTickCounter = TIMER_INITIAL_VALUE;
  2652     fTime++;
  2653     if (fCallbackFn != NULL) {
  2654         if ((*fCallbackFn)(fCallbackContext, fTime) == FALSE) {
  2655             status = U_REGEX_STOPPED_BY_CALLER;
  2656             return;
  2659     if (fTimeLimit > 0 && fTime >= fTimeLimit) {
  2660         status = U_REGEX_TIME_OUT;
  2664 //--------------------------------------------------------------------------------
  2665 //
  2666 //   ReportFindProgress     This function is called once for each advance in the target
  2667 //                          string from the find() function, and calls the user progress callback
  2668 //                          function if there is one installed.
  2669 //                          
  2670 //                          NOTE:  
  2671 //
  2672 //                          If the match operation needs to be aborted because the user
  2673 //                          callback asked for it, just set an error status.
  2674 //                          The engine will pick that up and stop in its outer loop.
  2675 //
  2676 //--------------------------------------------------------------------------------
  2677 UBool RegexMatcher::ReportFindProgress(int64_t matchIndex, UErrorCode &status) {
  2678     if (fFindProgressCallbackFn != NULL) {
  2679         if ((*fFindProgressCallbackFn)(fFindProgressCallbackContext, matchIndex) == FALSE) {
  2680             status = U_ZERO_ERROR /*U_REGEX_STOPPED_BY_CALLER*/;
  2681             return FALSE;
  2684     return TRUE;
  2687 //--------------------------------------------------------------------------------
  2688 //
  2689 //   StateSave
  2690 //       Make a new stack frame, initialized as a copy of the current stack frame.
  2691 //       Set the pattern index in the original stack frame from the operand value
  2692 //       in the opcode.  Execution of the engine continues with the state in
  2693 //       the newly created stack frame
  2694 //
  2695 //       Note that reserveBlock() may grow the stack, resulting in the
  2696 //       whole thing being relocated in memory.
  2697 //
  2698 //    Parameters:
  2699 //       fp           The top frame pointer when called.  At return, a new 
  2700 //                    fame will be present
  2701 //       savePatIdx   An index into the compiled pattern.  Goes into the original
  2702 //                    (not new) frame.  If execution ever back-tracks out of the
  2703 //                    new frame, this will be where we continue from in the pattern.
  2704 //    Return
  2705 //                    The new frame pointer.
  2706 //
  2707 //--------------------------------------------------------------------------------
  2708 inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status) {
  2709     // push storage for a new frame. 
  2710     int64_t *newFP = fStack->reserveBlock(fFrameSize, status);
  2711     if (newFP == NULL) {
  2712         // Failure on attempted stack expansion.
  2713         //   Stack function set some other error code, change it to a more
  2714         //   specific one for regular expressions.
  2715         status = U_REGEX_STACK_OVERFLOW;
  2716         // We need to return a writable stack frame, so just return the
  2717         //    previous frame.  The match operation will stop quickly
  2718         //    because of the error status, after which the frame will never
  2719         //    be looked at again.
  2720         return fp;
  2722     fp = (REStackFrame *)(newFP - fFrameSize);  // in case of realloc of stack.
  2724     // New stack frame = copy of old top frame.
  2725     int64_t *source = (int64_t *)fp;
  2726     int64_t *dest   = newFP;
  2727     for (;;) {
  2728         *dest++ = *source++;
  2729         if (source == newFP) {
  2730             break;
  2734     fTickCounter--;
  2735     if (fTickCounter <= 0) {
  2736        IncrementTime(status);    // Re-initializes fTickCounter
  2738     fp->fPatIdx = savePatIdx;
  2739     return (REStackFrame *)newFP;
  2743 //--------------------------------------------------------------------------------
  2744 //
  2745 //   MatchAt      This is the actual matching engine.
  2746 //
  2747 //                  startIdx:    begin matching a this index.
  2748 //                  toEnd:       if true, match must extend to end of the input region
  2749 //
  2750 //--------------------------------------------------------------------------------
  2751 void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
  2752     UBool       isMatch  = FALSE;      // True if the we have a match.
  2754     int64_t     backSearchIndex = U_INT64_MAX; // used after greedy single-character matches for searching backwards
  2756     int32_t     op;                    // Operation from the compiled pattern, split into
  2757     int32_t     opType;                //    the opcode
  2758     int32_t     opValue;               //    and the operand value.
  2760     #ifdef REGEX_RUN_DEBUG
  2761     if (fTraceDebug)
  2763         printf("MatchAt(startIdx=%ld)\n", startIdx);
  2764         printf("Original Pattern: ");
  2765         UChar32 c = utext_next32From(fPattern->fPattern, 0);
  2766         while (c != U_SENTINEL) {
  2767             if (c<32 || c>256) {
  2768                 c = '.';
  2770             REGEX_DUMP_DEBUG_PRINTF(("%c", c));
  2772             c = UTEXT_NEXT32(fPattern->fPattern);
  2774         printf("\n");
  2775         printf("Input String: ");
  2776         c = utext_next32From(fInputText, 0);
  2777         while (c != U_SENTINEL) {
  2778             if (c<32 || c>256) {
  2779                 c = '.';
  2781             printf("%c", c);
  2783             c = UTEXT_NEXT32(fInputText);
  2785         printf("\n");
  2786         printf("\n");
  2788     #endif
  2790     if (U_FAILURE(status)) {
  2791         return;
  2794     //  Cache frequently referenced items from the compiled pattern
  2795     //
  2796     int64_t             *pat           = fPattern->fCompiledPat->getBuffer();
  2798     const UChar         *litText       = fPattern->fLiteralText.getBuffer();
  2799     UVector             *sets          = fPattern->fSets;
  2801     fFrameSize = fPattern->fFrameSize;
  2802     REStackFrame        *fp            = resetStack();
  2804     fp->fPatIdx   = 0;
  2805     fp->fInputIdx = startIdx;
  2807     // Zero out the pattern's static data
  2808     int32_t i;
  2809     for (i = 0; i<fPattern->fDataSize; i++) {
  2810         fData[i] = 0;
  2813     //
  2814     //  Main loop for interpreting the compiled pattern.
  2815     //  One iteration of the loop per pattern operation performed.
  2816     //
  2817     for (;;) {
  2818 #if 0
  2819         if (_heapchk() != _HEAPOK) {
  2820             fprintf(stderr, "Heap Trouble\n");
  2822 #endif
  2824         op      = (int32_t)pat[fp->fPatIdx];
  2825         opType  = URX_TYPE(op);
  2826         opValue = URX_VAL(op);
  2827         #ifdef REGEX_RUN_DEBUG
  2828         if (fTraceDebug) {
  2829             UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
  2830             printf("inputIdx=%ld   inputChar=%x   sp=%3ld   activeLimit=%ld  ", fp->fInputIdx,
  2831                 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
  2832             fPattern->dumpOp(fp->fPatIdx);
  2834         #endif
  2835         fp->fPatIdx++;
  2837         switch (opType) {
  2840         case URX_NOP:
  2841             break;
  2844         case URX_BACKTRACK:
  2845             // Force a backtrack.  In some circumstances, the pattern compiler
  2846             //   will notice that the pattern can't possibly match anything, and will
  2847             //   emit one of these at that point.
  2848             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  2849             break;
  2852         case URX_ONECHAR:
  2853             if (fp->fInputIdx < fActiveLimit) {
  2854                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
  2855                 UChar32 c = UTEXT_NEXT32(fInputText);
  2856                 if (c == opValue) {
  2857                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
  2858                     break;
  2860             } else {
  2861                 fHitEnd = TRUE;
  2863             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  2864             break;
  2867         case URX_STRING:
  2869                 // Test input against a literal string.
  2870                 // Strings require two slots in the compiled pattern, one for the
  2871                 //   offset to the string text, and one for the length.
  2873                 int32_t   stringStartIdx = opValue;
  2874                 op      = (int32_t)pat[fp->fPatIdx];     // Fetch the second operand
  2875                 fp->fPatIdx++;
  2876                 opType    = URX_TYPE(op);
  2877                 int32_t stringLen = URX_VAL(op);
  2878                 U_ASSERT(opType == URX_STRING_LEN);
  2879                 U_ASSERT(stringLen >= 2);
  2881                 const UChar *patternString = litText+stringStartIdx;
  2882                 int32_t patternStringIndex = 0;
  2883                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
  2884                 UChar32 inputChar;
  2885                 UChar32 patternChar;
  2886                 UBool success = TRUE;
  2887                 while (patternStringIndex < stringLen) {
  2888                     if (UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
  2889                         success = FALSE;
  2890                         fHitEnd = TRUE;
  2891                         break;
  2893                     inputChar = UTEXT_NEXT32(fInputText);
  2894                     U16_NEXT(patternString, patternStringIndex, stringLen, patternChar);
  2895                     if (patternChar != inputChar) {
  2896                         success = FALSE;
  2897                         break;
  2901                 if (success) {
  2902                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
  2903                 } else {
  2904                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  2907             break;
  2910         case URX_STATE_SAVE:
  2911             fp = StateSave(fp, opValue, status);
  2912             break;
  2915         case URX_END:
  2916             // The match loop will exit via this path on a successful match,
  2917             //   when we reach the end of the pattern.
  2918             if (toEnd && fp->fInputIdx != fActiveLimit) {
  2919                 // The pattern matched, but not to the end of input.  Try some more.
  2920                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  2921                 break;
  2923             isMatch = TRUE;
  2924             goto  breakFromLoop;
  2926         // Start and End Capture stack frame variables are laid out out like this:
  2927             //  fp->fExtra[opValue]  - The start of a completed capture group
  2928             //             opValue+1 - The end   of a completed capture group
  2929             //             opValue+2 - the start of a capture group whose end
  2930             //                          has not yet been reached (and might not ever be).
  2931         case URX_START_CAPTURE:
  2932             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
  2933             fp->fExtra[opValue+2] = fp->fInputIdx;
  2934             break;
  2937         case URX_END_CAPTURE:
  2938             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
  2939             U_ASSERT(fp->fExtra[opValue+2] >= 0);            // Start pos for this group must be set.
  2940             fp->fExtra[opValue]   = fp->fExtra[opValue+2];   // Tentative start becomes real.
  2941             fp->fExtra[opValue+1] = fp->fInputIdx;           // End position
  2942             U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
  2943             break;
  2946         case URX_DOLLAR:                   //  $, test for End of line
  2947                                            //     or for position before new line at end of input
  2949                 if (fp->fInputIdx >= fAnchorLimit) {
  2950                     // We really are at the end of input.  Success.
  2951                     fHitEnd = TRUE;
  2952                     fRequireEnd = TRUE;
  2953                     break;
  2956                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
  2958                 // If we are positioned just before a new-line that is located at the
  2959                 //   end of input, succeed.
  2960                 UChar32 c = UTEXT_NEXT32(fInputText);
  2961                 if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
  2962                     if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) {
  2963                         // If not in the middle of a CR/LF sequence
  2964                       if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) {
  2965                             // At new-line at end of input. Success
  2966                             fHitEnd = TRUE;
  2967                             fRequireEnd = TRUE;
  2969                             break;
  2972                 } else {
  2973                     UChar32 nextC = UTEXT_NEXT32(fInputText);
  2974                     if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
  2975                         fHitEnd = TRUE;
  2976                         fRequireEnd = TRUE;
  2977                         break;                         // At CR/LF at end of input.  Success
  2981                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  2983             break;
  2986          case URX_DOLLAR_D:                   //  $, test for End of Line, in UNIX_LINES mode.
  2987             if (fp->fInputIdx >= fAnchorLimit) {
  2988                 // Off the end of input.  Success.
  2989                 fHitEnd = TRUE;
  2990                 fRequireEnd = TRUE;
  2991                 break;
  2992             } else {
  2993                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
  2994                 UChar32 c = UTEXT_NEXT32(fInputText);
  2995                 // Either at the last character of input, or off the end.
  2996                 if (c == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) == fAnchorLimit) {
  2997                     fHitEnd = TRUE;
  2998                     fRequireEnd = TRUE;
  2999                     break;
  3003             // Not at end of input.  Back-track out.
  3004             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  3005             break;
  3008          case URX_DOLLAR_M:                //  $, test for End of line in multi-line mode
  3010                  if (fp->fInputIdx >= fAnchorLimit) {
  3011                      // We really are at the end of input.  Success.
  3012                      fHitEnd = TRUE;
  3013                      fRequireEnd = TRUE;
  3014                      break;
  3016                  // If we are positioned just before a new-line, succeed.
  3017                  // It makes no difference where the new-line is within the input.
  3018                  UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
  3019                  UChar32 c = UTEXT_CURRENT32(fInputText);
  3020                  if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) {
  3021                      // At a line end, except for the odd chance of  being in the middle of a CR/LF sequence
  3022                      //  In multi-line mode, hitting a new-line just before the end of input does not
  3023                      //   set the hitEnd or requireEnd flags
  3024                      if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && UTEXT_PREVIOUS32(fInputText)==0x0d)) {
  3025                         break;
  3028                  // not at a new line.  Fail.
  3029                  fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  3031              break;
  3034          case URX_DOLLAR_MD:                //  $, test for End of line in multi-line and UNIX_LINES mode
  3036                  if (fp->fInputIdx >= fAnchorLimit) {
  3037                      // We really are at the end of input.  Success.
  3038                      fHitEnd = TRUE;
  3039                      fRequireEnd = TRUE;  // Java set requireEnd in this case, even though
  3040                      break;               //   adding a new-line would not lose the match.
  3042                  // If we are not positioned just before a new-line, the test fails; backtrack out.
  3043                  // It makes no difference where the new-line is within the input.
  3044                  UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
  3045                  if (UTEXT_CURRENT32(fInputText) != 0x0a) {
  3046                      fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  3049              break;
  3052        case URX_CARET:                    //  ^, test for start of line
  3053             if (fp->fInputIdx != fAnchorStart) {
  3054                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  3056             break;
  3059        case URX_CARET_M:                   //  ^, test for start of line in mulit-line mode
  3061                if (fp->fInputIdx == fAnchorStart) {
  3062                    // We are at the start input.  Success.
  3063                    break;
  3065                // Check whether character just before the current pos is a new-line
  3066                //   unless we are at the end of input
  3067                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
  3068                UChar32  c = UTEXT_PREVIOUS32(fInputText); 
  3069                if ((fp->fInputIdx < fAnchorLimit) && 
  3070                    ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
  3071                    //  It's a new-line.  ^ is true.  Success.
  3072                    //  TODO:  what should be done with positions between a CR and LF?
  3073                    break;
  3075                // Not at the start of a line.  Fail.
  3076                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  3078            break;
  3081        case URX_CARET_M_UNIX:       //  ^, test for start of line in mulit-line + Unix-line mode
  3083                U_ASSERT(fp->fInputIdx >= fAnchorStart);
  3084                if (fp->fInputIdx <= fAnchorStart) {
  3085                    // We are at the start input.  Success.
  3086                    break;
  3088                // Check whether character just before the current pos is a new-line
  3089                U_ASSERT(fp->fInputIdx <= fAnchorLimit);
  3090                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
  3091                UChar32  c = UTEXT_PREVIOUS32(fInputText);
  3092                if (c != 0x0a) {
  3093                    // Not at the start of a line.  Back-track out.
  3094                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  3097            break;
  3099         case URX_BACKSLASH_B:          // Test for word boundaries
  3101                 UBool success = isWordBoundary(fp->fInputIdx);
  3102                 success ^= (UBool)(opValue != 0);     // flip sense for \B
  3103                 if (!success) {
  3104                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  3107             break;
  3110         case URX_BACKSLASH_BU:          // Test for word boundaries, Unicode-style
  3112                 UBool success = isUWordBoundary(fp->fInputIdx);
  3113                 success ^= (UBool)(opValue != 0);     // flip sense for \B
  3114                 if (!success) {
  3115                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  3118             break;
  3121         case URX_BACKSLASH_D:            // Test for decimal digit
  3123                 if (fp->fInputIdx >= fActiveLimit) {
  3124                     fHitEnd = TRUE;
  3125                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  3126                     break;
  3129                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
  3131                 UChar32 c = UTEXT_NEXT32(fInputText);
  3132                 int8_t ctype = u_charType(c);     // TODO:  make a unicode set for this.  Will be faster.
  3133                 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
  3134                 success ^= (UBool)(opValue != 0);        // flip sense for \D
  3135                 if (success) {
  3136                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
  3137                 } else {
  3138                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  3141             break;
  3144         case URX_BACKSLASH_G:          // Test for position at end of previous match
  3145             if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
  3146                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  3148             break;
  3151         case URX_BACKSLASH_X:     
  3152             //  Match a Grapheme, as defined by Unicode TR 29.
  3153             //  Differs slightly from Perl, which consumes combining marks independently
  3154             //    of context.
  3157                 // Fail if at end of input
  3158                 if (fp->fInputIdx >= fActiveLimit) {
  3159                     fHitEnd = TRUE;
  3160                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  3161                     break;
  3164                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
  3166                 // Examine (and consume) the current char.
  3167                 //   Dispatch into a little state machine, based on the char.
  3168                 UChar32  c;
  3169                 c = UTEXT_NEXT32(fInputText);
  3170                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
  3171                 UnicodeSet **sets = fPattern->fStaticSets;
  3172                 if (sets[URX_GC_NORMAL]->contains(c))  goto GC_Extend;
  3173                 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
  3174                 if (sets[URX_GC_L]->contains(c))       goto GC_L;
  3175                 if (sets[URX_GC_LV]->contains(c))      goto GC_V;
  3176                 if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
  3177                 if (sets[URX_GC_V]->contains(c))       goto GC_V;
  3178                 if (sets[URX_GC_T]->contains(c))       goto GC_T;
  3179                 goto GC_Extend;
  3183 GC_L:
  3184                 if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
  3185                 c = UTEXT_NEXT32(fInputText);
  3186                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
  3187                 if (sets[URX_GC_L]->contains(c))       goto GC_L;
  3188                 if (sets[URX_GC_LV]->contains(c))      goto GC_V;
  3189                 if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
  3190                 if (sets[URX_GC_V]->contains(c))       goto GC_V;
  3191                 (void)UTEXT_PREVIOUS32(fInputText);
  3192                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
  3193                 goto GC_Extend;
  3195 GC_V:
  3196                 if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
  3197                 c = UTEXT_NEXT32(fInputText);
  3198                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
  3199                 if (sets[URX_GC_V]->contains(c))       goto GC_V;
  3200                 if (sets[URX_GC_T]->contains(c))       goto GC_T;
  3201                 (void)UTEXT_PREVIOUS32(fInputText);
  3202                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
  3203                 goto GC_Extend;
  3205 GC_T:
  3206                 if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
  3207                 c = UTEXT_NEXT32(fInputText);
  3208                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
  3209                 if (sets[URX_GC_T]->contains(c))       goto GC_T;
  3210                 (void)UTEXT_PREVIOUS32(fInputText);
  3211                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
  3212                 goto GC_Extend;
  3214 GC_Extend:
  3215                 // Combining characters are consumed here
  3216                 for (;;) {
  3217                     if (fp->fInputIdx >= fActiveLimit) {
  3218                         break;
  3220                     c = UTEXT_CURRENT32(fInputText);
  3221                     if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
  3222                         break;
  3224                     (void)UTEXT_NEXT32(fInputText);
  3225                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
  3227                 goto GC_Done;
  3229 GC_Control:
  3230                 // Most control chars stand alone (don't combine with combining chars),  
  3231                 //   except for that CR/LF sequence is a single grapheme cluster.
  3232                 if (c == 0x0d && fp->fInputIdx < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
  3233                     c = UTEXT_NEXT32(fInputText);
  3234                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
  3237 GC_Done:
  3238                 if (fp->fInputIdx >= fActiveLimit) {
  3239                     fHitEnd = TRUE;
  3241                 break;
  3247         case URX_BACKSLASH_Z:          // Test for end of Input
  3248             if (fp->fInputIdx < fAnchorLimit) {
  3249                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  3250             } else {
  3251                 fHitEnd = TRUE;
  3252                 fRequireEnd = TRUE;
  3254             break;
  3258         case URX_STATIC_SETREF:
  3260                 // Test input character against one of the predefined sets
  3261                 //    (Word Characters, for example)
  3262                 // The high bit of the op value is a flag for the match polarity.
  3263                 //    0:   success if input char is in set.
  3264                 //    1:   success if input char is not in set.
  3265                 if (fp->fInputIdx >= fActiveLimit) {
  3266                     fHitEnd = TRUE;
  3267                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  3268                     break;
  3271                 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);  
  3272                 opValue &= ~URX_NEG_SET;
  3273                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
  3275                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
  3276                 UChar32 c = UTEXT_NEXT32(fInputText);
  3277                 if (c < 256) {
  3278                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
  3279                     if (s8->contains(c)) {
  3280                         success = !success;
  3282                 } else {
  3283                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
  3284                     if (s->contains(c)) {
  3285                         success = !success;
  3288                 if (success) {
  3289                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
  3290                 } else {
  3291                     // the character wasn't in the set.
  3292                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  3295             break;
  3298         case URX_STAT_SETREF_N:
  3300                 // Test input character for NOT being a member of  one of 
  3301                 //    the predefined sets (Word Characters, for example)
  3302                 if (fp->fInputIdx >= fActiveLimit) {
  3303                     fHitEnd = TRUE;
  3304                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  3305                     break;
  3308                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
  3310                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
  3312                 UChar32 c = UTEXT_NEXT32(fInputText);
  3313                 if (c < 256) {
  3314                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
  3315                     if (s8->contains(c) == FALSE) {
  3316                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
  3317                         break;
  3319                 } else {
  3320                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
  3321                     if (s->contains(c) == FALSE) {
  3322                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
  3323                         break;
  3326                 // the character wasn't in the set.
  3327                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  3329             break;
  3332         case URX_SETREF:
  3333             if (fp->fInputIdx >= fActiveLimit) {
  3334                 fHitEnd = TRUE;
  3335                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  3336                 break;
  3337             } else {
  3338                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
  3340                 // There is input left.  Pick up one char and test it for set membership.
  3341                 UChar32 c = UTEXT_NEXT32(fInputText);
  3342                 U_ASSERT(opValue > 0 && opValue < sets->size());
  3343                 if (c<256) {
  3344                     Regex8BitSet *s8 = &fPattern->fSets8[opValue];
  3345                     if (s8->contains(c)) {
  3346                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
  3347                         break;
  3349                 } else {
  3350                     UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
  3351                     if (s->contains(c)) {
  3352                         // The character is in the set.  A Match.
  3353                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
  3354                         break;
  3358                 // the character wasn't in the set.
  3359                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  3361             break;
  3364         case URX_DOTANY:
  3366                 // . matches anything, but stops at end-of-line.
  3367                 if (fp->fInputIdx >= fActiveLimit) {
  3368                     // At end of input.  Match failed.  Backtrack out.
  3369                     fHitEnd = TRUE;
  3370                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  3371                     break;
  3374                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
  3376                 // There is input left.  Advance over one char, unless we've hit end-of-line
  3377                 UChar32 c = UTEXT_NEXT32(fInputText);
  3378                 if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
  3379                     ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
  3380                     // End of line in normal mode.   . does not match.
  3381                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  3382                     break;
  3384                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
  3386             break;
  3389         case URX_DOTANY_ALL:
  3391                 // ., in dot-matches-all (including new lines) mode
  3392                 if (fp->fInputIdx >= fActiveLimit) {
  3393                     // At end of input.  Match failed.  Backtrack out.
  3394                     fHitEnd = TRUE;
  3395                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  3396                     break;
  3399                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
  3401                 // There is input left.  Advance over one char, except if we are
  3402                 //   at a cr/lf, advance over both of them.
  3403                 UChar32 c; 
  3404                 c = UTEXT_NEXT32(fInputText);
  3405                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
  3406                 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
  3407                     // In the case of a CR/LF, we need to advance over both.
  3408                     UChar32 nextc = UTEXT_CURRENT32(fInputText);
  3409                     if (nextc == 0x0a) {
  3410                         (void)UTEXT_NEXT32(fInputText);
  3411                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
  3415             break;
  3418         case URX_DOTANY_UNIX:
  3420                 // '.' operator, matches all, but stops at end-of-line.
  3421                 //   UNIX_LINES mode, so 0x0a is the only recognized line ending.
  3422                 if (fp->fInputIdx >= fActiveLimit) {
  3423                     // At end of input.  Match failed.  Backtrack out.
  3424                     fHitEnd = TRUE;
  3425                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  3426                     break;
  3429                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
  3431                 // There is input left.  Advance over one char, unless we've hit end-of-line
  3432                 UChar32 c = UTEXT_NEXT32(fInputText);
  3433                 if (c == 0x0a) {
  3434                     // End of line in normal mode.   '.' does not match the \n
  3435                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  3436                 } else {
  3437                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
  3440             break;
  3443         case URX_JMP:
  3444             fp->fPatIdx = opValue;
  3445             break;
  3447         case URX_FAIL:
  3448             isMatch = FALSE;
  3449             goto breakFromLoop;
  3451         case URX_JMP_SAV:
  3452             U_ASSERT(opValue < fPattern->fCompiledPat->size());
  3453             fp = StateSave(fp, fp->fPatIdx, status);       // State save to loc following current
  3454             fp->fPatIdx = opValue;                         // Then JMP.
  3455             break;
  3457         case URX_JMP_SAV_X:
  3458             // This opcode is used with (x)+, when x can match a zero length string.
  3459             // Same as JMP_SAV, except conditional on the match having made forward progress.
  3460             // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
  3461             //   data address of the input position at the start of the loop.
  3463                 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
  3464                 int32_t  stoOp = (int32_t)pat[opValue-1];
  3465                 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
  3466                 int32_t  frameLoc = URX_VAL(stoOp);
  3467                 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
  3468                 int64_t prevInputIdx = fp->fExtra[frameLoc];
  3469                 U_ASSERT(prevInputIdx <= fp->fInputIdx);
  3470                 if (prevInputIdx < fp->fInputIdx) {
  3471                     // The match did make progress.  Repeat the loop.
  3472                     fp = StateSave(fp, fp->fPatIdx, status);  // State save to loc following current
  3473                     fp->fPatIdx = opValue;
  3474                     fp->fExtra[frameLoc] = fp->fInputIdx;
  3476                 // If the input position did not advance, we do nothing here,
  3477                 //   execution will fall out of the loop.
  3479             break;
  3481         case URX_CTR_INIT:
  3483                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
  3484                 fp->fExtra[opValue] = 0;                 //  Set the loop counter variable to zero
  3486                 // Pick up the three extra operands that CTR_INIT has, and
  3487                 //    skip the pattern location counter past 
  3488                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
  3489                 fp->fPatIdx += 3;
  3490                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
  3491                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
  3492                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
  3493                 U_ASSERT(minCount>=0);
  3494                 U_ASSERT(maxCount>=minCount || maxCount==-1);
  3495                 U_ASSERT(loopLoc>=fp->fPatIdx);
  3497                 if (minCount == 0) {
  3498                     fp = StateSave(fp, loopLoc+1, status);
  3500                 if (maxCount == -1) {
  3501                     fp->fExtra[opValue+1] = fp->fInputIdx;   //  For loop breaking.
  3502                 } else if (maxCount == 0) {
  3503                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  3506             break;
  3508         case URX_CTR_LOOP:
  3510                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
  3511                 int32_t initOp = (int32_t)pat[opValue];
  3512                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
  3513                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
  3514                 int32_t minCount  = (int32_t)pat[opValue+2];
  3515                 int32_t maxCount  = (int32_t)pat[opValue+3];
  3516                 (*pCounter)++;
  3517                 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
  3518                     U_ASSERT(*pCounter == maxCount);
  3519                     break;
  3521                 if (*pCounter >= minCount) {
  3522                     if (maxCount == -1) {
  3523                         // Loop has no hard upper bound.
  3524                         // Check that it is progressing through the input, break if it is not.
  3525                         int64_t *pLastInputIdx =  &fp->fExtra[URX_VAL(initOp) + 1];
  3526                         if (fp->fInputIdx == *pLastInputIdx) {
  3527                             break;
  3528                         } else {
  3529                             *pLastInputIdx = fp->fInputIdx;
  3532                     fp = StateSave(fp, fp->fPatIdx, status);
  3534                 fp->fPatIdx = opValue + 4;    // Loop back.
  3536             break;
  3538         case URX_CTR_INIT_NG:
  3540                 // Initialize a non-greedy loop
  3541                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
  3542                 fp->fExtra[opValue] = 0;                 //  Set the loop counter variable to zero
  3544                 // Pick up the three extra operands that CTR_INIT_NG has, and
  3545                 //    skip the pattern location counter past 
  3546                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
  3547                 fp->fPatIdx += 3;
  3548                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
  3549                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
  3550                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
  3551                 U_ASSERT(minCount>=0);
  3552                 U_ASSERT(maxCount>=minCount || maxCount==-1);
  3553                 U_ASSERT(loopLoc>fp->fPatIdx);
  3554                 if (maxCount == -1) {
  3555                     fp->fExtra[opValue+1] = fp->fInputIdx;   //  Save initial input index for loop breaking.
  3558                 if (minCount == 0) {
  3559                     if (maxCount != 0) {
  3560                         fp = StateSave(fp, fp->fPatIdx, status);
  3562                     fp->fPatIdx = loopLoc+1;   // Continue with stuff after repeated block
  3565             break;
  3567         case URX_CTR_LOOP_NG:
  3569                 // Non-greedy {min, max} loops
  3570                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
  3571                 int32_t initOp = (int32_t)pat[opValue];
  3572                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
  3573                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
  3574                 int32_t minCount  = (int32_t)pat[opValue+2];
  3575                 int32_t maxCount  = (int32_t)pat[opValue+3];
  3577                 (*pCounter)++;
  3578                 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
  3579                     // The loop has matched the maximum permitted number of times.
  3580                     //   Break out of here with no action.  Matching will
  3581                     //   continue with the following pattern.
  3582                     U_ASSERT(*pCounter == maxCount);
  3583                     break;
  3586                 if (*pCounter < minCount) {
  3587                     // We haven't met the minimum number of matches yet.
  3588                     //   Loop back for another one.
  3589                     fp->fPatIdx = opValue + 4;    // Loop back.
  3590                 } else {
  3591                     // We do have the minimum number of matches.
  3593                     // If there is no upper bound on the loop iterations, check that the input index
  3594                     // is progressing, and stop the loop if it is not.
  3595                     if (maxCount == -1) {
  3596                         int64_t *pLastInputIdx =  &fp->fExtra[URX_VAL(initOp) + 1];
  3597                         if (fp->fInputIdx == *pLastInputIdx) {
  3598                             break;
  3600                         *pLastInputIdx = fp->fInputIdx;
  3603                     // Loop Continuation: we will fall into the pattern following the loop
  3604                     //   (non-greedy, don't execute loop body first), but first do
  3605                     //   a state save to the top of the loop, so that a match failure
  3606                     //   in the following pattern will try another iteration of the loop.
  3607                     fp = StateSave(fp, opValue + 4, status);
  3610             break;
  3612         case URX_STO_SP:
  3613             U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
  3614             fData[opValue] = fStack->size();
  3615             break;
  3617         case URX_LD_SP:
  3619                 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
  3620                 int32_t newStackSize = (int32_t)fData[opValue];
  3621                 U_ASSERT(newStackSize <= fStack->size());
  3622                 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
  3623                 if (newFP == (int64_t *)fp) {
  3624                     break;
  3626                 int32_t i;
  3627                 for (i=0; i<fFrameSize; i++) {
  3628                     newFP[i] = ((int64_t *)fp)[i];
  3630                 fp = (REStackFrame *)newFP;
  3631                 fStack->setSize(newStackSize);
  3633             break;
  3635         case URX_BACKREF:
  3637                 U_ASSERT(opValue < fFrameSize);
  3638                 int64_t groupStartIdx = fp->fExtra[opValue];
  3639                 int64_t groupEndIdx   = fp->fExtra[opValue+1];
  3640                 U_ASSERT(groupStartIdx <= groupEndIdx);
  3641                 if (groupStartIdx < 0) {
  3642                     // This capture group has not participated in the match thus far,
  3643                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
  3644                     break;
  3646                 UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx);
  3647                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
  3649                 //   Note: if the capture group match was of an empty string the backref
  3650                 //         match succeeds.  Verified by testing:  Perl matches succeed 
  3651                 //         in this case, so we do too.
  3653                 UBool success = TRUE;
  3654                 for (;;) {
  3655                     if (utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
  3656                         success = TRUE;
  3657                         break;
  3659                     if (utext_getNativeIndex(fInputText) >= fActiveLimit) {
  3660                         success = FALSE;
  3661                         fHitEnd = TRUE;
  3662                         break;
  3664                     UChar32 captureGroupChar = utext_next32(fAltInputText);
  3665                     UChar32 inputChar = utext_next32(fInputText);
  3666                     if (inputChar != captureGroupChar) {
  3667                         success = FALSE;
  3668                         break;
  3672                 if (success) {
  3673                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
  3674                 } else {
  3675                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  3678             break;
  3682         case URX_BACKREF_I:
  3684                 U_ASSERT(opValue < fFrameSize);
  3685                 int64_t groupStartIdx = fp->fExtra[opValue];
  3686                 int64_t groupEndIdx   = fp->fExtra[opValue+1];
  3687                 U_ASSERT(groupStartIdx <= groupEndIdx);
  3688                 if (groupStartIdx < 0) {
  3689                     // This capture group has not participated in the match thus far,
  3690                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
  3691                     break;
  3693                 utext_setNativeIndex(fAltInputText, groupStartIdx);
  3694                 utext_setNativeIndex(fInputText, fp->fInputIdx);
  3695                 CaseFoldingUTextIterator captureGroupItr(*fAltInputText);
  3696                 CaseFoldingUTextIterator inputItr(*fInputText);
  3698                 //   Note: if the capture group match was of an empty string the backref
  3699                 //         match succeeds.  Verified by testing:  Perl matches succeed 
  3700                 //         in this case, so we do too.
  3702                 UBool success = TRUE;
  3703                 for (;;) {
  3704                     if (!captureGroupItr.inExpansion() && utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
  3705                         success = TRUE;
  3706                         break;
  3708                     if (!inputItr.inExpansion() && utext_getNativeIndex(fInputText) >= fActiveLimit) {
  3709                         success = FALSE;
  3710                         fHitEnd = TRUE;
  3711                         break;
  3713                     UChar32 captureGroupChar = captureGroupItr.next();
  3714                     UChar32 inputChar = inputItr.next();
  3715                     if (inputChar != captureGroupChar) {
  3716                         success = FALSE;
  3717                         break;
  3721                 if (success && inputItr.inExpansion()) {
  3722                     // We otained a match by consuming part of a string obtained from 
  3723                     // case-folding a single code point of the input text.  
  3724                     // This does not count as an overall match.
  3725                     success = FALSE;
  3728                 if (success) {
  3729                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
  3730                 } else {
  3731                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  3735             break;
  3737         case URX_STO_INP_LOC:
  3739                 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
  3740                 fp->fExtra[opValue] = fp->fInputIdx;
  3742             break;
  3744         case URX_JMPX:
  3746                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
  3747                 fp->fPatIdx += 1;
  3748                 int32_t dataLoc  = URX_VAL(pat[instrOperandLoc]);
  3749                 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
  3750                 int64_t savedInputIdx = fp->fExtra[dataLoc];
  3751                 U_ASSERT(savedInputIdx <= fp->fInputIdx);
  3752                 if (savedInputIdx < fp->fInputIdx) {
  3753                     fp->fPatIdx = opValue;                               // JMP
  3754                 } else {
  3755                      fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no progress in loop.
  3758             break;
  3760         case URX_LA_START:
  3762                 // Entering a lookahead block.
  3763                 // Save Stack Ptr, Input Pos.
  3764                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
  3765                 fData[opValue]   = fStack->size();
  3766                 fData[opValue+1] = fp->fInputIdx;
  3767                 fActiveStart     = fLookStart;          // Set the match region change for
  3768                 fActiveLimit     = fLookLimit;          //   transparent bounds.
  3770             break;
  3772         case URX_LA_END:
  3774                 // Leaving a look-ahead block.
  3775                 //  restore Stack Ptr, Input Pos to positions they had on entry to block.
  3776                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
  3777                 int32_t stackSize = fStack->size();
  3778                 int32_t newStackSize =(int32_t)fData[opValue];
  3779                 U_ASSERT(stackSize >= newStackSize);
  3780                 if (stackSize > newStackSize) {
  3781                     // Copy the current top frame back to the new (cut back) top frame.
  3782                     //   This makes the capture groups from within the look-ahead
  3783                     //   expression available.
  3784                     int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
  3785                     int32_t i;
  3786                     for (i=0; i<fFrameSize; i++) {
  3787                         newFP[i] = ((int64_t *)fp)[i];
  3789                     fp = (REStackFrame *)newFP;
  3790                     fStack->setSize(newStackSize);
  3792                 fp->fInputIdx = fData[opValue+1];
  3794                 // Restore the active region bounds in the input string; they may have
  3795                 //    been changed because of transparent bounds on a Region.
  3796                 fActiveStart = fRegionStart;
  3797                 fActiveLimit = fRegionLimit;
  3799             break;
  3801         case URX_ONECHAR_I:
  3802             // Case insensitive one char.  The char from the pattern is already case folded.
  3803             // Input text is not, but case folding the input can not reduce two or more code
  3804             // points to one.
  3805             if (fp->fInputIdx < fActiveLimit) {
  3806                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
  3808                 UChar32 c = UTEXT_NEXT32(fInputText);
  3809                 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
  3810                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
  3811                     break;
  3813             } else {
  3814                 fHitEnd = TRUE;
  3817             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  3818             break;
  3820         case URX_STRING_I:
  3822                 // Case-insensitive test input against a literal string.
  3823                 // Strings require two slots in the compiled pattern, one for the
  3824                 //   offset to the string text, and one for the length.
  3825                 //   The compiled string has already been case folded.
  3827                     const UChar *patternString = litText + opValue;
  3828                     int32_t      patternStringIdx  = 0;
  3830                     op      = (int32_t)pat[fp->fPatIdx];
  3831                     fp->fPatIdx++;
  3832                     opType  = URX_TYPE(op);
  3833                     opValue = URX_VAL(op);
  3834                     U_ASSERT(opType == URX_STRING_LEN);
  3835                     int32_t patternStringLen = opValue;  // Length of the string from the pattern.
  3838                     UChar32   cPattern;
  3839                     UChar32   cText;
  3840                     UBool     success = TRUE;
  3842                     UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
  3843                     CaseFoldingUTextIterator inputIterator(*fInputText);
  3844                     while (patternStringIdx < patternStringLen) {
  3845                         if (!inputIterator.inExpansion() && UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
  3846                             success = FALSE;
  3847                             fHitEnd = TRUE;
  3848                             break;
  3850                         U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
  3851                         cText = inputIterator.next();
  3852                         if (cText != cPattern) {
  3853                             success = FALSE;
  3854                             break;
  3857                     if (inputIterator.inExpansion()) {
  3858                         success = FALSE;
  3861                     if (success) {
  3862                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
  3863                     } else {
  3864                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  3868             break;
  3870         case URX_LB_START:
  3872                 // Entering a look-behind block.
  3873                 // Save Stack Ptr, Input Pos.
  3874                 //   TODO:  implement transparent bounds.  Ticket #6067
  3875                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
  3876                 fData[opValue]   = fStack->size();
  3877                 fData[opValue+1] = fp->fInputIdx;
  3878                 // Init the variable containing the start index for attempted matches.
  3879                 fData[opValue+2] = -1;
  3880                 // Save input string length, then reset to pin any matches to end at
  3881                 //   the current position.
  3882                 fData[opValue+3] = fActiveLimit;
  3883                 fActiveLimit     = fp->fInputIdx;
  3885             break;
  3888         case URX_LB_CONT:
  3890                 // Positive Look-Behind, at top of loop checking for matches of LB expression
  3891                 //    at all possible input starting positions.
  3893                 // Fetch the min and max possible match lengths.  They are the operands
  3894                 //   of this op in the pattern.
  3895                 int32_t minML = (int32_t)pat[fp->fPatIdx++];
  3896                 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
  3897                 U_ASSERT(minML <= maxML);
  3898                 U_ASSERT(minML >= 0);
  3900                 // Fetch (from data) the last input index where a match was attempted.
  3901                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
  3902                 int64_t  *lbStartIdx = &fData[opValue+2];
  3903                 if (*lbStartIdx < 0) {
  3904                     // First time through loop.
  3905                     *lbStartIdx = fp->fInputIdx - minML;
  3906                 } else {
  3907                     // 2nd through nth time through the loop.
  3908                     // Back up start position for match by one.
  3909                     if (*lbStartIdx == 0) {
  3910                         (*lbStartIdx)--;
  3911                     } else {
  3912                         UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx);
  3913                         (void)UTEXT_PREVIOUS32(fInputText);
  3914                         *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
  3918                 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
  3919                     // We have tried all potential match starting points without
  3920                     //  getting a match.  Backtrack out, and out of the
  3921                     //   Look Behind altogether.
  3922                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  3923                     int64_t restoreInputLen = fData[opValue+3];
  3924                     U_ASSERT(restoreInputLen >= fActiveLimit);
  3925                     U_ASSERT(restoreInputLen <= fInputLength);
  3926                     fActiveLimit = restoreInputLen;
  3927                     break;
  3930                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
  3931                 //      (successful match will fall off the end of the loop.)
  3932                 fp = StateSave(fp, fp->fPatIdx-3, status);
  3933                 fp->fInputIdx = *lbStartIdx;
  3935             break;
  3937         case URX_LB_END:
  3938             // End of a look-behind block, after a successful match.
  3940                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
  3941                 if (fp->fInputIdx != fActiveLimit) {
  3942                     //  The look-behind expression matched, but the match did not
  3943                     //    extend all the way to the point that we are looking behind from.
  3944                     //  FAIL out of here, which will take us back to the LB_CONT, which
  3945                     //     will retry the match starting at another position or fail
  3946                     //     the look-behind altogether, whichever is appropriate.
  3947                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  3948                     break;
  3951                 // Look-behind match is good.  Restore the orignal input string length,
  3952                 //   which had been truncated to pin the end of the lookbehind match to the 
  3953                 //   position being looked-behind.
  3954                 int64_t originalInputLen = fData[opValue+3];
  3955                 U_ASSERT(originalInputLen >= fActiveLimit);
  3956                 U_ASSERT(originalInputLen <= fInputLength);
  3957                 fActiveLimit = originalInputLen;
  3959             break;
  3962         case URX_LBN_CONT:
  3964                 // Negative Look-Behind, at top of loop checking for matches of LB expression
  3965                 //    at all possible input starting positions.
  3967                 // Fetch the extra parameters of this op.
  3968                 int32_t minML       = (int32_t)pat[fp->fPatIdx++];
  3969                 int32_t maxML       = (int32_t)pat[fp->fPatIdx++];
  3970                 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
  3971                         continueLoc = URX_VAL(continueLoc);
  3972                 U_ASSERT(minML <= maxML);
  3973                 U_ASSERT(minML >= 0);
  3974                 U_ASSERT(continueLoc > fp->fPatIdx);
  3976                 // Fetch (from data) the last input index where a match was attempted.
  3977                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
  3978                 int64_t  *lbStartIdx = &fData[opValue+2];
  3979                 if (*lbStartIdx < 0) {
  3980                     // First time through loop.
  3981                     *lbStartIdx = fp->fInputIdx - minML;
  3982                 } else {
  3983                     // 2nd through nth time through the loop.
  3984                     // Back up start position for match by one.
  3985                     if (*lbStartIdx == 0) {
  3986                         (*lbStartIdx)--;
  3987                     } else {
  3988                         UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx);
  3989                         (void)UTEXT_PREVIOUS32(fInputText);
  3990                         *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
  3994                 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
  3995                     // We have tried all potential match starting points without
  3996                     //  getting a match, which means that the negative lookbehind as
  3997                     //  a whole has succeeded.  Jump forward to the continue location
  3998                     int64_t restoreInputLen = fData[opValue+3];
  3999                     U_ASSERT(restoreInputLen >= fActiveLimit);
  4000                     U_ASSERT(restoreInputLen <= fInputLength);
  4001                     fActiveLimit = restoreInputLen;
  4002                     fp->fPatIdx = continueLoc;
  4003                     break;
  4006                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
  4007                 //      (successful match will cause a FAIL out of the loop altogether.)
  4008                 fp = StateSave(fp, fp->fPatIdx-4, status);
  4009                 fp->fInputIdx = *lbStartIdx;
  4011             break;
  4013         case URX_LBN_END:
  4014             // End of a negative look-behind block, after a successful match.
  4016                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
  4017                 if (fp->fInputIdx != fActiveLimit) {
  4018                     //  The look-behind expression matched, but the match did not
  4019                     //    extend all the way to the point that we are looking behind from.
  4020                     //  FAIL out of here, which will take us back to the LB_CONT, which
  4021                     //     will retry the match starting at another position or succeed
  4022                     //     the look-behind altogether, whichever is appropriate.
  4023                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  4024                     break;
  4027                 // Look-behind expression matched, which means look-behind test as
  4028                 //   a whole Fails
  4030                 //   Restore the orignal input string length, which had been truncated 
  4031                 //   inorder to pin the end of the lookbehind match  
  4032                 //   to the position being looked-behind.
  4033                 int64_t originalInputLen = fData[opValue+3];
  4034                 U_ASSERT(originalInputLen >= fActiveLimit);
  4035                 U_ASSERT(originalInputLen <= fInputLength);
  4036                 fActiveLimit = originalInputLen;
  4038                 // Restore original stack position, discarding any state saved
  4039                 //   by the successful pattern match.
  4040                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
  4041                 int32_t newStackSize = (int32_t)fData[opValue];
  4042                 U_ASSERT(fStack->size() > newStackSize);
  4043                 fStack->setSize(newStackSize);
  4045                 //  FAIL, which will take control back to someplace 
  4046                 //  prior to entering the look-behind test.
  4047                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  4049             break;
  4052         case URX_LOOP_SR_I:
  4053             // Loop Initialization for the optimized implementation of
  4054             //     [some character set]*
  4055             //   This op scans through all matching input.
  4056             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
  4058                 U_ASSERT(opValue > 0 && opValue < sets->size());
  4059                 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
  4060                 UnicodeSet   *s  = (UnicodeSet *)sets->elementAt(opValue);
  4062                 // Loop through input, until either the input is exhausted or
  4063                 //   we reach a character that is not a member of the set.
  4064                 int64_t ix = fp->fInputIdx;
  4065                 UTEXT_SETNATIVEINDEX(fInputText, ix);
  4066                 for (;;) {
  4067                     if (ix >= fActiveLimit) {
  4068                         fHitEnd = TRUE;
  4069                         break;
  4071                     UChar32 c = UTEXT_NEXT32(fInputText);
  4072                     if (c<256) {
  4073                         if (s8->contains(c) == FALSE) {
  4074                             break;
  4076                     } else {
  4077                         if (s->contains(c) == FALSE) {
  4078                             break;
  4081                     ix = UTEXT_GETNATIVEINDEX(fInputText);
  4084                 // If there were no matching characters, skip over the loop altogether.
  4085                 //   The loop doesn't run at all, a * op always succeeds.
  4086                 if (ix == fp->fInputIdx) {
  4087                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
  4088                     break;
  4091                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
  4092                 //   must follow.  It's operand is the stack location
  4093                 //   that holds the starting input index for the match of this [set]*
  4094                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
  4095                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
  4096                 int32_t stackLoc = URX_VAL(loopcOp);
  4097                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
  4098                 fp->fExtra[stackLoc] = fp->fInputIdx;
  4099                 fp->fInputIdx = ix;
  4101                 // Save State to the URX_LOOP_C op that follows this one,
  4102                 //   so that match failures in the following code will return to there.
  4103                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
  4104                 fp = StateSave(fp, fp->fPatIdx, status);
  4105                 fp->fPatIdx++;
  4107             break;
  4110         case URX_LOOP_DOT_I:
  4111             // Loop Initialization for the optimized implementation of .*
  4112             //   This op scans through all remaining input.
  4113             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
  4115                 // Loop through input until the input is exhausted (we reach an end-of-line)
  4116                 // In DOTALL mode, we can just go straight to the end of the input.
  4117                 int64_t ix;
  4118                 if ((opValue & 1) == 1) {
  4119                     // Dot-matches-All mode.  Jump straight to the end of the string.
  4120                     ix = fActiveLimit;
  4121                     fHitEnd = TRUE;
  4122                 } else {
  4123                     // NOT DOT ALL mode.  Line endings do not match '.'
  4124                     // Scan forward until a line ending or end of input.
  4125                     ix = fp->fInputIdx;
  4126                     UTEXT_SETNATIVEINDEX(fInputText, ix);
  4127                     for (;;) {
  4128                         if (ix >= fActiveLimit) {
  4129                             fHitEnd = TRUE;
  4130                             break;
  4132                         UChar32 c = UTEXT_NEXT32(fInputText);
  4133                         if ((c & 0x7f) <= 0x29) {          // Fast filter of non-new-line-s
  4134                             if ((c == 0x0a) ||             //  0x0a is newline in both modes.
  4135                                (((opValue & 2) == 0) &&    // IF not UNIX_LINES mode
  4136                                     (c<=0x0d && c>=0x0a)) || c==0x85 ||c==0x2028 || c==0x2029) {
  4137                                 //  char is a line ending.  Exit the scanning loop.
  4138                                 break;
  4141                         ix = UTEXT_GETNATIVEINDEX(fInputText);
  4145                 // If there were no matching characters, skip over the loop altogether.
  4146                 //   The loop doesn't run at all, a * op always succeeds.
  4147                 if (ix == fp->fInputIdx) {
  4148                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
  4149                     break;
  4152                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
  4153                 //   must follow.  It's operand is the stack location
  4154                 //   that holds the starting input index for the match of this .*
  4155                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
  4156                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
  4157                 int32_t stackLoc = URX_VAL(loopcOp);
  4158                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
  4159                 fp->fExtra[stackLoc] = fp->fInputIdx;
  4160                 fp->fInputIdx = ix;
  4162                 // Save State to the URX_LOOP_C op that follows this one,
  4163                 //   so that match failures in the following code will return to there.
  4164                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
  4165                 fp = StateSave(fp, fp->fPatIdx, status);
  4166                 fp->fPatIdx++;
  4168             break;
  4171         case URX_LOOP_C:
  4173                 U_ASSERT(opValue>=0 && opValue<fFrameSize);
  4174                 backSearchIndex = fp->fExtra[opValue];
  4175                 U_ASSERT(backSearchIndex <= fp->fInputIdx);
  4176                 if (backSearchIndex == fp->fInputIdx) {
  4177                     // We've backed up the input idx to the point that the loop started.
  4178                     // The loop is done.  Leave here without saving state.   
  4179                     //  Subsequent failures won't come back here.
  4180                     break;
  4182                 // Set up for the next iteration of the loop, with input index
  4183                 //   backed up by one from the last time through,
  4184                 //   and a state save to this instruction in case the following code fails again.
  4185                 //   (We're going backwards because this loop emulates stack unwinding, not
  4186                 //    the initial scan forward.)
  4187                 U_ASSERT(fp->fInputIdx > 0);
  4188                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
  4189                 UChar32 prevC = UTEXT_PREVIOUS32(fInputText);
  4190                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
  4192                 UChar32 twoPrevC = UTEXT_PREVIOUS32(fInputText);
  4193                 if (prevC == 0x0a && 
  4194                     fp->fInputIdx > backSearchIndex &&
  4195                     twoPrevC == 0x0d) {
  4196                     int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
  4197                     if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
  4198                         // .*, stepping back over CRLF pair.
  4199                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
  4204                 fp = StateSave(fp, fp->fPatIdx-1, status);
  4206             break;
  4210         default:
  4211             // Trouble.  The compiled pattern contains an entry with an
  4212             //           unrecognized type tag.
  4213             U_ASSERT(FALSE);
  4216         if (U_FAILURE(status)) {
  4217             isMatch = FALSE;
  4218             break;
  4222 breakFromLoop:
  4223     fMatch = isMatch;
  4224     if (isMatch) {
  4225         fLastMatchEnd = fMatchEnd;
  4226         fMatchStart   = startIdx;
  4227         fMatchEnd     = fp->fInputIdx;
  4228         if (fTraceDebug) {
  4229             REGEX_RUN_DEBUG_PRINTF(("Match.  start=%ld   end=%ld\n\n", fMatchStart, fMatchEnd));
  4232     else
  4234         if (fTraceDebug) {
  4235             REGEX_RUN_DEBUG_PRINTF(("No match\n\n"));
  4239     fFrame = fp;                // The active stack frame when the engine stopped.
  4240                                 //   Contains the capture group results that we need to
  4241                                 //    access later.
  4242     return;
  4246 //--------------------------------------------------------------------------------
  4247 //
  4248 //   MatchChunkAt   This is the actual matching engine. Like MatchAt, but with the
  4249 //                  assumption that the entire string is available in the UText's
  4250 //                  chunk buffer. For now, that means we can use int32_t indexes,
  4251 //                  except for anything that needs to be saved (like group starts
  4252 //                  and ends).
  4253 //
  4254 //                  startIdx:    begin matching a this index.
  4255 //                  toEnd:       if true, match must extend to end of the input region
  4256 //
  4257 //--------------------------------------------------------------------------------
  4258 void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
  4259     UBool       isMatch  = FALSE;      // True if the we have a match.
  4261     int32_t     backSearchIndex = INT32_MAX; // used after greedy single-character matches for searching backwards
  4263     int32_t     op;                    // Operation from the compiled pattern, split into
  4264     int32_t     opType;                //    the opcode
  4265     int32_t     opValue;               //    and the operand value.
  4267 #ifdef REGEX_RUN_DEBUG
  4268     if (fTraceDebug)
  4270         printf("MatchAt(startIdx=%d)\n", startIdx);
  4271         printf("Original Pattern: ");
  4272         UChar32 c = utext_next32From(fPattern->fPattern, 0);
  4273         while (c != U_SENTINEL) {
  4274             if (c<32 || c>256) {
  4275                 c = '.';
  4277             REGEX_DUMP_DEBUG_PRINTF(("%c", c));
  4279             c = UTEXT_NEXT32(fPattern->fPattern);
  4281         printf("\n");
  4282         printf("Input String: ");
  4283         c = utext_next32From(fInputText, 0);
  4284         while (c != U_SENTINEL) {
  4285             if (c<32 || c>256) {
  4286                 c = '.';
  4288             printf("%c", c);
  4290             c = UTEXT_NEXT32(fInputText);
  4292         printf("\n");
  4293         printf("\n");
  4295 #endif
  4297     if (U_FAILURE(status)) {
  4298         return;
  4301     //  Cache frequently referenced items from the compiled pattern
  4302     //
  4303     int64_t             *pat           = fPattern->fCompiledPat->getBuffer();
  4305     const UChar         *litText       = fPattern->fLiteralText.getBuffer();
  4306     UVector             *sets          = fPattern->fSets;
  4308     const UChar         *inputBuf      = fInputText->chunkContents;
  4310     fFrameSize = fPattern->fFrameSize;
  4311     REStackFrame        *fp            = resetStack();
  4313     fp->fPatIdx   = 0;
  4314     fp->fInputIdx = startIdx;
  4316     // Zero out the pattern's static data
  4317     int32_t i;
  4318     for (i = 0; i<fPattern->fDataSize; i++) {
  4319         fData[i] = 0;
  4322     //
  4323     //  Main loop for interpreting the compiled pattern.
  4324     //  One iteration of the loop per pattern operation performed.
  4325     //
  4326     for (;;) {
  4327 #if 0
  4328         if (_heapchk() != _HEAPOK) {
  4329             fprintf(stderr, "Heap Trouble\n");
  4331 #endif
  4333         op      = (int32_t)pat[fp->fPatIdx];
  4334         opType  = URX_TYPE(op);
  4335         opValue = URX_VAL(op);
  4336 #ifdef REGEX_RUN_DEBUG
  4337         if (fTraceDebug) {
  4338             UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
  4339             printf("inputIdx=%ld   inputChar=%x   sp=%3ld   activeLimit=%ld  ", fp->fInputIdx,
  4340                    UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
  4341             fPattern->dumpOp(fp->fPatIdx);
  4343 #endif
  4344         fp->fPatIdx++;
  4346         switch (opType) {
  4349         case URX_NOP:
  4350             break;
  4353         case URX_BACKTRACK:
  4354             // Force a backtrack.  In some circumstances, the pattern compiler
  4355             //   will notice that the pattern can't possibly match anything, and will
  4356             //   emit one of these at that point.
  4357             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  4358             break;
  4361         case URX_ONECHAR:
  4362             if (fp->fInputIdx < fActiveLimit) {
  4363                 UChar32 c;
  4364                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
  4365                 if (c == opValue) {
  4366                     break;
  4368             } else {
  4369                 fHitEnd = TRUE;
  4371             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  4372             break;
  4375         case URX_STRING:
  4377                 // Test input against a literal string.
  4378                 // Strings require two slots in the compiled pattern, one for the
  4379                 //   offset to the string text, and one for the length.
  4380                 int32_t   stringStartIdx = opValue;
  4381                 int32_t   stringLen;
  4383                 op      = (int32_t)pat[fp->fPatIdx];     // Fetch the second operand
  4384                 fp->fPatIdx++;
  4385                 opType    = URX_TYPE(op);
  4386                 stringLen = URX_VAL(op);
  4387                 U_ASSERT(opType == URX_STRING_LEN);
  4388                 U_ASSERT(stringLen >= 2);
  4390                 const UChar * pInp = inputBuf + fp->fInputIdx;
  4391                 const UChar * pInpLimit = inputBuf + fActiveLimit;
  4392                 const UChar * pPat = litText+stringStartIdx;
  4393                 const UChar * pEnd = pInp + stringLen;
  4394                 UBool success = TRUE;
  4395                 while (pInp < pEnd) {
  4396                     if (pInp >= pInpLimit) {
  4397                         fHitEnd = TRUE;
  4398                         success = FALSE;
  4399                         break;
  4401                     if (*pInp++ != *pPat++) {
  4402                         success = FALSE;
  4403                         break;
  4407                 if (success) {
  4408                     fp->fInputIdx += stringLen;
  4409                 } else {
  4410                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  4413             break;
  4416         case URX_STATE_SAVE:
  4417             fp = StateSave(fp, opValue, status);
  4418             break;
  4421         case URX_END:
  4422             // The match loop will exit via this path on a successful match,
  4423             //   when we reach the end of the pattern.
  4424             if (toEnd && fp->fInputIdx != fActiveLimit) {
  4425                 // The pattern matched, but not to the end of input.  Try some more.
  4426                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  4427                 break;
  4429             isMatch = TRUE;
  4430             goto  breakFromLoop;
  4432             // Start and End Capture stack frame variables are laid out out like this:
  4433             //  fp->fExtra[opValue]  - The start of a completed capture group
  4434             //             opValue+1 - The end   of a completed capture group
  4435             //             opValue+2 - the start of a capture group whose end
  4436             //                          has not yet been reached (and might not ever be).
  4437         case URX_START_CAPTURE:
  4438             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
  4439             fp->fExtra[opValue+2] = fp->fInputIdx;
  4440             break;
  4443         case URX_END_CAPTURE:
  4444             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
  4445             U_ASSERT(fp->fExtra[opValue+2] >= 0);            // Start pos for this group must be set.
  4446             fp->fExtra[opValue]   = fp->fExtra[opValue+2];   // Tentative start becomes real.
  4447             fp->fExtra[opValue+1] = fp->fInputIdx;           // End position
  4448             U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
  4449             break;
  4452         case URX_DOLLAR:                   //  $, test for End of line
  4453             //     or for position before new line at end of input
  4454             if (fp->fInputIdx < fAnchorLimit-2) {
  4455                 // We are no where near the end of input.  Fail.
  4456                 //   This is the common case.  Keep it first.
  4457                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  4458                 break;
  4460             if (fp->fInputIdx >= fAnchorLimit) {
  4461                 // We really are at the end of input.  Success.
  4462                 fHitEnd = TRUE;
  4463                 fRequireEnd = TRUE;
  4464                 break;
  4467             // If we are positioned just before a new-line that is located at the
  4468             //   end of input, succeed.
  4469             if (fp->fInputIdx == fAnchorLimit-1) {
  4470                 UChar32 c;
  4471                 U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c);
  4473                 if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) {
  4474                     if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
  4475                         // At new-line at end of input. Success
  4476                         fHitEnd = TRUE;
  4477                         fRequireEnd = TRUE;
  4478                         break;
  4481             } else if (fp->fInputIdx == fAnchorLimit-2 &&
  4482                 inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a) {
  4483                     fHitEnd = TRUE;
  4484                     fRequireEnd = TRUE;
  4485                     break;                         // At CR/LF at end of input.  Success
  4488             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  4490             break;
  4493         case URX_DOLLAR_D:                   //  $, test for End of Line, in UNIX_LINES mode.
  4494             if (fp->fInputIdx >= fAnchorLimit-1) {
  4495                 // Either at the last character of input, or off the end.
  4496                 if (fp->fInputIdx == fAnchorLimit-1) {
  4497                     // At last char of input.  Success if it's a new line.
  4498                     if (inputBuf[fp->fInputIdx] == 0x0a) {
  4499                         fHitEnd = TRUE;
  4500                         fRequireEnd = TRUE;
  4501                         break;
  4503                 } else {
  4504                     // Off the end of input.  Success.
  4505                     fHitEnd = TRUE;
  4506                     fRequireEnd = TRUE;
  4507                     break;
  4511             // Not at end of input.  Back-track out.
  4512             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  4513             break;
  4516         case URX_DOLLAR_M:                //  $, test for End of line in multi-line mode
  4518                 if (fp->fInputIdx >= fAnchorLimit) {
  4519                     // We really are at the end of input.  Success.
  4520                     fHitEnd = TRUE;
  4521                     fRequireEnd = TRUE;
  4522                     break;
  4524                 // If we are positioned just before a new-line, succeed.
  4525                 // It makes no difference where the new-line is within the input.
  4526                 UChar32 c = inputBuf[fp->fInputIdx];
  4527                 if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) {
  4528                     // At a line end, except for the odd chance of  being in the middle of a CR/LF sequence
  4529                     //  In multi-line mode, hitting a new-line just before the end of input does not
  4530                     //   set the hitEnd or requireEnd flags
  4531                     if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
  4532                         break;
  4535                 // not at a new line.  Fail.
  4536                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  4538             break;
  4541         case URX_DOLLAR_MD:                //  $, test for End of line in multi-line and UNIX_LINES mode
  4543                 if (fp->fInputIdx >= fAnchorLimit) {
  4544                     // We really are at the end of input.  Success.
  4545                     fHitEnd = TRUE;
  4546                     fRequireEnd = TRUE;  // Java set requireEnd in this case, even though
  4547                     break;               //   adding a new-line would not lose the match.
  4549                 // If we are not positioned just before a new-line, the test fails; backtrack out.
  4550                 // It makes no difference where the new-line is within the input.
  4551                 if (inputBuf[fp->fInputIdx] != 0x0a) {
  4552                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  4555             break;
  4558         case URX_CARET:                    //  ^, test for start of line
  4559             if (fp->fInputIdx != fAnchorStart) {
  4560                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  4562             break;
  4565         case URX_CARET_M:                   //  ^, test for start of line in mulit-line mode
  4567                 if (fp->fInputIdx == fAnchorStart) {
  4568                     // We are at the start input.  Success.
  4569                     break;
  4571                 // Check whether character just before the current pos is a new-line
  4572                 //   unless we are at the end of input
  4573                 UChar  c = inputBuf[fp->fInputIdx - 1]; 
  4574                 if ((fp->fInputIdx < fAnchorLimit) && 
  4575                     ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
  4576                     //  It's a new-line.  ^ is true.  Success.
  4577                     //  TODO:  what should be done with positions between a CR and LF?
  4578                     break;
  4580                 // Not at the start of a line.  Fail.
  4581                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  4583             break;
  4586         case URX_CARET_M_UNIX:       //  ^, test for start of line in mulit-line + Unix-line mode
  4588                 U_ASSERT(fp->fInputIdx >= fAnchorStart);
  4589                 if (fp->fInputIdx <= fAnchorStart) {
  4590                     // We are at the start input.  Success.
  4591                     break;
  4593                 // Check whether character just before the current pos is a new-line
  4594                 U_ASSERT(fp->fInputIdx <= fAnchorLimit);
  4595                 UChar  c = inputBuf[fp->fInputIdx - 1]; 
  4596                 if (c != 0x0a) {
  4597                     // Not at the start of a line.  Back-track out.
  4598                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  4601             break;
  4603         case URX_BACKSLASH_B:          // Test for word boundaries
  4605                 UBool success = isChunkWordBoundary((int32_t)fp->fInputIdx);
  4606                 success ^= (UBool)(opValue != 0);     // flip sense for \B
  4607                 if (!success) {
  4608                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  4611             break;
  4614         case URX_BACKSLASH_BU:          // Test for word boundaries, Unicode-style
  4616                 UBool success = isUWordBoundary(fp->fInputIdx);
  4617                 success ^= (UBool)(opValue != 0);     // flip sense for \B
  4618                 if (!success) {
  4619                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  4622             break;
  4625         case URX_BACKSLASH_D:            // Test for decimal digit
  4627                 if (fp->fInputIdx >= fActiveLimit) {
  4628                     fHitEnd = TRUE;
  4629                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  4630                     break;
  4633                 UChar32 c;
  4634                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
  4635                 int8_t ctype = u_charType(c);     // TODO:  make a unicode set for this.  Will be faster.
  4636                 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
  4637                 success ^= (UBool)(opValue != 0);        // flip sense for \D
  4638                 if (!success) {
  4639                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  4642             break;
  4645         case URX_BACKSLASH_G:          // Test for position at end of previous match
  4646             if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
  4647                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  4649             break;
  4652         case URX_BACKSLASH_X:     
  4653         //  Match a Grapheme, as defined by Unicode TR 29.
  4654         //  Differs slightly from Perl, which consumes combining marks independently
  4655         //    of context.
  4658             // Fail if at end of input
  4659             if (fp->fInputIdx >= fActiveLimit) {
  4660                 fHitEnd = TRUE;
  4661                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  4662                 break;
  4665             // Examine (and consume) the current char.
  4666             //   Dispatch into a little state machine, based on the char.
  4667             UChar32  c;
  4668             U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
  4669             UnicodeSet **sets = fPattern->fStaticSets;
  4670             if (sets[URX_GC_NORMAL]->contains(c))  goto GC_Extend;
  4671             if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
  4672             if (sets[URX_GC_L]->contains(c))       goto GC_L;
  4673             if (sets[URX_GC_LV]->contains(c))      goto GC_V;
  4674             if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
  4675             if (sets[URX_GC_V]->contains(c))       goto GC_V;
  4676             if (sets[URX_GC_T]->contains(c))       goto GC_T;
  4677             goto GC_Extend;
  4681 GC_L:
  4682             if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
  4683             U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
  4684             if (sets[URX_GC_L]->contains(c))       goto GC_L;
  4685             if (sets[URX_GC_LV]->contains(c))      goto GC_V;
  4686             if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
  4687             if (sets[URX_GC_V]->contains(c))       goto GC_V;
  4688             U16_PREV(inputBuf, 0, fp->fInputIdx, c);
  4689             goto GC_Extend;
  4691 GC_V:
  4692             if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
  4693             U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
  4694             if (sets[URX_GC_V]->contains(c))       goto GC_V;
  4695             if (sets[URX_GC_T]->contains(c))       goto GC_T;
  4696             U16_PREV(inputBuf, 0, fp->fInputIdx, c);
  4697             goto GC_Extend;
  4699 GC_T:
  4700             if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
  4701             U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
  4702             if (sets[URX_GC_T]->contains(c))       goto GC_T;
  4703             U16_PREV(inputBuf, 0, fp->fInputIdx, c);
  4704             goto GC_Extend;
  4706 GC_Extend:
  4707             // Combining characters are consumed here
  4708             for (;;) {
  4709                 if (fp->fInputIdx >= fActiveLimit) {
  4710                     break;
  4712                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
  4713                 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
  4714                     U16_BACK_1(inputBuf, 0, fp->fInputIdx);
  4715                     break;
  4718             goto GC_Done;
  4720 GC_Control:
  4721             // Most control chars stand alone (don't combine with combining chars),  
  4722             //   except for that CR/LF sequence is a single grapheme cluster.
  4723             if (c == 0x0d && fp->fInputIdx < fActiveLimit && inputBuf[fp->fInputIdx] == 0x0a) {
  4724                 fp->fInputIdx++;
  4727 GC_Done:
  4728             if (fp->fInputIdx >= fActiveLimit) {
  4729                 fHitEnd = TRUE;
  4731             break;
  4737         case URX_BACKSLASH_Z:          // Test for end of Input
  4738             if (fp->fInputIdx < fAnchorLimit) {
  4739                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  4740             } else {
  4741                 fHitEnd = TRUE;
  4742                 fRequireEnd = TRUE;
  4744             break;
  4748         case URX_STATIC_SETREF:
  4750                 // Test input character against one of the predefined sets
  4751                 //    (Word Characters, for example)
  4752                 // The high bit of the op value is a flag for the match polarity.
  4753                 //    0:   success if input char is in set.
  4754                 //    1:   success if input char is not in set.
  4755                 if (fp->fInputIdx >= fActiveLimit) {
  4756                     fHitEnd = TRUE;
  4757                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  4758                     break;
  4761                 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);  
  4762                 opValue &= ~URX_NEG_SET;
  4763                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
  4765                 UChar32 c;
  4766                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
  4767                 if (c < 256) {
  4768                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
  4769                     if (s8->contains(c)) {
  4770                         success = !success;
  4772                 } else {
  4773                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
  4774                     if (s->contains(c)) {
  4775                         success = !success;
  4778                 if (!success) {
  4779                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  4782             break;
  4785         case URX_STAT_SETREF_N:
  4787                 // Test input character for NOT being a member of  one of 
  4788                 //    the predefined sets (Word Characters, for example)
  4789                 if (fp->fInputIdx >= fActiveLimit) {
  4790                     fHitEnd = TRUE;
  4791                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  4792                     break;
  4795                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
  4797                 UChar32  c;
  4798                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
  4799                 if (c < 256) {
  4800                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
  4801                     if (s8->contains(c) == FALSE) {
  4802                         break;
  4804                 } else {
  4805                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
  4806                     if (s->contains(c) == FALSE) {
  4807                         break;
  4810                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  4812             break;
  4815         case URX_SETREF:
  4817                 if (fp->fInputIdx >= fActiveLimit) {
  4818                     fHitEnd = TRUE;
  4819                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  4820                     break;
  4823                 U_ASSERT(opValue > 0 && opValue < sets->size());
  4825                 // There is input left.  Pick up one char and test it for set membership.
  4826                 UChar32  c;
  4827                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
  4828                 if (c<256) {
  4829                     Regex8BitSet *s8 = &fPattern->fSets8[opValue];
  4830                     if (s8->contains(c)) {
  4831                         // The character is in the set.  A Match.
  4832                         break;
  4834                 } else {
  4835                     UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
  4836                     if (s->contains(c)) {
  4837                         // The character is in the set.  A Match.
  4838                         break;
  4842                 // the character wasn't in the set.
  4843                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  4845             break;
  4848         case URX_DOTANY:
  4850                 // . matches anything, but stops at end-of-line.
  4851                 if (fp->fInputIdx >= fActiveLimit) {
  4852                     // At end of input.  Match failed.  Backtrack out.
  4853                     fHitEnd = TRUE;
  4854                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  4855                     break;
  4858                 // There is input left.  Advance over one char, unless we've hit end-of-line
  4859                 UChar32  c;
  4860                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
  4861                 if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
  4862                     ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
  4863                     // End of line in normal mode.   . does not match.
  4864                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  4865                     break;
  4868             break;
  4871         case URX_DOTANY_ALL:
  4873                 // . in dot-matches-all (including new lines) mode
  4874                 if (fp->fInputIdx >= fActiveLimit) {
  4875                     // At end of input.  Match failed.  Backtrack out.
  4876                     fHitEnd = TRUE;
  4877                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  4878                     break;
  4881                 // There is input left.  Advance over one char, except if we are
  4882                 //   at a cr/lf, advance over both of them.
  4883                 UChar32 c; 
  4884                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
  4885                 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
  4886                     // In the case of a CR/LF, we need to advance over both.
  4887                     if (inputBuf[fp->fInputIdx] == 0x0a) {
  4888                         U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit);
  4892             break;
  4895         case URX_DOTANY_UNIX:
  4897                 // '.' operator, matches all, but stops at end-of-line.
  4898                 //   UNIX_LINES mode, so 0x0a is the only recognized line ending.
  4899                 if (fp->fInputIdx >= fActiveLimit) {
  4900                     // At end of input.  Match failed.  Backtrack out.
  4901                     fHitEnd = TRUE;
  4902                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  4903                     break;
  4906                 // There is input left.  Advance over one char, unless we've hit end-of-line
  4907                 UChar32 c; 
  4908                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
  4909                 if (c == 0x0a) {
  4910                     // End of line in normal mode.   '.' does not match the \n
  4911                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  4914             break;
  4917         case URX_JMP:
  4918             fp->fPatIdx = opValue;
  4919             break;
  4921         case URX_FAIL:
  4922             isMatch = FALSE;
  4923             goto breakFromLoop;
  4925         case URX_JMP_SAV:
  4926             U_ASSERT(opValue < fPattern->fCompiledPat->size());
  4927             fp = StateSave(fp, fp->fPatIdx, status);       // State save to loc following current
  4928             fp->fPatIdx = opValue;                         // Then JMP.
  4929             break;
  4931         case URX_JMP_SAV_X:
  4932             // This opcode is used with (x)+, when x can match a zero length string.
  4933             // Same as JMP_SAV, except conditional on the match having made forward progress.
  4934             // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
  4935             //   data address of the input position at the start of the loop.
  4937                 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
  4938                 int32_t  stoOp = (int32_t)pat[opValue-1];
  4939                 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
  4940                 int32_t  frameLoc = URX_VAL(stoOp);
  4941                 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
  4942                 int32_t prevInputIdx = (int32_t)fp->fExtra[frameLoc];
  4943                 U_ASSERT(prevInputIdx <= fp->fInputIdx);
  4944                 if (prevInputIdx < fp->fInputIdx) {
  4945                     // The match did make progress.  Repeat the loop.
  4946                     fp = StateSave(fp, fp->fPatIdx, status);  // State save to loc following current
  4947                     fp->fPatIdx = opValue;
  4948                     fp->fExtra[frameLoc] = fp->fInputIdx;
  4950                 // If the input position did not advance, we do nothing here,
  4951                 //   execution will fall out of the loop.
  4953             break;
  4955         case URX_CTR_INIT:
  4957                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
  4958                 fp->fExtra[opValue] = 0;                 //  Set the loop counter variable to zero
  4960                 // Pick up the three extra operands that CTR_INIT has, and
  4961                 //    skip the pattern location counter past 
  4962                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
  4963                 fp->fPatIdx += 3;
  4964                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
  4965                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
  4966                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
  4967                 U_ASSERT(minCount>=0);
  4968                 U_ASSERT(maxCount>=minCount || maxCount==-1);
  4969                 U_ASSERT(loopLoc>=fp->fPatIdx);
  4971                 if (minCount == 0) {
  4972                     fp = StateSave(fp, loopLoc+1, status);
  4974                 if (maxCount == -1) {
  4975                     fp->fExtra[opValue+1] = fp->fInputIdx;   //  For loop breaking.
  4976                 } else if (maxCount == 0) {
  4977                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  4980             break;
  4982         case URX_CTR_LOOP:
  4984                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
  4985                 int32_t initOp = (int32_t)pat[opValue];
  4986                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
  4987                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
  4988                 int32_t minCount  = (int32_t)pat[opValue+2];
  4989                 int32_t maxCount  = (int32_t)pat[opValue+3];
  4990                 (*pCounter)++;
  4991                 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
  4992                     U_ASSERT(*pCounter == maxCount);
  4993                     break;
  4995                 if (*pCounter >= minCount) {
  4996                     if (maxCount == -1) {
  4997                         // Loop has no hard upper bound.
  4998                         // Check that it is progressing through the input, break if it is not.
  4999                         int64_t *pLastInputIdx =  &fp->fExtra[URX_VAL(initOp) + 1];
  5000                         if (fp->fInputIdx == *pLastInputIdx) {
  5001                             break;
  5002                         } else {
  5003                             *pLastInputIdx = fp->fInputIdx;
  5006                     fp = StateSave(fp, fp->fPatIdx, status);
  5008                 fp->fPatIdx = opValue + 4;    // Loop back.
  5010             break;
  5012         case URX_CTR_INIT_NG:
  5014                 // Initialize a non-greedy loop
  5015                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
  5016                 fp->fExtra[opValue] = 0;                 //  Set the loop counter variable to zero
  5018                 // Pick up the three extra operands that CTR_INIT_NG has, and
  5019                 //    skip the pattern location counter past 
  5020                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
  5021                 fp->fPatIdx += 3;
  5022                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
  5023                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
  5024                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
  5025                 U_ASSERT(minCount>=0);
  5026                 U_ASSERT(maxCount>=minCount || maxCount==-1);
  5027                 U_ASSERT(loopLoc>fp->fPatIdx);
  5028                 if (maxCount == -1) {
  5029                     fp->fExtra[opValue+1] = fp->fInputIdx;   //  Save initial input index for loop breaking.
  5032                 if (minCount == 0) {
  5033                     if (maxCount != 0) {
  5034                         fp = StateSave(fp, fp->fPatIdx, status);
  5036                     fp->fPatIdx = loopLoc+1;   // Continue with stuff after repeated block
  5039             break;
  5041         case URX_CTR_LOOP_NG:
  5043                 // Non-greedy {min, max} loops
  5044                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
  5045                 int32_t initOp = (int32_t)pat[opValue];
  5046                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
  5047                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
  5048                 int32_t minCount  = (int32_t)pat[opValue+2];
  5049                 int32_t maxCount  = (int32_t)pat[opValue+3];
  5051                 (*pCounter)++;
  5052                 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
  5053                     // The loop has matched the maximum permitted number of times.
  5054                     //   Break out of here with no action.  Matching will
  5055                     //   continue with the following pattern.
  5056                     U_ASSERT(*pCounter == maxCount);
  5057                     break;
  5060                 if (*pCounter < minCount) {
  5061                     // We haven't met the minimum number of matches yet.
  5062                     //   Loop back for another one.
  5063                     fp->fPatIdx = opValue + 4;    // Loop back.
  5064                 } else {
  5065                     // We do have the minimum number of matches.
  5067                     // If there is no upper bound on the loop iterations, check that the input index
  5068                     // is progressing, and stop the loop if it is not.
  5069                     if (maxCount == -1) {
  5070                         int64_t *pLastInputIdx =  &fp->fExtra[URX_VAL(initOp) + 1];
  5071                         if (fp->fInputIdx == *pLastInputIdx) {
  5072                             break;
  5074                         *pLastInputIdx = fp->fInputIdx;
  5077                     // Loop Continuation: we will fall into the pattern following the loop
  5078                     //   (non-greedy, don't execute loop body first), but first do
  5079                     //   a state save to the top of the loop, so that a match failure
  5080                     //   in the following pattern will try another iteration of the loop.
  5081                     fp = StateSave(fp, opValue + 4, status);
  5084             break;
  5086         case URX_STO_SP:
  5087             U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
  5088             fData[opValue] = fStack->size();
  5089             break;
  5091         case URX_LD_SP:
  5093                 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
  5094                 int32_t newStackSize = (int32_t)fData[opValue];
  5095                 U_ASSERT(newStackSize <= fStack->size());
  5096                 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
  5097                 if (newFP == (int64_t *)fp) {
  5098                     break;
  5100                 int32_t i;
  5101                 for (i=0; i<fFrameSize; i++) {
  5102                     newFP[i] = ((int64_t *)fp)[i];
  5104                 fp = (REStackFrame *)newFP;
  5105                 fStack->setSize(newStackSize);
  5107             break;
  5109         case URX_BACKREF:
  5111                 U_ASSERT(opValue < fFrameSize);
  5112                 int64_t groupStartIdx = fp->fExtra[opValue];
  5113                 int64_t groupEndIdx   = fp->fExtra[opValue+1];
  5114                 U_ASSERT(groupStartIdx <= groupEndIdx);
  5115                 int64_t inputIndex = fp->fInputIdx;
  5116                 if (groupStartIdx < 0) {
  5117                     // This capture group has not participated in the match thus far,
  5118                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
  5119                     break;
  5121                 UBool success = TRUE;
  5122                 for (int64_t groupIndex = groupStartIdx; groupIndex < groupEndIdx; ++groupIndex,++inputIndex) {
  5123                     if (inputIndex >= fActiveLimit) {
  5124                         success = FALSE;
  5125                         fHitEnd = TRUE;
  5126                         break;
  5128                     if (inputBuf[groupIndex] != inputBuf[inputIndex]) {
  5129                         success = FALSE;
  5130                         break;
  5133                 if (success) {
  5134                     fp->fInputIdx = inputIndex;
  5135                 } else {
  5136                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  5139             break;
  5141         case URX_BACKREF_I:
  5143                 U_ASSERT(opValue < fFrameSize);
  5144                 int64_t groupStartIdx = fp->fExtra[opValue];
  5145                 int64_t groupEndIdx   = fp->fExtra[opValue+1];
  5146                 U_ASSERT(groupStartIdx <= groupEndIdx);
  5147                 if (groupStartIdx < 0) {
  5148                     // This capture group has not participated in the match thus far,
  5149                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
  5150                     break;
  5152                 CaseFoldingUCharIterator captureGroupItr(inputBuf, groupStartIdx, groupEndIdx);
  5153                 CaseFoldingUCharIterator inputItr(inputBuf, fp->fInputIdx, fActiveLimit);
  5155                 //   Note: if the capture group match was of an empty string the backref
  5156                 //         match succeeds.  Verified by testing:  Perl matches succeed 
  5157                 //         in this case, so we do too.
  5159                 UBool success = TRUE;
  5160                 for (;;) {
  5161                     UChar32 captureGroupChar = captureGroupItr.next();
  5162                     if (captureGroupChar == U_SENTINEL) {
  5163                         success = TRUE;
  5164                         break;
  5166                     UChar32 inputChar = inputItr.next();
  5167                     if (inputChar == U_SENTINEL) {
  5168                         success = FALSE;
  5169                         fHitEnd = TRUE;
  5170                         break;
  5172                     if (inputChar != captureGroupChar) {
  5173                         success = FALSE;
  5174                         break;
  5178                 if (success && inputItr.inExpansion()) {
  5179                     // We otained a match by consuming part of a string obtained from 
  5180                     // case-folding a single code point of the input text.  
  5181                     // This does not count as an overall match.
  5182                     success = FALSE;
  5185                 if (success) {
  5186                     fp->fInputIdx = inputItr.getIndex();
  5187                 } else {
  5188                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  5191             break;
  5193         case URX_STO_INP_LOC:
  5195                 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
  5196                 fp->fExtra[opValue] = fp->fInputIdx;
  5198             break;
  5200         case URX_JMPX:
  5202                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
  5203                 fp->fPatIdx += 1;
  5204                 int32_t dataLoc  = URX_VAL(pat[instrOperandLoc]);
  5205                 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
  5206                 int32_t savedInputIdx = (int32_t)fp->fExtra[dataLoc];
  5207                 U_ASSERT(savedInputIdx <= fp->fInputIdx);
  5208                 if (savedInputIdx < fp->fInputIdx) {
  5209                     fp->fPatIdx = opValue;                               // JMP
  5210                 } else {
  5211                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no progress in loop.
  5214             break;
  5216         case URX_LA_START:
  5218                 // Entering a lookahead block.
  5219                 // Save Stack Ptr, Input Pos.
  5220                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
  5221                 fData[opValue]   = fStack->size();
  5222                 fData[opValue+1] = fp->fInputIdx;
  5223                 fActiveStart     = fLookStart;          // Set the match region change for
  5224                 fActiveLimit     = fLookLimit;          //   transparent bounds.
  5226             break;
  5228         case URX_LA_END:
  5230                 // Leaving a look-ahead block.
  5231                 //  restore Stack Ptr, Input Pos to positions they had on entry to block.
  5232                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
  5233                 int32_t stackSize = fStack->size();
  5234                 int32_t newStackSize = (int32_t)fData[opValue];
  5235                 U_ASSERT(stackSize >= newStackSize);
  5236                 if (stackSize > newStackSize) {
  5237                     // Copy the current top frame back to the new (cut back) top frame.
  5238                     //   This makes the capture groups from within the look-ahead
  5239                     //   expression available.
  5240                     int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
  5241                     int32_t i;
  5242                     for (i=0; i<fFrameSize; i++) {
  5243                         newFP[i] = ((int64_t *)fp)[i];
  5245                     fp = (REStackFrame *)newFP;
  5246                     fStack->setSize(newStackSize);
  5248                 fp->fInputIdx = fData[opValue+1];
  5250                 // Restore the active region bounds in the input string; they may have
  5251                 //    been changed because of transparent bounds on a Region.
  5252                 fActiveStart = fRegionStart;
  5253                 fActiveLimit = fRegionLimit;
  5255             break;
  5257         case URX_ONECHAR_I:
  5258             if (fp->fInputIdx < fActiveLimit) {
  5259                 UChar32 c; 
  5260                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
  5261                 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
  5262                     break;
  5264             } else {
  5265                 fHitEnd = TRUE;
  5267             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  5268             break;
  5270         case URX_STRING_I:
  5271             // Case-insensitive test input against a literal string.
  5272             // Strings require two slots in the compiled pattern, one for the
  5273             //   offset to the string text, and one for the length.
  5274             //   The compiled string has already been case folded.
  5276                 const UChar *patternString = litText + opValue;
  5278                 op      = (int32_t)pat[fp->fPatIdx];
  5279                 fp->fPatIdx++;
  5280                 opType  = URX_TYPE(op);
  5281                 opValue = URX_VAL(op);
  5282                 U_ASSERT(opType == URX_STRING_LEN);
  5283                 int32_t patternStringLen = opValue;  // Length of the string from the pattern.
  5285                 UChar32      cText;
  5286                 UChar32      cPattern;
  5287                 UBool        success = TRUE;
  5288                 int32_t      patternStringIdx  = 0;
  5289                 CaseFoldingUCharIterator inputIterator(inputBuf, fp->fInputIdx, fActiveLimit);
  5290                 while (patternStringIdx < patternStringLen) {
  5291                     U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
  5292                     cText = inputIterator.next();
  5293                     if (cText != cPattern) {
  5294                         success = FALSE;
  5295                         if (cText == U_SENTINEL) {
  5296                             fHitEnd = TRUE;
  5298                         break;
  5301                 if (inputIterator.inExpansion()) {
  5302                     success = FALSE;
  5305                 if (success) {
  5306                     fp->fInputIdx = inputIterator.getIndex();
  5307                 } else {
  5308                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  5311             break;
  5313         case URX_LB_START:
  5315                 // Entering a look-behind block.
  5316                 // Save Stack Ptr, Input Pos.
  5317                 //   TODO:  implement transparent bounds.  Ticket #6067
  5318                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
  5319                 fData[opValue]   = fStack->size();
  5320                 fData[opValue+1] = fp->fInputIdx;
  5321                 // Init the variable containing the start index for attempted matches.
  5322                 fData[opValue+2] = -1;
  5323                 // Save input string length, then reset to pin any matches to end at
  5324                 //   the current position.
  5325                 fData[opValue+3] = fActiveLimit;
  5326                 fActiveLimit     = fp->fInputIdx;
  5328             break;
  5331         case URX_LB_CONT:
  5333                 // Positive Look-Behind, at top of loop checking for matches of LB expression
  5334                 //    at all possible input starting positions.
  5336                 // Fetch the min and max possible match lengths.  They are the operands
  5337                 //   of this op in the pattern.
  5338                 int32_t minML = (int32_t)pat[fp->fPatIdx++];
  5339                 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
  5340                 U_ASSERT(minML <= maxML);
  5341                 U_ASSERT(minML >= 0);
  5343                 // Fetch (from data) the last input index where a match was attempted.
  5344                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
  5345                 int64_t  *lbStartIdx = &fData[opValue+2];
  5346                 if (*lbStartIdx < 0) {
  5347                     // First time through loop.
  5348                     *lbStartIdx = fp->fInputIdx - minML;
  5349                 } else {
  5350                     // 2nd through nth time through the loop.
  5351                     // Back up start position for match by one.
  5352                     if (*lbStartIdx == 0) {
  5353                         (*lbStartIdx)--;
  5354                     } else {
  5355                         U16_BACK_1(inputBuf, 0, *lbStartIdx);
  5359                 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
  5360                     // We have tried all potential match starting points without
  5361                     //  getting a match.  Backtrack out, and out of the
  5362                     //   Look Behind altogether.
  5363                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  5364                     int64_t restoreInputLen = fData[opValue+3];
  5365                     U_ASSERT(restoreInputLen >= fActiveLimit);
  5366                     U_ASSERT(restoreInputLen <= fInputLength);
  5367                     fActiveLimit = restoreInputLen;
  5368                     break;
  5371                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
  5372                 //      (successful match will fall off the end of the loop.)
  5373                 fp = StateSave(fp, fp->fPatIdx-3, status);
  5374                 fp->fInputIdx =  *lbStartIdx;
  5376             break;
  5378         case URX_LB_END:
  5379             // End of a look-behind block, after a successful match.
  5381                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
  5382                 if (fp->fInputIdx != fActiveLimit) {
  5383                     //  The look-behind expression matched, but the match did not
  5384                     //    extend all the way to the point that we are looking behind from.
  5385                     //  FAIL out of here, which will take us back to the LB_CONT, which
  5386                     //     will retry the match starting at another position or fail
  5387                     //     the look-behind altogether, whichever is appropriate.
  5388                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  5389                     break;
  5392                 // Look-behind match is good.  Restore the orignal input string length,
  5393                 //   which had been truncated to pin the end of the lookbehind match to the 
  5394                 //   position being looked-behind.
  5395                 int64_t originalInputLen = fData[opValue+3];
  5396                 U_ASSERT(originalInputLen >= fActiveLimit);
  5397                 U_ASSERT(originalInputLen <= fInputLength);
  5398                 fActiveLimit = originalInputLen;
  5400             break;
  5403         case URX_LBN_CONT:
  5405                 // Negative Look-Behind, at top of loop checking for matches of LB expression
  5406                 //    at all possible input starting positions.
  5408                 // Fetch the extra parameters of this op.
  5409                 int32_t minML       = (int32_t)pat[fp->fPatIdx++];
  5410                 int32_t maxML       = (int32_t)pat[fp->fPatIdx++];
  5411                 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
  5412                 continueLoc = URX_VAL(continueLoc);
  5413                 U_ASSERT(minML <= maxML);
  5414                 U_ASSERT(minML >= 0);
  5415                 U_ASSERT(continueLoc > fp->fPatIdx);
  5417                 // Fetch (from data) the last input index where a match was attempted.
  5418                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
  5419                 int64_t  *lbStartIdx = &fData[opValue+2];
  5420                 if (*lbStartIdx < 0) {
  5421                     // First time through loop.
  5422                     *lbStartIdx = fp->fInputIdx - minML;
  5423                 } else {
  5424                     // 2nd through nth time through the loop.
  5425                     // Back up start position for match by one.
  5426                     if (*lbStartIdx == 0) {
  5427                         (*lbStartIdx)--;   // Because U16_BACK is unsafe starting at 0.
  5428                     } else {
  5429                         U16_BACK_1(inputBuf, 0, *lbStartIdx);
  5433                 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
  5434                     // We have tried all potential match starting points without
  5435                     //  getting a match, which means that the negative lookbehind as
  5436                     //  a whole has succeeded.  Jump forward to the continue location
  5437                     int64_t restoreInputLen = fData[opValue+3];
  5438                     U_ASSERT(restoreInputLen >= fActiveLimit);
  5439                     U_ASSERT(restoreInputLen <= fInputLength);
  5440                     fActiveLimit = restoreInputLen;
  5441                     fp->fPatIdx = continueLoc;
  5442                     break;
  5445                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
  5446                 //      (successful match will cause a FAIL out of the loop altogether.)
  5447                 fp = StateSave(fp, fp->fPatIdx-4, status);
  5448                 fp->fInputIdx =  *lbStartIdx;
  5450             break;
  5452         case URX_LBN_END:
  5453             // End of a negative look-behind block, after a successful match.
  5455                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
  5456                 if (fp->fInputIdx != fActiveLimit) {
  5457                     //  The look-behind expression matched, but the match did not
  5458                     //    extend all the way to the point that we are looking behind from.
  5459                     //  FAIL out of here, which will take us back to the LB_CONT, which
  5460                     //     will retry the match starting at another position or succeed
  5461                     //     the look-behind altogether, whichever is appropriate.
  5462                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  5463                     break;
  5466                 // Look-behind expression matched, which means look-behind test as
  5467                 //   a whole Fails
  5469                 //   Restore the orignal input string length, which had been truncated 
  5470                 //   inorder to pin the end of the lookbehind match  
  5471                 //   to the position being looked-behind.
  5472                 int64_t originalInputLen = fData[opValue+3];
  5473                 U_ASSERT(originalInputLen >= fActiveLimit);
  5474                 U_ASSERT(originalInputLen <= fInputLength);
  5475                 fActiveLimit = originalInputLen;
  5477                 // Restore original stack position, discarding any state saved
  5478                 //   by the successful pattern match.
  5479                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
  5480                 int32_t newStackSize = (int32_t)fData[opValue];
  5481                 U_ASSERT(fStack->size() > newStackSize);
  5482                 fStack->setSize(newStackSize);
  5484                 //  FAIL, which will take control back to someplace 
  5485                 //  prior to entering the look-behind test.
  5486                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  5488             break;
  5491         case URX_LOOP_SR_I:
  5492             // Loop Initialization for the optimized implementation of
  5493             //     [some character set]*
  5494             //   This op scans through all matching input.
  5495             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
  5497                 U_ASSERT(opValue > 0 && opValue < sets->size());
  5498                 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
  5499                 UnicodeSet   *s  = (UnicodeSet *)sets->elementAt(opValue);
  5501                 // Loop through input, until either the input is exhausted or
  5502                 //   we reach a character that is not a member of the set.
  5503                 int32_t ix = (int32_t)fp->fInputIdx;
  5504                 for (;;) {
  5505                     if (ix >= fActiveLimit) {
  5506                         fHitEnd = TRUE;
  5507                         break;
  5509                     UChar32   c;
  5510                     U16_NEXT(inputBuf, ix, fActiveLimit, c);
  5511                     if (c<256) {
  5512                         if (s8->contains(c) == FALSE) {
  5513                             U16_BACK_1(inputBuf, 0, ix);
  5514                             break;
  5516                     } else {
  5517                         if (s->contains(c) == FALSE) {
  5518                             U16_BACK_1(inputBuf, 0, ix);
  5519                             break;
  5524                 // If there were no matching characters, skip over the loop altogether.
  5525                 //   The loop doesn't run at all, a * op always succeeds.
  5526                 if (ix == fp->fInputIdx) {
  5527                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
  5528                     break;
  5531                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
  5532                 //   must follow.  It's operand is the stack location
  5533                 //   that holds the starting input index for the match of this [set]*
  5534                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
  5535                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
  5536                 int32_t stackLoc = URX_VAL(loopcOp);
  5537                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
  5538                 fp->fExtra[stackLoc] = fp->fInputIdx;
  5539                 fp->fInputIdx = ix;
  5541                 // Save State to the URX_LOOP_C op that follows this one,
  5542                 //   so that match failures in the following code will return to there.
  5543                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
  5544                 fp = StateSave(fp, fp->fPatIdx, status);
  5545                 fp->fPatIdx++;
  5547             break;
  5550         case URX_LOOP_DOT_I:
  5551             // Loop Initialization for the optimized implementation of .*
  5552             //   This op scans through all remaining input.
  5553             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
  5555                 // Loop through input until the input is exhausted (we reach an end-of-line)
  5556                 // In DOTALL mode, we can just go straight to the end of the input.
  5557                 int32_t ix;
  5558                 if ((opValue & 1) == 1) {
  5559                     // Dot-matches-All mode.  Jump straight to the end of the string.
  5560                     ix = (int32_t)fActiveLimit;
  5561                     fHitEnd = TRUE;
  5562                 } else {
  5563                     // NOT DOT ALL mode.  Line endings do not match '.'
  5564                     // Scan forward until a line ending or end of input.
  5565                     ix = (int32_t)fp->fInputIdx;
  5566                     for (;;) {
  5567                         if (ix >= fActiveLimit) {
  5568                             fHitEnd = TRUE;
  5569                             break;
  5571                         UChar32   c;
  5572                         U16_NEXT(inputBuf, ix, fActiveLimit, c);   // c = inputBuf[ix++]
  5573                         if ((c & 0x7f) <= 0x29) {          // Fast filter of non-new-line-s
  5574                             if ((c == 0x0a) ||             //  0x0a is newline in both modes.
  5575                                 (((opValue & 2) == 0) &&    // IF not UNIX_LINES mode
  5576                                    ((c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029))) {
  5577                                 //  char is a line ending.  Put the input pos back to the
  5578                                 //    line ending char, and exit the scanning loop.
  5579                                 U16_BACK_1(inputBuf, 0, ix);
  5580                                 break;
  5586                 // If there were no matching characters, skip over the loop altogether.
  5587                 //   The loop doesn't run at all, a * op always succeeds.
  5588                 if (ix == fp->fInputIdx) {
  5589                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
  5590                     break;
  5593                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
  5594                 //   must follow.  It's operand is the stack location
  5595                 //   that holds the starting input index for the match of this .*
  5596                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
  5597                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
  5598                 int32_t stackLoc = URX_VAL(loopcOp);
  5599                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
  5600                 fp->fExtra[stackLoc] = fp->fInputIdx;
  5601                 fp->fInputIdx = ix;
  5603                 // Save State to the URX_LOOP_C op that follows this one,
  5604                 //   so that match failures in the following code will return to there.
  5605                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
  5606                 fp = StateSave(fp, fp->fPatIdx, status);
  5607                 fp->fPatIdx++;
  5609             break;
  5612         case URX_LOOP_C:
  5614                 U_ASSERT(opValue>=0 && opValue<fFrameSize);
  5615                 backSearchIndex = (int32_t)fp->fExtra[opValue];
  5616                 U_ASSERT(backSearchIndex <= fp->fInputIdx);
  5617                 if (backSearchIndex == fp->fInputIdx) {
  5618                     // We've backed up the input idx to the point that the loop started.
  5619                     // The loop is done.  Leave here without saving state.   
  5620                     //  Subsequent failures won't come back here.
  5621                     break;
  5623                 // Set up for the next iteration of the loop, with input index
  5624                 //   backed up by one from the last time through,
  5625                 //   and a state save to this instruction in case the following code fails again.
  5626                 //   (We're going backwards because this loop emulates stack unwinding, not
  5627                 //    the initial scan forward.)
  5628                 U_ASSERT(fp->fInputIdx > 0);
  5629                 UChar32 prevC;
  5630                 U16_PREV(inputBuf, 0, fp->fInputIdx, prevC); // !!!: should this 0 be one of f*Limit?
  5632                 if (prevC == 0x0a && 
  5633                     fp->fInputIdx > backSearchIndex &&
  5634                     inputBuf[fp->fInputIdx-1] == 0x0d) {
  5635                     int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
  5636                     if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
  5637                         // .*, stepping back over CRLF pair.
  5638                         U16_BACK_1(inputBuf, 0, fp->fInputIdx);
  5643                 fp = StateSave(fp, fp->fPatIdx-1, status);
  5645             break;
  5649         default:
  5650             // Trouble.  The compiled pattern contains an entry with an
  5651             //           unrecognized type tag.
  5652             U_ASSERT(FALSE);
  5655         if (U_FAILURE(status)) {
  5656             isMatch = FALSE;
  5657             break;
  5661 breakFromLoop:
  5662     fMatch = isMatch;
  5663     if (isMatch) {
  5664         fLastMatchEnd = fMatchEnd;
  5665         fMatchStart   = startIdx;
  5666         fMatchEnd     = fp->fInputIdx;
  5667         if (fTraceDebug) {
  5668             REGEX_RUN_DEBUG_PRINTF(("Match.  start=%ld   end=%ld\n\n", fMatchStart, fMatchEnd));
  5671     else
  5673         if (fTraceDebug) {
  5674             REGEX_RUN_DEBUG_PRINTF(("No match\n\n"));
  5678     fFrame = fp;                // The active stack frame when the engine stopped.
  5679     //   Contains the capture group results that we need to
  5680     //    access later.
  5682     return;
  5686 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher)
  5688 U_NAMESPACE_END
  5690 #endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS

mercurial