intl/icu/source/i18n/rematch.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/i18n/rematch.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,5690 @@
     1.4 +/*
     1.5 +**************************************************************************
     1.6 +*   Copyright (C) 2002-2013 International Business Machines Corporation  *
     1.7 +*   and others. All rights reserved.                                     *
     1.8 +**************************************************************************
     1.9 +*/
    1.10 +//
    1.11 +//  file:  rematch.cpp
    1.12 +//
    1.13 +//         Contains the implementation of class RegexMatcher,
    1.14 +//         which is one of the main API classes for the ICU regular expression package.
    1.15 +//
    1.16 +
    1.17 +#include "unicode/utypes.h"
    1.18 +#if !UCONFIG_NO_REGULAR_EXPRESSIONS
    1.19 +
    1.20 +#include "unicode/regex.h"
    1.21 +#include "unicode/uniset.h"
    1.22 +#include "unicode/uchar.h"
    1.23 +#include "unicode/ustring.h"
    1.24 +#include "unicode/rbbi.h"
    1.25 +#include "unicode/utf.h"
    1.26 +#include "unicode/utf16.h"
    1.27 +#include "uassert.h"
    1.28 +#include "cmemory.h"
    1.29 +#include "uvector.h"
    1.30 +#include "uvectr32.h"
    1.31 +#include "uvectr64.h"
    1.32 +#include "regeximp.h"
    1.33 +#include "regexst.h"
    1.34 +#include "regextxt.h"
    1.35 +#include "ucase.h"
    1.36 +
    1.37 +// #include <malloc.h>        // Needed for heapcheck testing
    1.38 +
    1.39 +
    1.40 +// Find progress callback
    1.41 +// ----------------------
    1.42 +// Inlines the NULL test so ReportFindProgress() is only called when a callback is set.
    1.43 +// True when ReportFindProgress() returned FALSE.  Fully parenthesized for safe expansion.
    1.44 +#define REGEXFINDPROGRESS_INTERRUPT(pos, status)     \
    1.45 +    ((fFindProgressCallbackFn != NULL) && (ReportFindProgress((pos), (status)) == FALSE))
    1.46 +
    1.47 +
    1.48 +// Smart Backtracking
    1.49 +// ------------------
    1.50 +// When a failure would go back to a LOOP_C instruction,
    1.51 +// strings, characters, and setrefs scan backwards for a valid start
    1.52 +// character themselves, pop the stack, and save state, emulating the
    1.53 +// LOOP_C's effect but assured that the next character of input is a
    1.54 +// possible matching character.
    1.55 +//
    1.56 +// Good idea in theory; unfortunately it only helps out a few specific
    1.57 +// cases and slows the engine down a little in the rest.
    1.58 +
    1.59 +U_NAMESPACE_BEGIN
    1.60 +
    1.61 +// Default limit for the size of the backtrack stack, to avoid system
    1.62 +//    failures caused by heap exhaustion.  Units are in 32 bit words, not bytes.
    1.63 +// This value puts ICU's limits higher than most other regexp implementations,
    1.64 +//    which use recursion rather than the heap, and take more storage per
    1.65 +//    backtrack point.
    1.66 +// Used as the initial fStackLimit in init() and passed to setStackLimit() in init2().
    1.67 +static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000;
    1.68 +
    1.69 +// Time limit counter constant.
    1.70 +//   Time limits for expression evaluation are in terms of quanta of work by
    1.71 +//   the engine, each of which is 10,000 state saves.
    1.72 +//   This constant sets the number of state saves per tick.
    1.73 +static const int32_t TIMER_INITIAL_VALUE = 10000;
    1.74 +
    1.75 +//-----------------------------------------------------------------------------
    1.76 +//
    1.77 +//   Constructor and Destructor
    1.78 +//
    1.79 +//-----------------------------------------------------------------------------
    1.80 +RegexMatcher::RegexMatcher(const RegexPattern *pat)  {   // matcher for an existing pattern; fPatternOwned stays NULL, so the destructor does not delete pat
    1.81 +    fDeferredStatus = U_ZERO_ERROR;
    1.82 +    init(fDeferredStatus);                // no status parameter here, so errors are kept in fDeferredStatus
    1.83 +    if (U_FAILURE(fDeferredStatus)) {
    1.84 +        return;
    1.85 +    }
    1.86 +    if (pat==NULL) {                      // a NULL pattern is recorded as U_ILLEGAL_ARGUMENT_ERROR
    1.87 +        fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR;
    1.88 +        return;
    1.89 +    }
    1.90 +    fPattern = pat;
    1.91 +    init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus);   // initial input is the shared fEmptyText
    1.92 +}
    1.93 +// Constructor taking the pattern and the input as UnicodeStrings.
    1.94 +//   The pattern is compiled here and kept in fPatternOwned, which the
    1.95 +//   destructor deletes.  Failures are reported through status.
    1.96 +RegexMatcher::RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
    1.97 +                           uint32_t flags, UErrorCode &status) {
    1.98 +    init(status);
    1.99 +    if (U_FAILURE(status)) {
   1.100 +        return;
   1.101 +    }
   1.102 +    UParseError    pe;   // receives parse-error details from compile(); not read afterwards
   1.103 +    fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
   1.104 +    fPattern           = fPatternOwned;
   1.105 +    // No status check follows compile(): init2() returns at once when status is a failure.
   1.106 +    UText inputText = UTEXT_INITIALIZER;
   1.107 +    utext_openConstUnicodeString(&inputText, &input, &status);
   1.108 +    init2(&inputText, status);
   1.109 +    utext_close(&inputText);   // closes the stack-local UText opened above
   1.110 +
   1.111 +    fInputUniStrMaybeMutable = TRUE;    // NOTE(review): presumably flags that the caller's string may change later - confirm against its readers
   1.112 +}
   1.113 +// Constructor taking the pattern and the input as UTexts.  The compiled
   1.114 +//   pattern is kept in fPatternOwned; failures are reported through status.
   1.115 +RegexMatcher::RegexMatcher(UText *regexp, UText *input,
   1.116 +                           uint32_t flags, UErrorCode &status) {
   1.117 +    init(status);
   1.118 +    if (U_FAILURE(status)) {
   1.119 +        return;
   1.120 +    }
   1.121 +    UParseError    pe;   // receives parse-error details from compile(); not read afterwards
   1.122 +    fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
   1.123 +    if (U_FAILURE(status)) {
   1.124 +        return;
   1.125 +    }
   1.126 +
   1.127 +    fPattern           = fPatternOwned;
   1.128 +    init2(input, status);   // finishes setup with the caller's input text
   1.129 +}
   1.130 +// Constructor taking only a UnicodeString pattern.  The matcher starts with
   1.131 +//   RegexStaticSets::gStaticSets->fEmptyText as its input.
   1.132 +RegexMatcher::RegexMatcher(const UnicodeString &regexp, 
   1.133 +                           uint32_t flags, UErrorCode &status) {
   1.134 +    init(status);
   1.135 +    if (U_FAILURE(status)) {
   1.136 +        return;
   1.137 +    }
   1.138 +    UParseError    pe;   // receives parse-error details from compile(); not read afterwards
   1.139 +    fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
   1.140 +    if (U_FAILURE(status)) {
   1.141 +        return;
   1.142 +    }
   1.143 +    fPattern           = fPatternOwned;   // the compiled pattern is owned, so the destructor deletes it
   1.144 +    init2(RegexStaticSets::gStaticSets->fEmptyText, status);
   1.145 +}
   1.146 +// Constructor taking only a UText pattern; the input starts as the shared fEmptyText.
   1.147 +RegexMatcher::RegexMatcher(UText *regexp, 
   1.148 +                           uint32_t flags, UErrorCode &status) {
   1.149 +    init(status);
   1.150 +    if (U_FAILURE(status)) {
   1.151 +        return;
   1.152 +    }
   1.153 +    UParseError    pe;   // receives parse-error details from compile(); not read afterwards
   1.154 +    fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
   1.155 +        if (U_FAILURE(status)) {
   1.156 +        return;
   1.157 +    }
   1.158 +
   1.159 +    fPattern           = fPatternOwned;   // the compiled pattern is owned, so the destructor deletes it
   1.160 +    init2(RegexStaticSets::gStaticSets->fEmptyText, status);
   1.161 +}
   1.162 +
   1.163 +
   1.164 +// Destructor.  Releases fStack, a heap-allocated fData, the pattern when this
   1.165 +//   matcher owns it, the input objects, and (when built in) the word break iterator.
   1.166 +RegexMatcher::~RegexMatcher() {
   1.167 +    delete fStack;
   1.168 +    if (fData != fSmallData) {        // fData was pointed at a uprv_malloc'd buffer by init2()
   1.169 +        uprv_free(fData);
   1.170 +        fData = NULL;
   1.171 +    }
   1.172 +    if (fPatternOwned) {              // only set by the constructors that compile the pattern themselves
   1.173 +        delete fPatternOwned;
   1.174 +        fPatternOwned = NULL;
   1.175 +        fPattern = NULL;
   1.176 +    }
   1.177 +    
   1.178 +    if (fInput) {
   1.179 +        delete fInput;
   1.180 +    }
   1.181 +    if (fInputText) {
   1.182 +        utext_close(fInputText);
   1.183 +    }
   1.184 +    if (fAltInputText) {
   1.185 +        utext_close(fAltInputText);
   1.186 +    }
   1.187 +    
   1.188 +    #if UCONFIG_NO_BREAK_ITERATION==0
   1.189 +    delete fWordBreakItr;
   1.190 +    #endif
   1.191 +}
   1.192 +
   1.193 +//
   1.194 +//   init()   common initialization for use by all constructors.
   1.195 +//            Initialize all fields, get the object into a consistent state.
   1.196 +//            This must be done even when the initial status shows an error,
   1.197 +//            so that the object is initialized sufficiently well for the destructor
   1.198 +//            to run safely.
   1.199 +//            status is only read here; its value is copied into fDeferredStatus.
   1.200 +void RegexMatcher::init(UErrorCode &status) {
   1.201 +    fPattern           = NULL;
   1.202 +    fPatternOwned      = NULL;
   1.203 +    fFrameSize         = 0;
   1.204 +    fRegionStart       = 0;
   1.205 +    fRegionLimit       = 0;
   1.206 +    fAnchorStart       = 0;
   1.207 +    fAnchorLimit       = 0;
   1.208 +    fLookStart         = 0;
   1.209 +    fLookLimit         = 0;
   1.210 +    fActiveStart       = 0;
   1.211 +    fActiveLimit       = 0;
   1.212 +    fTransparentBounds = FALSE;
   1.213 +    fAnchoringBounds   = TRUE;
   1.214 +    fMatch             = FALSE;
   1.215 +    fMatchStart        = 0;
   1.216 +    fMatchEnd          = 0;
   1.217 +    fLastMatchEnd      = -1;          // find() tests this for >= 0, so -1 marks "no earlier match"
   1.218 +    fAppendPosition    = 0;
   1.219 +    fHitEnd            = FALSE;
   1.220 +    fRequireEnd        = FALSE;
   1.221 +    fStack             = NULL;
   1.222 +    fFrame             = NULL;
   1.223 +    fTimeLimit         = 0;
   1.224 +    fTime              = 0;
   1.225 +    fTickCounter       = 0;
   1.226 +    fStackLimit        = DEFAULT_BACKTRACK_STACK_CAPACITY;
   1.227 +    fCallbackFn        = NULL;
   1.228 +    fCallbackContext   = NULL;
   1.229 +    fFindProgressCallbackFn      = NULL;
   1.230 +    fFindProgressCallbackContext = NULL;
   1.231 +    fTraceDebug        = FALSE;
   1.232 +    fDeferredStatus    = status;
   1.233 +    fData              = fSmallData;  // start on the inline buffer; init2() allocates when the pattern needs more
   1.234 +    fWordBreakItr      = NULL;
   1.235 +    
   1.236 +    fStack             = NULL;        // already cleared above; the repeated assignment is harmless
   1.237 +    fInputText         = NULL;
   1.238 +    fAltInputText      = NULL;
   1.239 +    fInput             = NULL;
   1.240 +    fInputLength       = 0;
   1.241 +    fInputUniStrMaybeMutable = FALSE;
   1.242 +
   1.243 +    if (U_FAILURE(status)) {          // stores the same value fDeferredStatus was given above
   1.244 +        fDeferredStatus = status;
   1.245 +    }
   1.246 +}
   1.247 +
   1.248 +//
   1.249 +//  init2()   Common initialization for use by RegexMatcher constructors, part 2.
   1.250 +//            This handles the common setup to be done after the Pattern is available.
   1.251 +//            Any failure is stored in both status and fDeferredStatus.
   1.252 +void RegexMatcher::init2(UText *input, UErrorCode &status) {
   1.253 +    if (U_FAILURE(status)) {
   1.254 +        fDeferredStatus = status;
   1.255 +        return;
   1.256 +    }
   1.257 +
   1.258 +    if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(fSmallData[0]))) {   // pattern data does not fit the inline array
   1.259 +        fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t)); 
   1.260 +        if (fData == NULL) {
   1.261 +            status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
   1.262 +            return;
   1.263 +        }
   1.264 +    }
   1.265 +
   1.266 +    fStack = new UVector64(status);
   1.267 +    if (fStack == NULL) {
   1.268 +        status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
   1.269 +        return;
   1.270 +    }
   1.271 +
   1.272 +    reset(input);   // runs before the status written by the UVector64 constructor is examined below
   1.273 +    setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status);
   1.274 +    if (U_FAILURE(status)) {
   1.275 +        fDeferredStatus = status;
   1.276 +        return;
   1.277 +    }
   1.278 +}
   1.279 +
   1.280 +
   1.281 +static const UChar BACKSLASH  = 0x5c;   // U+005C REVERSE SOLIDUS, introduces an escape in replacement text
   1.282 +static const UChar DOLLARSIGN = 0x24;   // U+0024 DOLLAR SIGN, introduces a capture group reference
   1.283 +//--------------------------------------------------------------------------------
   1.284 +//
   1.285 +//    appendReplacement     UnicodeString mode.  Wraps dest and replacement in
   1.286 +//                          stack UTexts and forwards to the UText overload.
   1.287 +//--------------------------------------------------------------------------------
   1.288 +RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest,
   1.289 +                                              const UnicodeString &replacement,
   1.290 +                                              UErrorCode &status) {
   1.291 +    UText replacementText = UTEXT_INITIALIZER;
   1.292 +    
   1.293 +    utext_openConstUnicodeString(&replacementText, &replacement, &status);
   1.294 +    if (U_SUCCESS(status)) {        
   1.295 +        UText resultText = UTEXT_INITIALIZER;
   1.296 +        utext_openUnicodeString(&resultText, &dest, &status);
   1.297 +        
   1.298 +        if (U_SUCCESS(status)) {
   1.299 +            appendReplacement(&resultText, &replacementText, status);
   1.300 +            utext_close(&resultText);
   1.301 +        }
   1.302 +        utext_close(&replacementText);   // each wrapper is closed only on the path where its open succeeded
   1.303 +    }
   1.304 +    
   1.305 +    return *this;
   1.306 +}
   1.307 +
   1.308 +//
   1.309 +//    appendReplacement, UText mode.  Appends the unmatched input before the current
   1.310 +//    match, then the replacement text with backslash escapes and $n group references expanded.
   1.311 +RegexMatcher &RegexMatcher::appendReplacement(UText *dest,
   1.312 +                                              UText *replacement,
   1.313 +                                              UErrorCode &status) {
   1.314 +    if (U_FAILURE(status)) {
   1.315 +        return *this;
   1.316 +    }
   1.317 +    if (U_FAILURE(fDeferredStatus)) {
   1.318 +        status = fDeferredStatus;
   1.319 +        return *this;
   1.320 +    }
   1.321 +    if (fMatch == FALSE) {               // a successful match is required before appending
   1.322 +        status = U_REGEX_INVALID_STATE;
   1.323 +        return *this;
   1.324 +    }
   1.325 +    
   1.326 +    // Copy input string from the end of previous match to start of current match
   1.327 +    int64_t  destLen = utext_nativeLength(dest);   // running length of dest; every append below goes at this index
   1.328 +    if (fMatchStart > fAppendPosition) {
   1.329 +        if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
   1.330 +            destLen += utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition, 
   1.331 +                                     (int32_t)(fMatchStart-fAppendPosition), &status);
   1.332 +        } else {
   1.333 +            int32_t len16;
   1.334 +            if (UTEXT_USES_U16(fInputText)) {
   1.335 +                len16 = (int32_t)(fMatchStart-fAppendPosition);
   1.336 +            } else {
   1.337 +                UErrorCode lengthStatus = U_ZERO_ERROR;   // separate status so this sizing call cannot disturb the caller's status
   1.338 +                len16 = utext_extract(fInputText, fAppendPosition, fMatchStart, NULL, 0, &lengthStatus);   // NULL buffer: only the returned length is used
   1.339 +            }
   1.340 +            UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));   // +1 matches the len16+1 capacity passed to utext_extract() below
   1.341 +            if (inputChars == NULL) {
   1.342 +                status = U_MEMORY_ALLOCATION_ERROR;
   1.343 +                return *this;
   1.344 +            }
   1.345 +            utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars, len16+1, &status);
   1.346 +            destLen += utext_replace(dest, destLen, destLen, inputChars, len16, &status);
   1.347 +            uprv_free(inputChars);
   1.348 +        }
   1.349 +    }
   1.350 +    fAppendPosition = fMatchEnd;         // the next append starts after this match
   1.351 +    
   1.352 +    
   1.353 +    // scan the replacement text, looking for substitutions ($n) and \escapes.
   1.354 +    //  TODO:  optimize this loop by efficiently scanning for '$' or '\',
   1.355 +    //         move entire ranges not containing substitutions.
   1.356 +    UTEXT_SETNATIVEINDEX(replacement, 0);
   1.357 +    UChar32 c = UTEXT_NEXT32(replacement);
   1.358 +    while (c != U_SENTINEL) {
   1.359 +        if (c == BACKSLASH) {
   1.360 +            // Backslash Escape.  Copy the following char out without further checks.
   1.361 +            //                    Note:  Surrogate pairs don't need any special handling
   1.362 +            //                           The second half wont be a '$' or a '\', and
   1.363 +            //                           will move to the dest normally on the next
   1.364 +            //                           loop iteration.
   1.365 +            c = UTEXT_CURRENT32(replacement);
   1.366 +            if (c == U_SENTINEL) {       // a backslash at the very end of the replacement ends the scan
   1.367 +                break;
   1.368 +            }
   1.369 +            
   1.370 +            if (c==0x55/*U*/ || c==0x75/*u*/) {
   1.371 +                // We have a \udddd or \Udddddddd escape sequence.
   1.372 +                int32_t offset = 0;
   1.373 +                struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement);
   1.374 +                UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
   1.375 +                if (escapedChar != (UChar32)0xFFFFFFFF) {   // 0xFFFFFFFF is treated as "could not unescape"; nothing is emitted in that case
   1.376 +                    if (U_IS_BMP(escapedChar)) {
   1.377 +                        UChar c16 = (UChar)escapedChar;
   1.378 +                        destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
   1.379 +                    } else {
   1.380 +                        UChar surrogate[2];
   1.381 +                        surrogate[0] = U16_LEAD(escapedChar);
   1.382 +                        surrogate[1] = U16_TRAIL(escapedChar);
   1.383 +                        if (U_SUCCESS(status)) {
   1.384 +                            destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
   1.385 +                        }
   1.386 +                    }
   1.387 +                    // TODO:  Report errors for mal-formed \u escapes?
   1.388 +                    //        As this is, the original sequence is output, which may be OK.
   1.389 +                    if (context.lastOffset == offset) {   // NOTE(review): re-syncs replacement's index with what u_unescapeAt() consumed; depends on lastOffset semantics defined outside this chunk
   1.390 +                        (void)UTEXT_PREVIOUS32(replacement);
   1.391 +                    } else if (context.lastOffset != offset-1) {
   1.392 +                        utext_moveIndex32(replacement, offset - context.lastOffset - 1);
   1.393 +                    }
   1.394 +                }
   1.395 +            } else {
   1.396 +                (void)UTEXT_NEXT32(replacement);
   1.397 +                // Plain backslash escape.  Just put out the escaped character.
   1.398 +                if (U_IS_BMP(c)) {
   1.399 +                    UChar c16 = (UChar)c;
   1.400 +                    destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
   1.401 +                } else {
   1.402 +                    UChar surrogate[2];
   1.403 +                    surrogate[0] = U16_LEAD(c);
   1.404 +                    surrogate[1] = U16_TRAIL(c);
   1.405 +                    if (U_SUCCESS(status)) {
   1.406 +                        destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
   1.407 +                    }
   1.408 +                }
   1.409 +            }
   1.410 +        } else if (c != DOLLARSIGN) {
   1.411 +            // Normal char, not a $.  Copy it out without further checks.
   1.412 +            if (U_IS_BMP(c)) {
   1.413 +                UChar c16 = (UChar)c;
   1.414 +                destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
   1.415 +            } else {
   1.416 +                UChar surrogate[2];
   1.417 +                surrogate[0] = U16_LEAD(c);
   1.418 +                surrogate[1] = U16_TRAIL(c);
   1.419 +                if (U_SUCCESS(status)) {
   1.420 +                    destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
   1.421 +                }
   1.422 +            }
   1.423 +        } else {
   1.424 +            // We've got a $.  Pick up a capture group number if one follows.
   1.425 +            // Consume at most the number of digits necessary for the largest capture
   1.426 +            // number that is valid for this pattern.
   1.427 +            
   1.428 +            int32_t numDigits = 0;
   1.429 +            int32_t groupNum  = 0;
   1.430 +            UChar32 digitC;
   1.431 +            for (;;) {
   1.432 +                digitC = UTEXT_CURRENT32(replacement);
   1.433 +                if (digitC == U_SENTINEL) {
   1.434 +                    break;
   1.435 +                }
   1.436 +                if (u_isdigit(digitC) == FALSE) {
   1.437 +                    break;
   1.438 +                }
   1.439 +                (void)UTEXT_NEXT32(replacement);
   1.440 +                groupNum=groupNum*10 + u_charDigitValue(digitC);
   1.441 +                numDigits++;
   1.442 +                if (numDigits >= fPattern->fMaxCaptureDigits) {   // digit budget for this pattern is used up
   1.443 +                    break;
   1.444 +                }
   1.445 +            }
   1.446 +            
   1.447 +            
   1.448 +            if (numDigits == 0) {
   1.449 +                // The $ didn't introduce a group number at all.
   1.450 +                // Treat it as just part of the substitution text.
   1.451 +                UChar c16 = DOLLARSIGN;
   1.452 +                destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
   1.453 +            } else {
   1.454 +                // Finally, append the capture group data to the destination.
   1.455 +                destLen += appendGroup(groupNum, dest, status);
   1.456 +                if (U_FAILURE(status)) {
   1.457 +                    // Can fail if group number is out of range.
   1.458 +                    break;
   1.459 +                }
   1.460 +            }
   1.461 +        }
   1.462 +        
   1.463 +        if (U_FAILURE(status)) {         // stop scanning as soon as any append has failed
   1.464 +            break;
   1.465 +        } else {
   1.466 +            c = UTEXT_NEXT32(replacement);
   1.467 +        }
   1.468 +    }
   1.469 +    
   1.470 +    return *this;
   1.471 +}
   1.472 +
   1.473 +
   1.474 +
   1.475 +//--------------------------------------------------------------------------------
   1.476 +//
   1.477 +//    appendTail     Intended to be used in conjunction with appendReplacement()
   1.478 +//                   To the destination string, append everything following
   1.479 +//                   the last match position from the input string.
   1.480 +//
   1.481 +//                   Note:  Match ranges do not affect appendTail or appendReplacement
   1.482 +//                   This overload has no status parameter; errors stay in a local status.
   1.483 +//--------------------------------------------------------------------------------
   1.484 +UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) {
   1.485 +    UErrorCode status = U_ZERO_ERROR;
   1.486 +    UText resultText = UTEXT_INITIALIZER;
   1.487 +    utext_openUnicodeString(&resultText, &dest, &status);   // wrap dest so the UText overload can append to it
   1.488 +    
   1.489 +    if (U_SUCCESS(status)) {
   1.490 +        appendTail(&resultText, status);
   1.491 +        utext_close(&resultText);
   1.492 +    }
   1.493 +    
   1.494 +    return dest;
   1.495 +}
   1.496 +
   1.497 +//
   1.498 +//   appendTail, UText mode.  Appends the input from fAppendPosition up to
   1.499 +//       fInputLength onto the end of dest and returns dest.
   1.500 +UText *RegexMatcher::appendTail(UText *dest, UErrorCode &status) {
   1.501 +    // Nothing is appended if status or the matcher's deferred status is already a
   1.502 +    // failure; a deferred failure is copied into status so the caller sees it.
   1.503 +    if (U_FAILURE(status)) {
   1.504 +        return dest;
   1.505 +    }
   1.506 +    if (U_FAILURE(fDeferredStatus)) {
   1.507 +        status = fDeferredStatus;
   1.508 +        return dest;
   1.509 +    }
   1.510 +    if (dest == NULL) {
   1.511 +        //  dest must not be NULL; report it rather than dereferencing it below.
   1.512 +        status = U_ILLEGAL_ARGUMENT_ERROR;
   1.513 +        return dest;
   1.514 +    }
   1.515 +
   1.516 +    // Copy straight from the chunk buffer when UTEXT_FULL_TEXT_IN_CHUNK allows it;
   1.517 +    // otherwise extract the tail into a temporary UTF-16 buffer first.
   1.518 +    if (fInputLength > fAppendPosition) {
   1.519 +        if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
   1.520 +            int64_t destLen = utext_nativeLength(dest);
   1.521 +            utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition, 
   1.522 +                          (int32_t)(fInputLength-fAppendPosition), &status);
   1.523 +        } else {
   1.524 +            int32_t len16;
   1.525 +            if (UTEXT_USES_U16(fInputText)) {
   1.526 +                len16 = (int32_t)(fInputLength-fAppendPosition);
   1.527 +            } else {
   1.528 +                len16 = utext_extract(fInputText, fAppendPosition, fInputLength, NULL, 0, &status);
   1.529 +                status = U_ZERO_ERROR; // discard the buffer overflow reported by the sizing call
   1.530 +            }
   1.531 +            
   1.532 +            UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16));
   1.533 +            if (inputChars == NULL) {
   1.534 +                status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; // also tell the caller, as init2() does
   1.535 +            } else {
   1.536 +                utext_extract(fInputText, fAppendPosition, fInputLength, inputChars, len16, &status); // unterminated 
   1.537 +                int64_t destLen = utext_nativeLength(dest);
   1.538 +                utext_replace(dest, destLen, destLen, inputChars, len16, &status);
   1.539 +                uprv_free(inputChars);
   1.540 +            }
   1.541 +        }
   1.542 +    }
   1.543 +    return dest;
   1.544 +}
   1.545 +
   1.546 +
   1.547 +
   1.548 +//--------------------------------------------------------------------------------
   1.549 +//
   1.550 +//   end      The no-group overloads forward to the group overloads with group 0,
   1.551 +//            for which end64(group, err) returns fMatchEnd.
   1.552 +//--------------------------------------------------------------------------------
   1.553 +int32_t RegexMatcher::end(UErrorCode &err) const {
   1.554 +    return end(0, err);
   1.555 +}
   1.556 +// 64-bit variant of end(err): forwards to end64(group, err) with group 0.
   1.557 +int64_t RegexMatcher::end64(UErrorCode &err) const {
   1.558 +    return end64(0, err);
   1.559 +}
   1.560 +// Returns the end position recorded for a capture group (group 0 is the whole match), or -1 with err set on error.
   1.561 +int64_t RegexMatcher::end64(int32_t group, UErrorCode &err) const {
   1.562 +    if (U_FAILURE(err)) {
   1.563 +        return -1;
   1.564 +    }
   1.565 +    if (fMatch == FALSE) {               // there is no current match to report on
   1.566 +        err = U_REGEX_INVALID_STATE;
   1.567 +        return -1;
   1.568 +    }
   1.569 +    if (group < 0 || group > fPattern->fGroupMap->size()) {   // valid groups are 0 .. size(); group N is looked up at index N-1 below
   1.570 +        err = U_INDEX_OUTOFBOUNDS_ERROR;
   1.571 +        return -1;
   1.572 +    }
   1.573 +    int64_t e = -1;
   1.574 +    if (group == 0) {
   1.575 +        e = fMatchEnd; 
   1.576 +    } else {
   1.577 +        // Get the position within the stack frame of the variables for
   1.578 +        //    this capture group.
   1.579 +        int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
   1.580 +        U_ASSERT(groupOffset < fPattern->fFrameSize);
   1.581 +        U_ASSERT(groupOffset >= 0);
   1.582 +        e = fFrame->fExtra[groupOffset + 1];   // the group's end is read from the slot after groupOffset
   1.583 +    }
   1.584 +    
   1.585 +        return e;
   1.586 +}
   1.587 +// 32-bit variant: returns end64(group, err) cast down to int32_t.
   1.588 +int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const {
   1.589 +    return (int32_t)end64(group, err);
   1.590 +}
   1.591 +
   1.592 +
   1.593 +//--------------------------------------------------------------------------------
   1.594 +//
   1.595 +//   find()
   1.596 +//
   1.597 +//--------------------------------------------------------------------------------
   1.598 +UBool RegexMatcher::find() {
   1.599 +    // Start at the position of the last match end.  (Will be zero if the
   1.600 +    //   matcher has been reset.)
   1.601 +    //
   1.602 +    if (U_FAILURE(fDeferredStatus)) {
   1.603 +        return FALSE;
   1.604 +    }
   1.605 +    
   1.606 +    if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
   1.607 +        return findUsingChunk();
   1.608 +    }
   1.609 +
   1.610 +    int64_t startPos = fMatchEnd;
   1.611 +    if (startPos==0) {
   1.612 +        startPos = fActiveStart;
   1.613 +    }
   1.614 +
   1.615 +    if (fMatch) {
   1.616 +        // Save the position of any previous successful match.
   1.617 +        fLastMatchEnd = fMatchEnd;
   1.618 +
   1.619 +        if (fMatchStart == fMatchEnd) {
   1.620 +            // Previous match had zero length.  Move start position up one position
   1.621 +            //  to avoid sending find() into a loop on zero-length matches.
   1.622 +            if (startPos >= fActiveLimit) {
   1.623 +                fMatch = FALSE;
   1.624 +                fHitEnd = TRUE;
   1.625 +                return FALSE;
   1.626 +            }
   1.627 +            UTEXT_SETNATIVEINDEX(fInputText, startPos);
   1.628 +            (void)UTEXT_NEXT32(fInputText);
   1.629 +            startPos = UTEXT_GETNATIVEINDEX(fInputText);
   1.630 +        }
   1.631 +    } else {
   1.632 +        if (fLastMatchEnd >= 0) {
   1.633 +            // A previous find() failed to match.  Don't try again.
   1.634 +            //   (without this test, a pattern with a zero-length match
   1.635 +            //    could match again at the end of an input string.)
   1.636 +            fHitEnd = TRUE;
   1.637 +            return FALSE;
   1.638 +        }
   1.639 +    }
   1.640 +
   1.641 +
   1.642 +    // Compute the position in the input string beyond which a match can not begin, because
   1.643 +    //   the minimum length match would extend past the end of the input.
   1.644 +    //   Note:  some patterns that cannot match anything will have fMinMatchLength==Max Int.
   1.645 +    //          Be aware of possible overflows if making changes here.
   1.646 +    int64_t testStartLimit;
   1.647 +    if (UTEXT_USES_U16(fInputText)) {
   1.648 +        testStartLimit = fActiveLimit - fPattern->fMinMatchLen;
   1.649 +        if (startPos > testStartLimit) {
   1.650 +            fMatch = FALSE;
   1.651 +            fHitEnd = TRUE;
   1.652 +            return FALSE;
   1.653 +        }
   1.654 +    } else {
   1.655 +        // For now, let the matcher discover that it can't match on its own
   1.656 +        // We don't know how long the match len is in native characters
   1.657 +        testStartLimit = fActiveLimit;
   1.658 +    }
   1.659 +
   1.660 +    UChar32  c;
   1.661 +    U_ASSERT(startPos >= 0);
   1.662 +
   1.663 +    switch (fPattern->fStartType) {
   1.664 +    case START_NO_INFO:
   1.665 +        // No optimization was found. 
   1.666 +        //  Try a match at each input position.
   1.667 +        for (;;) {
   1.668 +            MatchAt(startPos, FALSE, fDeferredStatus);
   1.669 +            if (U_FAILURE(fDeferredStatus)) {
   1.670 +                return FALSE;
   1.671 +            }
   1.672 +            if (fMatch) {
   1.673 +                return TRUE;
   1.674 +            }
   1.675 +            if (startPos >= testStartLimit) {
   1.676 +                fHitEnd = TRUE;
   1.677 +                return FALSE;
   1.678 +            }
   1.679 +            UTEXT_SETNATIVEINDEX(fInputText, startPos);
   1.680 +            (void)UTEXT_NEXT32(fInputText);
   1.681 +            startPos = UTEXT_GETNATIVEINDEX(fInputText);
   1.682 +            // Note that it's perfectly OK for a pattern to have a zero-length
   1.683 +            //   match at the end of a string, so we must make sure that the loop
   1.684 +            //   runs with startPos == testStartLimit the last time through.
   1.685 +            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
   1.686 +                return FALSE;
   1.687 +        }
   1.688 +        U_ASSERT(FALSE);
   1.689 +
   1.690 +    case START_START:
   1.691 +        // Matches are only possible at the start of the input string
   1.692 +        //   (pattern begins with ^ or \A)
   1.693 +        if (startPos > fActiveStart) {
   1.694 +            fMatch = FALSE;
   1.695 +            return FALSE;
   1.696 +        }
   1.697 +        MatchAt(startPos, FALSE, fDeferredStatus);
   1.698 +        if (U_FAILURE(fDeferredStatus)) {
   1.699 +            return FALSE;
   1.700 +        }
   1.701 +        return fMatch;
   1.702 +
   1.703 +
   1.704 +    case START_SET:
   1.705 +        {
   1.706 +            // Match may start on any char from a pre-computed set.
   1.707 +            U_ASSERT(fPattern->fMinMatchLen > 0);
   1.708 +            int64_t pos;
   1.709 +            UTEXT_SETNATIVEINDEX(fInputText, startPos);
   1.710 +            for (;;) {
   1.711 +                c = UTEXT_NEXT32(fInputText);
   1.712 +                pos = UTEXT_GETNATIVEINDEX(fInputText);
   1.713 +                // c will be -1 (U_SENTINEL) at end of text, in which case we
   1.714 +                // skip this next block (so we don't have a negative array index)
   1.715 +                // and handle end of text in the following block.
   1.716 +                if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c)) ||
   1.717 +                              (c>=256 && fPattern->fInitialChars->contains(c)))) {
   1.718 +                    MatchAt(startPos, FALSE, fDeferredStatus);
   1.719 +                    if (U_FAILURE(fDeferredStatus)) {
   1.720 +                        return FALSE;
   1.721 +                    }
   1.722 +                    if (fMatch) {
   1.723 +                        return TRUE;
   1.724 +                    }
   1.725 +                    UTEXT_SETNATIVEINDEX(fInputText, pos);
   1.726 +                }
   1.727 +                if (startPos >= testStartLimit) {
   1.728 +                    fMatch = FALSE;
   1.729 +                    fHitEnd = TRUE;
   1.730 +                    return FALSE;
   1.731 +                }
   1.732 +                startPos = pos;
   1.733 +	            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
   1.734 +                    return FALSE;
   1.735 +            }
   1.736 +        }
   1.737 +        U_ASSERT(FALSE);
   1.738 +
   1.739 +    case START_STRING:
   1.740 +    case START_CHAR:
   1.741 +        {
   1.742 +            // Match starts on exactly one char.
   1.743 +            U_ASSERT(fPattern->fMinMatchLen > 0);
   1.744 +            UChar32 theChar = fPattern->fInitialChar;
   1.745 +            int64_t pos;
   1.746 +            UTEXT_SETNATIVEINDEX(fInputText, startPos);
   1.747 +            for (;;) {
   1.748 +                c = UTEXT_NEXT32(fInputText);
   1.749 +                pos = UTEXT_GETNATIVEINDEX(fInputText);
   1.750 +                if (c == theChar) {
   1.751 +                    MatchAt(startPos, FALSE, fDeferredStatus);
   1.752 +                    if (U_FAILURE(fDeferredStatus)) {
   1.753 +                        return FALSE;
   1.754 +                    }
   1.755 +                    if (fMatch) {
   1.756 +                        return TRUE;
   1.757 +                    }
   1.758 +                    UTEXT_SETNATIVEINDEX(fInputText, pos);
   1.759 +                }
   1.760 +                if (startPos >= testStartLimit) {
   1.761 +                    fMatch = FALSE;
   1.762 +                    fHitEnd = TRUE;
   1.763 +                    return FALSE;
   1.764 +                }
   1.765 +                startPos = pos;
   1.766 +	            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
   1.767 +                    return FALSE;
   1.768 +           }
   1.769 +        }
   1.770 +        U_ASSERT(FALSE);
   1.771 +
   1.772 +    case START_LINE:
   1.773 +        {
   1.774 +            UChar32  c;
   1.775 +            if (startPos == fAnchorStart) {
   1.776 +                MatchAt(startPos, FALSE, fDeferredStatus);
   1.777 +                if (U_FAILURE(fDeferredStatus)) {
   1.778 +                    return FALSE;
   1.779 +                }
   1.780 +                if (fMatch) {
   1.781 +                    return TRUE;
   1.782 +                }
   1.783 +                UTEXT_SETNATIVEINDEX(fInputText, startPos);
   1.784 +                c = UTEXT_NEXT32(fInputText);
   1.785 +                startPos = UTEXT_GETNATIVEINDEX(fInputText);
   1.786 +            } else {
   1.787 +                UTEXT_SETNATIVEINDEX(fInputText, startPos);
   1.788 +                c = UTEXT_PREVIOUS32(fInputText);
   1.789 +                UTEXT_SETNATIVEINDEX(fInputText, startPos);
   1.790 +            }
   1.791 +
   1.792 +            if (fPattern->fFlags & UREGEX_UNIX_LINES) {
   1.793 +                for (;;) {
   1.794 +                    if (c == 0x0a) {
   1.795 +                            MatchAt(startPos, FALSE, fDeferredStatus);
   1.796 +                            if (U_FAILURE(fDeferredStatus)) {
   1.797 +                                return FALSE;
   1.798 +                            }
   1.799 +                            if (fMatch) {
   1.800 +                                return TRUE;
   1.801 +                            }
   1.802 +                            UTEXT_SETNATIVEINDEX(fInputText, startPos);
   1.803 +                    }
   1.804 +                    if (startPos >= testStartLimit) {
   1.805 +                        fMatch = FALSE;
   1.806 +                        fHitEnd = TRUE;
   1.807 +                        return FALSE;
   1.808 +                    }
   1.809 +                    c = UTEXT_NEXT32(fInputText);
   1.810 +                    startPos = UTEXT_GETNATIVEINDEX(fInputText);
   1.811 +                    // Note that it's perfectly OK for a pattern to have a zero-length
   1.812 +                    //   match at the end of a string, so we must make sure that the loop
   1.813 +                    //   runs with startPos == testStartLimit the last time through.
   1.814 +		            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
   1.815 +                        return FALSE;
   1.816 +                }
   1.817 +            } else {
   1.818 +                for (;;) {
   1.819 +                    if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
   1.820 +                        ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) {
   1.821 +                            if (c == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
   1.822 +                                (void)UTEXT_NEXT32(fInputText);
   1.823 +                                startPos = UTEXT_GETNATIVEINDEX(fInputText);
   1.824 +                            }
   1.825 +                            MatchAt(startPos, FALSE, fDeferredStatus);
   1.826 +                            if (U_FAILURE(fDeferredStatus)) {
   1.827 +                                return FALSE;
   1.828 +                            }
   1.829 +                            if (fMatch) {
   1.830 +                                return TRUE;
   1.831 +                            }
   1.832 +                            UTEXT_SETNATIVEINDEX(fInputText, startPos);
   1.833 +                    }
   1.834 +                    if (startPos >= testStartLimit) {
   1.835 +                        fMatch = FALSE;
   1.836 +                        fHitEnd = TRUE;
   1.837 +                        return FALSE;
   1.838 +                    }
   1.839 +                    c = UTEXT_NEXT32(fInputText);
   1.840 +                    startPos = UTEXT_GETNATIVEINDEX(fInputText);
   1.841 +                    // Note that it's perfectly OK for a pattern to have a zero-length
   1.842 +                    //   match at the end of a string, so we must make sure that the loop
   1.843 +                    //   runs with startPos == testStartLimit the last time through.
   1.844 +		            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
   1.845 +                        return FALSE;
   1.846 +                }
   1.847 +            }
   1.848 +        }
   1.849 +
   1.850 +    default:
   1.851 +        U_ASSERT(FALSE);
   1.852 +    }
   1.853 +
   1.854 +    U_ASSERT(FALSE);
   1.855 +    return FALSE;
   1.856 +}
   1.857 +
   1.858 +
   1.859 +
   1.860 +UBool RegexMatcher::find(int64_t start, UErrorCode &status) {
   1.861 +    // Reset the matcher and look for the next match starting at native index 'start'.
   1.862 +    //   An incoming failure in 'status', or a deferred one, stops the call before the reset.
   1.863 +    if (U_FAILURE(status)) {
   1.864 +        return FALSE;
   1.865 +    }
   1.866 +    if (U_FAILURE(fDeferredStatus)) {
   1.867 +        status = fDeferredStatus;
   1.868 +        return FALSE;
   1.869 +    }
   1.870 +
   1.871 +    // Note:  the reset is specified by the Java Matcher documentation.  It also resets
   1.872 +    //        the region to the full input length, and it happens even for a bad 'start'.
   1.873 +    reset();
   1.874 +    const UBool startIsValid = (start >= 0) && (start >= fActiveStart) && (start <= fActiveLimit);
   1.875 +    if (!startIsValid) {
   1.876 +        status = U_INDEX_OUTOFBOUNDS_ERROR;
   1.877 +        return FALSE;
   1.878 +    }
   1.879 +    // Seed fMatchEnd with 'start', then run the ordinary find().
   1.880 +    fMatchEnd = start;
   1.881 +    return find();
   1.882 +}
   1.883 +
   1.884 +
   1.885 +//--------------------------------------------------------------------------------
   1.886 +//
   1.887 +//   findUsingChunk() -- like find(), but with the advance knowledge that the
   1.888 +//                       entire string is available in the UText's chunk buffer.
   1.889 +//                       Returns TRUE only when a match was found; errors are left in fDeferredStatus.
   1.890 +//--------------------------------------------------------------------------------
   1.891 +UBool RegexMatcher::findUsingChunk() {
   1.892 +    // Start at the position of the last match end.  (Will be zero if the
   1.893 +    //   matcher has been reset.)
   1.894 +    //
   1.895 +
   1.896 +    int32_t startPos = (int32_t)fMatchEnd;
   1.897 +    if (startPos==0) {                                  // fMatchEnd of 0: begin at fActiveStart instead
   1.898 +        startPos = (int32_t)fActiveStart;
   1.899 +    }
   1.900 +    
   1.901 +    const UChar *inputBuf = fInputText->chunkContents;  // UTF-16 chunk, indexed directly below
   1.902 +
   1.903 +    if (fMatch) {
   1.904 +        // Save the position of any previous successful match.
   1.905 +        fLastMatchEnd = fMatchEnd;
   1.906 +        
   1.907 +        if (fMatchStart == fMatchEnd) {
   1.908 +            // Previous match had zero length.  Move start position up one position
   1.909 +            //  to avoid sending find() into a loop on zero-length matches.
   1.910 +            if (startPos >= fActiveLimit) {
   1.911 +                fMatch = FALSE;
   1.912 +                fHitEnd = TRUE;
   1.913 +                return FALSE;
   1.914 +            }
   1.915 +            U16_FWD_1(inputBuf, startPos, fInputLength);  // NOTE(review): bounded by fInputLength here, fActiveLimit in the loops below -- confirm intended
   1.916 +        }
   1.917 +    } else {
   1.918 +        if (fLastMatchEnd >= 0) {
   1.919 +            // A previous find() failed to match.  Don't try again.
   1.920 +            //   (without this test, a pattern with a zero-length match
   1.921 +            //    could match again at the end of an input string.)
   1.922 +            fHitEnd = TRUE;
   1.923 +            return FALSE;
   1.924 +        }
   1.925 +    }
   1.926 +    
   1.927 +    
   1.928 +    // Compute the position in the input string beyond which a match can not begin, because
   1.929 +    //   the minimum length match would extend past the end of the input.
   1.930 +    //   Note:  some patterns that cannot match anything will have fMinMatchLen==Max Int.
   1.931 +    //          Be aware of possible overflows if making changes here.
   1.932 +    int32_t testLen  = (int32_t)(fActiveLimit - fPattern->fMinMatchLen);
   1.933 +    if (startPos > testLen) {
   1.934 +        fMatch = FALSE;
   1.935 +        fHitEnd = TRUE;
   1.936 +        return FALSE;
   1.937 +    }
   1.938 +    
   1.939 +    UChar32  c;
   1.940 +    U_ASSERT(startPos >= 0);
   1.941 +    
   1.942 +    switch (fPattern->fStartType) {                     // each case either returns or loops until it returns
   1.943 +    case START_NO_INFO:
   1.944 +        // No optimization was found. 
   1.945 +        //  Try a match at each input position.
   1.946 +        for (;;) {
   1.947 +            MatchChunkAt(startPos, FALSE, fDeferredStatus);
   1.948 +            if (U_FAILURE(fDeferredStatus)) {
   1.949 +                return FALSE;
   1.950 +            }
   1.951 +            if (fMatch) {
   1.952 +                return TRUE;
   1.953 +            }
   1.954 +            if (startPos >= testLen) {
   1.955 +                fHitEnd = TRUE;
   1.956 +                return FALSE;
   1.957 +            }
   1.958 +            U16_FWD_1(inputBuf, startPos, fActiveLimit);    // advance one code point
   1.959 +            // Note that it's perfectly OK for a pattern to have a zero-length
   1.960 +            //   match at the end of a string, so we must make sure that the loop
   1.961 +            //   runs with startPos == testLen the last time through.
   1.962 +            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))    // find-progress callback returned FALSE
   1.963 +                return FALSE;
   1.964 +        }
   1.965 +        U_ASSERT(FALSE);                                // not reached: the loop above exits only by returning
   1.966 +        
   1.967 +    case START_START:
   1.968 +        // Matches are only possible at the start of the input string
   1.969 +        //   (pattern begins with ^ or \A)
   1.970 +        if (startPos > fActiveStart) {
   1.971 +            fMatch = FALSE;
   1.972 +            return FALSE;
   1.973 +        }
   1.974 +        MatchChunkAt(startPos, FALSE, fDeferredStatus);
   1.975 +        if (U_FAILURE(fDeferredStatus)) {
   1.976 +            return FALSE;
   1.977 +        }
   1.978 +        return fMatch;
   1.979 +        
   1.980 +        
   1.981 +    case START_SET:
   1.982 +    {
   1.983 +        // Match may start on any char from a pre-computed set.
   1.984 +        U_ASSERT(fPattern->fMinMatchLen > 0);
   1.985 +        for (;;) {
   1.986 +            int32_t pos = startPos;                     // where this iteration's code point begins
   1.987 +            U16_NEXT(inputBuf, startPos, fActiveLimit, c);  // like c = inputBuf[startPos++];
   1.988 +            if ((c<256 && fPattern->fInitialChars8->contains(c)) ||
   1.989 +                (c>=256 && fPattern->fInitialChars->contains(c))) {
   1.990 +                MatchChunkAt(pos, FALSE, fDeferredStatus);
   1.991 +                if (U_FAILURE(fDeferredStatus)) {
   1.992 +                    return FALSE;
   1.993 +                }
   1.994 +                if (fMatch) {
   1.995 +                    return TRUE;
   1.996 +                }
   1.997 +            }
   1.998 +            if (pos >= testLen) {
   1.999 +                fMatch = FALSE;
  1.1000 +                fHitEnd = TRUE;
  1.1001 +                return FALSE;
  1.1002 +            }
  1.1003 +            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))    // find-progress callback returned FALSE
  1.1004 +                return FALSE;
  1.1005 +        }
  1.1006 +    }
  1.1007 +        U_ASSERT(FALSE);                                // not reached
  1.1008 +        
  1.1009 +    case START_STRING:
  1.1010 +    case START_CHAR:
  1.1011 +    {
  1.1012 +        // Match starts on exactly one char.
  1.1013 +        U_ASSERT(fPattern->fMinMatchLen > 0);
  1.1014 +        UChar32 theChar = fPattern->fInitialChar;
  1.1015 +        for (;;) {
  1.1016 +            int32_t pos = startPos;                     // where this iteration's code point begins
  1.1017 +            U16_NEXT(inputBuf, startPos, fActiveLimit, c);  // like c = inputBuf[startPos++];
  1.1018 +            if (c == theChar) {
  1.1019 +                MatchChunkAt(pos, FALSE, fDeferredStatus);
  1.1020 +                if (U_FAILURE(fDeferredStatus)) {
  1.1021 +                    return FALSE;
  1.1022 +                }
  1.1023 +                if (fMatch) {
  1.1024 +                    return TRUE;
  1.1025 +                }
  1.1026 +            }
  1.1027 +            if (pos >= testLen) {
  1.1028 +                fMatch = FALSE;
  1.1029 +                fHitEnd = TRUE;
  1.1030 +                return FALSE;
  1.1031 +            }
  1.1032 +            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))    // find-progress callback returned FALSE
  1.1033 +                return FALSE;
  1.1034 +        }
  1.1035 +    }
  1.1036 +        U_ASSERT(FALSE);                                // not reached
  1.1037 +        
  1.1038 +    case START_LINE:                                    // candidates: fAnchorStart, or a position right after a line terminator
  1.1039 +    {
  1.1040 +        UChar32  c;
  1.1041 +        if (startPos == fAnchorStart) {
  1.1042 +            MatchChunkAt(startPos, FALSE, fDeferredStatus);
  1.1043 +            if (U_FAILURE(fDeferredStatus)) {
  1.1044 +                return FALSE;
  1.1045 +            }
  1.1046 +            if (fMatch) {
  1.1047 +                return TRUE;
  1.1048 +            }
  1.1049 +            U16_FWD_1(inputBuf, startPos, fActiveLimit);
  1.1050 +        }
  1.1051 +        
  1.1052 +        if (fPattern->fFlags & UREGEX_UNIX_LINES) {     // only 0x0a is tested as a line terminator here
  1.1053 +            for (;;) {
  1.1054 +                c = inputBuf[startPos-1];               // code unit just before the candidate start
  1.1055 +                if (c == 0x0a) {
  1.1056 +                    MatchChunkAt(startPos, FALSE, fDeferredStatus);
  1.1057 +                    if (U_FAILURE(fDeferredStatus)) {
  1.1058 +                        return FALSE;
  1.1059 +                    }
  1.1060 +                    if (fMatch) {
  1.1061 +                        return TRUE;
  1.1062 +                    }
  1.1063 +                }
  1.1064 +                if (startPos >= testLen) {
  1.1065 +                    fMatch = FALSE;
  1.1066 +                    fHitEnd = TRUE;
  1.1067 +                    return FALSE;
  1.1068 +                }
  1.1069 +                U16_FWD_1(inputBuf, startPos, fActiveLimit);
  1.1070 +                // Note that it's perfectly OK for a pattern to have a zero-length
  1.1071 +                //   match at the end of a string, so we must make sure that the loop
  1.1072 +                //   runs with startPos == testLen the last time through.
  1.1073 +	            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))    // find-progress callback returned FALSE
  1.1074 +                    return FALSE;
  1.1075 +            }
  1.1076 +        } else {                                        // 0x0a-0x0d, 0x85, 0x2028 and 0x2029 are tested as line terminators
  1.1077 +            for (;;) {
  1.1078 +                c = inputBuf[startPos-1];               // code unit just before the candidate start
  1.1079 +                if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
  1.1080 +                    ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) {
  1.1081 +                    if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) {   // CR LF pair:
  1.1082 +                        startPos++;                                                             //   try the match after the LF
  1.1083 +                    }
  1.1084 +                    MatchChunkAt(startPos, FALSE, fDeferredStatus);
  1.1085 +                    if (U_FAILURE(fDeferredStatus)) {
  1.1086 +                        return FALSE;
  1.1087 +                    }
  1.1088 +                    if (fMatch) {
  1.1089 +                        return TRUE;
  1.1090 +                    }
  1.1091 +                }
  1.1092 +                if (startPos >= testLen) {
  1.1093 +                    fMatch = FALSE;
  1.1094 +                    fHitEnd = TRUE;
  1.1095 +                    return FALSE;
  1.1096 +                }
  1.1097 +                U16_FWD_1(inputBuf, startPos, fActiveLimit);
  1.1098 +                // Note that it's perfectly OK for a pattern to have a zero-length
  1.1099 +                //   match at the end of a string, so we must make sure that the loop
  1.1100 +                //   runs with startPos == testLen the last time through.
  1.1101 +	            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))    // find-progress callback returned FALSE
  1.1102 +                    return FALSE;
  1.1103 +            }
  1.1104 +        }
  1.1105 +    }
  1.1106 +        
  1.1107 +    default:                                            // unexpected start type (START_LINE never falls out of its loops)
  1.1108 +        U_ASSERT(FALSE);
  1.1109 +    }
  1.1110 +    
  1.1111 +    U_ASSERT(FALSE);
  1.1112 +    return FALSE;
  1.1113 +}
  1.1114 +
  1.1115 +
  1.1116 +
  1.1117 +//--------------------------------------------------------------------------------
  1.1118 +//
  1.1119 +//  group() -- text of the entire most recent match, i.e. capture group zero.
  1.1120 +//
  1.1121 +//--------------------------------------------------------------------------------
  1.1122 +UnicodeString RegexMatcher::group(UErrorCode &status) const {
  1.1123 +    return this->group(0, status);
  1.1124 +}
  1.1125 +
  1.1126 +//  Whole-match (group 0) form of the immutable shallow-clone group() overload.
  1.1127 +UText *RegexMatcher::group(UText *dest, int64_t &group_len, UErrorCode &status) const {
  1.1128 +    return this->group(0, dest, group_len, status);
  1.1129 +}
  1.1130 +
  1.1131 +//  Capture group as an immutable shallow clone of the input text (group 0 is the whole match).
  1.1132 +UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const {
  1.1133 +    group_len = 0;      // stays 0 unless a group that took part in the match is returned
  1.1134 +    if (U_FAILURE(status)) {
  1.1135 +        return dest;
  1.1136 +    }
  1.1137 +
  1.1138 +    // All three checks run, in this order, so when several fail the last one decides 'status'.
  1.1139 +    UBool unusable = FALSE;
  1.1140 +    if (U_FAILURE(fDeferredStatus)) {
  1.1141 +        status   = fDeferredStatus;
  1.1142 +        unusable = TRUE;
  1.1143 +    }
  1.1144 +    if (!fMatch) {
  1.1145 +        status   = U_REGEX_INVALID_STATE;
  1.1146 +        unusable = TRUE;
  1.1147 +    }
  1.1148 +    if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
  1.1149 +        status   = U_INDEX_OUTOFBOUNDS_ERROR;
  1.1150 +        unusable = TRUE;
  1.1151 +    }
  1.1152 +    if (unusable) {
  1.1153 +        if (dest != NULL) {
  1.1154 +            return dest;
  1.1155 +        }
  1.1156 +        return utext_openUChars(NULL, NULL, 0, &status);
  1.1157 +    }
  1.1158 +
  1.1159 +    // Native start/limit of the group: the match bounds for group 0, frame slots otherwise.
  1.1160 +    int64_t groupStart = fMatchStart;
  1.1161 +    int64_t groupLimit = fMatchEnd;
  1.1162 +    if (groupNum != 0) {
  1.1163 +        const int32_t slot = fPattern->fGroupMap->elementAti(groupNum-1);
  1.1164 +        U_ASSERT(slot < fPattern->fFrameSize);
  1.1165 +        U_ASSERT(slot >= 0);
  1.1166 +        groupStart = fFrame->fExtra[slot];
  1.1167 +        groupLimit = fFrame->fExtra[slot+1];
  1.1168 +    }
  1.1169 +    if (groupStart < 0) {
  1.1170 +        // A capture group wasn't part of the match: plain clone, group_len remains 0.
  1.1171 +        return utext_clone(dest, fInputText, FALSE, TRUE, &status);
  1.1172 +    }
  1.1173 +    U_ASSERT(groupStart <= groupLimit);
  1.1174 +    group_len = groupLimit - groupStart;
  1.1175 +    dest = utext_clone(dest, fInputText, FALSE, TRUE, &status);
  1.1176 +    if (dest != NULL) { UTEXT_SETNATIVEINDEX(dest, groupStart); }   // leave the clone positioned at the group start
  1.1177 +    return dest;
  1.1178 +}
  1.1179 +
  1.1180 +UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
  1.1181 +    // Wrap the result string in a stack UText so the UText flavor of group() can fill it.
  1.1182 +    UnicodeString groupString;
  1.1183 +    if (U_SUCCESS(status)) {
  1.1184 +        UText wrapper = UTEXT_INITIALIZER;
  1.1185 +        utext_openUnicodeString(&wrapper, &groupString, &status);
  1.1186 +        this->group(groupNum, &wrapper, status);
  1.1187 +        utext_close(&wrapper);
  1.1188 +    }
  1.1189 +    return groupString;
  1.1190 +}
  1.1191 +
  1.1192 +
  1.1193 +//  Deep-copy flavor of group(): the group's text goes into dest, or into a new mutable UText.
  1.1194 +//		A non-NULL dest has its contents replaced and is returned; with a NULL dest a UText is opened.
  1.1195 +//		Technology Preview (as an API); the UnicodeString group() API is implemented with it.
  1.1196 +UText *RegexMatcher::group(int32_t groupNum, UText *dest, UErrorCode &status) const {
  1.1197 +    if (U_FAILURE(status)) {
  1.1198 +        return dest;
  1.1199 +    }
  1.1200 +    // Validate matcher state.  Every check runs, so the last failing one decides 'status'.
  1.1201 +    UBool emptyResult = FALSE;
  1.1202 +    if (U_FAILURE(fDeferredStatus)) {
  1.1203 +        status      = fDeferredStatus;
  1.1204 +        emptyResult = TRUE;
  1.1205 +    }
  1.1206 +    if (!fMatch) {
  1.1207 +        status      = U_REGEX_INVALID_STATE;
  1.1208 +        emptyResult = TRUE;
  1.1209 +    }
  1.1210 +    if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
  1.1211 +        status      = U_INDEX_OUTOFBOUNDS_ERROR;
  1.1212 +        emptyResult = TRUE;
  1.1213 +    }
  1.1214 +
  1.1215 +    // Locate the group (only when the checks passed): the match bounds for group 0,
  1.1216 +    //   otherwise the pair of frame slots that the pattern's group map points at.
  1.1217 +    int64_t groupStart = 0;
  1.1218 +    int64_t groupLimit = 0;
  1.1219 +    if (!emptyResult) {
  1.1220 +        if (groupNum == 0) {
  1.1221 +            groupStart = fMatchStart;
  1.1222 +            groupLimit = fMatchEnd;
  1.1223 +        } else {
  1.1224 +            const int32_t slot = fPattern->fGroupMap->elementAti(groupNum-1);
  1.1225 +            U_ASSERT(slot < fPattern->fFrameSize);
  1.1226 +            U_ASSERT(slot >= 0);
  1.1227 +            groupStart = fFrame->fExtra[slot];
  1.1228 +            groupLimit = fFrame->fExtra[slot+1];
  1.1229 +        }
  1.1230 +        // A negative start marks a capture group that wasn't part of the match.
  1.1231 +        emptyResult = (UBool)(groupStart < 0);
  1.1232 +    }
  1.1233 +
  1.1234 +    if (emptyResult) {
  1.1235 +        // No group text to copy: open a zero-length UText, or run the replace on the caller's.
  1.1236 +        if (dest == NULL) {
  1.1237 +            return utext_openUChars(NULL, NULL, 0, &status);
  1.1238 +        }
  1.1239 +        utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
  1.1240 +        return dest;
  1.1241 +    }
  1.1242 +    U_ASSERT(groupStart <= groupLimit);
  1.1243 +
  1.1244 +    // Get the group's UTF-16 text: straight from the chunk when it holds the whole input,
  1.1245 +    //   otherwise extracted into a temporary heap buffer that is freed before returning.
  1.1246 +    const UChar *groupChars = NULL;
  1.1247 +    int64_t      groupLen16 = 0;
  1.1248 +    UChar       *extractBuf = NULL;
  1.1249 +    if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
  1.1250 +        U_ASSERT(groupLimit <= fInputLength);
  1.1251 +        groupChars = fInputText->chunkContents+groupStart;
  1.1252 +        groupLen16 = groupLimit-groupStart;
  1.1253 +    } else {
  1.1254 +        int32_t len16;
  1.1255 +        if (UTEXT_USES_U16(fInputText)) {
  1.1256 +            len16 = (int32_t)(groupLimit-groupStart);
  1.1257 +        } else {
  1.1258 +            // Preflight for the length only; whatever status it reports is discarded.
  1.1259 +            UErrorCode lengthStatus = U_ZERO_ERROR;
  1.1260 +            len16 = utext_extract(fInputText, groupStart, groupLimit, NULL, 0, &lengthStatus);
  1.1261 +        }
  1.1262 +        extractBuf = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
  1.1263 +        if (extractBuf == NULL) {
  1.1264 +            status = U_MEMORY_ALLOCATION_ERROR;
  1.1265 +            return dest;
  1.1266 +        }
  1.1267 +        utext_extract(fInputText, groupStart, groupLimit, extractBuf, len16+1, &status);
  1.1268 +        groupChars = extractBuf;
  1.1269 +        groupLen16 = len16;
  1.1270 +    }
  1.1271 +
  1.1272 +    if (dest != NULL) {
  1.1273 +        utext_replace(dest, 0, utext_nativeLength(dest), groupChars, (int32_t)groupLen16, &status);
  1.1274 +    } else {
  1.1275 +        UText groupText = UTEXT_INITIALIZER;
  1.1276 +        utext_openUChars(&groupText, groupChars, groupLen16, &status);
  1.1277 +        dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status);
  1.1278 +        utext_close(&groupText);
  1.1279 +    }
  1.1280 +    if (extractBuf != NULL) {
  1.1281 +        uprv_free(extractBuf);
  1.1282 +    }
  1.1283 +    return dest;
  1.1284 +}
  1.1285 +
  1.1286 +//--------------------------------------------------------------------------------
  1.1287 +//
  1.1288 +//  appendGroup() -- currently internal only; adds the text of a capture group to the end
  1.1289 +//                   of dest instead of overwriting it.  Returns utext_replace()'s value, or 0.
  1.1290 +//
  1.1291 +//--------------------------------------------------------------------------------
  1.1292 +
  1.1293 +int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const {
  1.1294 +    if (U_FAILURE(status)) {
  1.1295 +        return 0;
  1.1296 +    }
  1.1297 +    if (U_FAILURE(fDeferredStatus)) {
  1.1298 +        status = fDeferredStatus;
  1.1299 +        return 0;
  1.1300 +    }
  1.1301 +
  1.1302 +    // Every replace below targets the empty range [appendAt, appendAt) at the end of dest.
  1.1303 +    const int64_t appendAt = utext_nativeLength(dest);
  1.1304 +
  1.1305 +    if (!fMatch) {
  1.1306 +        status = U_REGEX_INVALID_STATE;
  1.1307 +        return utext_replace(dest, appendAt, appendAt, NULL, 0, &status);
  1.1308 +    }
  1.1309 +    if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
  1.1310 +        status = U_INDEX_OUTOFBOUNDS_ERROR;
  1.1311 +        return utext_replace(dest, appendAt, appendAt, NULL, 0, &status);
  1.1312 +    }
  1.1313 +
  1.1314 +    // Group 0 is the whole match; other groups are looked up through the group map.
  1.1315 +    int64_t groupStart = fMatchStart;
  1.1316 +    int64_t groupLimit = fMatchEnd;
  1.1317 +    if (groupNum != 0) {
  1.1318 +        const int32_t slot = fPattern->fGroupMap->elementAti(groupNum-1);
  1.1319 +        U_ASSERT(slot < fPattern->fFrameSize);
  1.1320 +        U_ASSERT(slot >= 0);
  1.1321 +        groupStart = fFrame->fExtra[slot];
  1.1322 +        groupLimit = fFrame->fExtra[slot+1];
  1.1323 +    }
  1.1324 +    if (groupStart < 0) {
  1.1325 +        // A capture group wasn't part of the match: append nothing.
  1.1326 +        return utext_replace(dest, appendAt, appendAt, NULL, 0, &status);
  1.1327 +    }
  1.1328 +    U_ASSERT(groupStart <= groupLimit);
  1.1329 +
  1.1330 +    if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
  1.1331 +        // The chunk holds the entire input, so append straight out of it.
  1.1332 +        U_ASSERT(groupLimit <= fInputLength);
  1.1333 +        return utext_replace(dest, appendAt, appendAt, fInputText->chunkContents+groupStart, (int32_t)(groupLimit-groupStart), &status);
  1.1334 +    }
  1.1335 +
  1.1336 +    // Otherwise copy the group into a temporary UTF-16 buffer first.
  1.1337 +    int32_t len16;
  1.1338 +    if (UTEXT_USES_U16(fInputText)) {
  1.1339 +        len16 = (int32_t)(groupLimit-groupStart);
  1.1340 +    } else {
  1.1341 +        UErrorCode lengthStatus = U_ZERO_ERROR;     // preflight status, never reported
  1.1342 +        len16 = utext_extract(fInputText, groupStart, groupLimit, NULL, 0, &lengthStatus);
  1.1343 +    }
  1.1344 +    UChar *scratch = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
  1.1345 +    if (scratch == NULL) {
  1.1346 +        status = U_MEMORY_ALLOCATION_ERROR;
  1.1347 +        return 0;
  1.1348 +    }
  1.1349 +    utext_extract(fInputText, groupStart, groupLimit, scratch, len16+1, &status);
  1.1350 +    const int64_t deltaLen = utext_replace(dest, appendAt, appendAt, scratch, len16, &status);
  1.1351 +    uprv_free(scratch);
  1.1352 +    return deltaLen;
  1.1353 +}
  1.1354 +
  1.1355 +
  1.1356 +
  1.1357 +//--------------------------------------------------------------------------------
  1.1358 +//
  1.1359 +//  groupCount() -- number of capture groups, read from the size of the pattern's group map.
  1.1360 +//
  1.1361 +//--------------------------------------------------------------------------------
  1.1362 +int32_t RegexMatcher::groupCount() const {
  1.1363 +    return this->fPattern->fGroupMap->size();
  1.1364 +}
  1.1365 +
  1.1366 +
  1.1367 +
  1.1368 +//--------------------------------------------------------------------------------
  1.1369 +//
  1.1370 +//  hasAnchoringBounds() -- reports the matcher's fAnchoringBounds flag.
  1.1371 +//
  1.1372 +//--------------------------------------------------------------------------------
  1.1373 +UBool RegexMatcher::hasAnchoringBounds() const {
  1.1374 +    return this->fAnchoringBounds;
  1.1375 +}
  1.1376 +
  1.1377 +
  1.1378 +//--------------------------------------------------------------------------------
  1.1379 +//
  1.1380 +//  hasTransparentBounds() -- reports the matcher's fTransparentBounds flag.
  1.1381 +//
  1.1382 +//--------------------------------------------------------------------------------
  1.1383 +UBool RegexMatcher::hasTransparentBounds() const {
  1.1384 +    return this->fTransparentBounds;
  1.1385 +}
  1.1386 +
  1.1387 +
  1.1388 +
  1.1389 +//--------------------------------------------------------------------------------
  1.1390 +//
  1.1391 +//  hitEnd() -- reports the fHitEnd flag that the find functions set.
  1.1392 +//
  1.1393 +//--------------------------------------------------------------------------------
  1.1394 +UBool RegexMatcher::hitEnd() const {
  1.1395 +    return this->fHitEnd;
  1.1396 +}
  1.1397 +
  1.1398 +
  1.1399 +//--------------------------------------------------------------------------------
  1.1400 +//
  1.1401 +//  input() -- the input as a UnicodeString; a copy is built on first use and cached in fInput.
  1.1402 +//
  1.1403 +//--------------------------------------------------------------------------------
  1.1404 +const UnicodeString &RegexMatcher::input() const {
  1.1405 +    if (fInput) {
  1.1406 +        return *fInput;                 // already built by an earlier call, or set elsewhere
  1.1407 +    }
  1.1408 +    // No cached string yet: copy the contents of the input UText into a new UnicodeString.
  1.1409 +    UErrorCode extractStatus = U_ZERO_ERROR;
  1.1410 +    int32_t    length16      = (int32_t)fInputLength;
  1.1411 +    if (!UTEXT_USES_U16(fInputText)) {
  1.1412 +        // Preflight for the UTF-16 length, then clear the overflow status it leaves behind.
  1.1413 +        length16      = utext_extract(fInputText, 0, fInputLength, NULL, 0, &extractStatus);
  1.1414 +        extractStatus = U_ZERO_ERROR;
  1.1415 +    }
  1.1416 +    // NOTE(review): the pointer returned by new is used unchecked -- confirm this is acceptable.
  1.1417 +    UnicodeString *copy = new UnicodeString(length16, 0, 0);
  1.1418 +    UChar *buffer = copy->getBuffer(length16);
  1.1419 +    utext_extract(fInputText, 0, fInputLength, buffer, length16, &extractStatus); // unterminated warning
  1.1420 +    copy->releaseBuffer(length16);
  1.1421 +    // Pointer assignment (not UnicodeString::operator=), made through a cast in this const method.
  1.1422 +    (*(const UnicodeString **)&fInput) = copy;
  1.1423 +    return *fInput;
  1.1424 +}
  1.1425 +
  1.1426 +//--------------------------------------------------------------------------------
  1.1427 +//
  1.1428 +//  inputText() -- returns the matcher's own input UText pointer; no clone is made.
  1.1429 +//
  1.1430 +//--------------------------------------------------------------------------------
  1.1431 +UText *RegexMatcher::inputText() const {
  1.1432 +    return this->fInputText;
  1.1433 +}
  1.1434 +
  1.1435 +
  1.1436 +//--------------------------------------------------------------------------------
  1.1437 +//
  1.1438 +//  getInput() -- like inputText(), but makes a clone or copies into another UText.
  1.1439 +//                dest == NULL:  returns a shallow, read-only clone of the input text.
  1.1440 +//                dest != NULL:  replaces dest's contents with the input and returns dest.
  1.1441 +//                status is set to U_MEMORY_ALLOCATION_ERROR if the temporary copy buffer can't be allocated.
  1.1442 +//
  1.1443 +//--------------------------------------------------------------------------------
  1.1444 +UText *RegexMatcher::getInput (UText *dest, UErrorCode &status) const {
  1.1445 +    if (U_FAILURE(status)) {
  1.1446 +        return dest;
  1.1447 +    }
  1.1448 +    if (U_FAILURE(fDeferredStatus)) {
  1.1449 +        // Deferred error: hand it to the caller through status, making the same utext calls as before.
  1.1450 +        status = fDeferredStatus;
  1.1451 +        if (dest == NULL) {
  1.1452 +            return utext_clone(NULL, fInputText, FALSE, TRUE, &status);
  1.1453 +        }
  1.1454 +        utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
  1.1455 +        return dest;
  1.1456 +    }
  1.1457 +    if (dest == NULL) {
  1.1458 +        return utext_clone(NULL, fInputText, FALSE, TRUE, &status);
  1.1459 +    }
  1.1460 +
  1.1461 +    if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
  1.1462 +        // The whole input is in the chunk buffer; copy directly from it.
  1.1463 +        utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents, (int32_t)fInputLength, &status);
  1.1464 +        return dest;
  1.1465 +    }
  1.1466 +
  1.1467 +    int32_t input16Len;
  1.1468 +    if (UTEXT_USES_U16(fInputText)) {
  1.1469 +        input16Len = (int32_t)fInputLength;
  1.1470 +    } else {
  1.1471 +        UErrorCode lengthStatus = U_ZERO_ERROR;
  1.1472 +        input16Len = utext_extract(fInputText, 0, fInputLength, NULL, 0, &lengthStatus); // buffer overflow error
  1.1473 +    }
  1.1474 +    UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(input16Len));
  1.1475 +    if (inputChars == NULL) {
  1.1476 +        // Report the failure, as group() and appendGroup() do, instead of returning silently.
  1.1477 +        status = U_MEMORY_ALLOCATION_ERROR;
  1.1478 +        return dest;
  1.1479 +    }
  1.1480 +
  1.1481 +    status = U_ZERO_ERROR;
  1.1482 +    utext_extract(fInputText, 0, fInputLength, inputChars, input16Len, &status); // not terminated warning
  1.1483 +    status = U_ZERO_ERROR;
  1.1484 +    utext_replace(dest, 0, utext_nativeLength(dest), inputChars, input16Len, &status);
  1.1485 +    uprv_free(inputChars);
  1.1486 +    return dest;
  1.1487 +}
  1.1488 +
  1.1489 +
  1.1490 +static UBool compat_SyncMutableUTextContents(UText *ut);   // forward declaration of the file-local helper defined next
  1.1491 +static UBool compat_SyncMutableUTextContents(UText *ut) {   // refreshes ut's chunk fields from its UnicodeString context; returns TRUE if it changed them
  1.1492 +    UBool retVal = FALSE;
  1.1493 +    
  1.1494 +    //  In the following test, we're really only interested in whether the UText should switch
  1.1495 +    //  between heap and stack allocation.  If length hasn't changed, we won't, so the chunkContents
  1.1496 +    //  will still point to the correct data.
  1.1497 +    if (utext_nativeLength(ut) != ut->nativeIndexingLimit) {
  1.1498 +        UnicodeString *us=(UnicodeString *)ut->context;   // assumes ut was opened over a UnicodeString; callers in this chunk guard the call with fInputUniStrMaybeMutable
  1.1499 +    
  1.1500 +        // Update to the latest length.
  1.1501 +        // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit).
  1.1502 +        int32_t newLength = us->length();
  1.1503 +    
  1.1504 +        // Update the chunk description.
  1.1505 +        // The buffer may have switched between stack- and heap-based.
  1.1506 +        ut->chunkContents    = us->getBuffer();
  1.1507 +        ut->chunkLength      = newLength;
  1.1508 +        ut->chunkNativeLimit = newLength;
  1.1509 +        ut->nativeIndexingLimit = newLength;
  1.1510 +        retVal = TRUE;   // tell the caller the cached description was stale
  1.1511 +    }
  1.1512 +
  1.1513 +    return retVal;
  1.1514 +}
  1.1515 +
  1.1516 +//--------------------------------------------------------------------------------
  1.1517 +//
  1.1518 +//  lookingAt() -- runs one match attempt starting at fActiveStart and returns fMatch
  1.1519 +//
  1.1520 +//--------------------------------------------------------------------------------
  1.1521 +UBool RegexMatcher::lookingAt(UErrorCode &status) {
  1.1522 +    if (U_FAILURE(status)) {
  1.1523 +        return FALSE;
  1.1524 +    }
  1.1525 +    if (U_FAILURE(fDeferredStatus)) {   // report an error stored on the matcher by an earlier call
  1.1526 +        status = fDeferredStatus;
  1.1527 +        return FALSE;
  1.1528 +    }
  1.1529 +    
  1.1530 +    if (fInputUniStrMaybeMutable) {   // flag is set by reset(const UnicodeString &)
  1.1531 +        if (compat_SyncMutableUTextContents(fInputText)) {   // TRUE when the UText's length fields had to be refreshed
  1.1532 +        fInputLength = utext_nativeLength(fInputText);
  1.1533 +        reset();   // full reset: the region becomes the whole (new) input
  1.1534 +        }   // NOTE(review): when the flag is set but nothing changed, neither reset() nor resetPreserveRegion() runs
  1.1535 +    }
  1.1536 +    else {
  1.1537 +        resetPreserveRegion();   // clear match state, keep the region
  1.1538 +    }
  1.1539 +    if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
  1.1540 +        MatchChunkAt((int32_t)fActiveStart, FALSE, status);   // second argument is TRUE in matches(); presumably a "must match to end" flag -- confirm against MatchChunkAt
  1.1541 +    } else {
  1.1542 +        MatchAt(fActiveStart, FALSE, status);
  1.1543 +    }
  1.1544 +    return fMatch;
  1.1545 +}
  1.1546 +
  1.1547 +
  1.1548 +UBool RegexMatcher::lookingAt(int64_t start, UErrorCode &status) {   // like lookingAt(status), but after a full reset() and starting at `start`
  1.1549 +    if (U_FAILURE(status)) {
  1.1550 +        return FALSE;
  1.1551 +    }
  1.1552 +    if (U_FAILURE(fDeferredStatus)) {
  1.1553 +        status = fDeferredStatus;
  1.1554 +        return FALSE;
  1.1555 +    }
  1.1556 +    reset();   // runs before `start` is validated, so any previous region is dropped even on error
  1.1557 +    
  1.1558 +    if (start < 0) {
  1.1559 +        status = U_INDEX_OUTOFBOUNDS_ERROR;
  1.1560 +        return FALSE;
  1.1561 +    }
  1.1562 +    
  1.1563 +    if (fInputUniStrMaybeMutable) {
  1.1564 +        if (compat_SyncMutableUTextContents(fInputText)) {   // input string length changed since the UText was set up
  1.1565 +        fInputLength = utext_nativeLength(fInputText);
  1.1566 +        reset();   // redo the reset with the updated fInputLength
  1.1567 +        }
  1.1568 +    }
  1.1569 +
  1.1570 +    int64_t nativeStart;
  1.1571 +    nativeStart = start;
  1.1572 +    if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {   // after reset() these bounds are 0 and fInputLength
  1.1573 +        status = U_INDEX_OUTOFBOUNDS_ERROR;
  1.1574 +        return FALSE;
  1.1575 +    }
  1.1576 +    
  1.1577 +    if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
  1.1578 +        MatchChunkAt((int32_t)nativeStart, FALSE, status);
  1.1579 +    } else {
  1.1580 +        MatchAt(nativeStart, FALSE, status);
  1.1581 +    }
  1.1582 +    return fMatch;
  1.1583 +}
  1.1584 +
  1.1585 +
  1.1586 +
  1.1587 +//--------------------------------------------------------------------------------
  1.1588 +//
  1.1589 +//  matches() -- same flow as lookingAt(), but calls MatchAt/MatchChunkAt with TRUE as the second argument; returns fMatch
  1.1590 +//
  1.1591 +//--------------------------------------------------------------------------------
  1.1592 +UBool RegexMatcher::matches(UErrorCode &status) {
  1.1593 +    if (U_FAILURE(status)) {
  1.1594 +        return FALSE;
  1.1595 +    }
  1.1596 +    if (U_FAILURE(fDeferredStatus)) {   // report an error stored on the matcher by an earlier call
  1.1597 +        status = fDeferredStatus;
  1.1598 +        return FALSE;
  1.1599 +    }
  1.1600 +
  1.1601 +    if (fInputUniStrMaybeMutable) {   // flag is set by reset(const UnicodeString &)
  1.1602 +        if (compat_SyncMutableUTextContents(fInputText)) {   // TRUE when the UText's length fields had to be refreshed
  1.1603 +        fInputLength = utext_nativeLength(fInputText);
  1.1604 +        reset();   // full reset: the region becomes the whole (new) input
  1.1605 +        }
  1.1606 +    }
  1.1607 +    else {
  1.1608 +        resetPreserveRegion();   // clear match state, keep the region
  1.1609 +    }
  1.1610 +
  1.1611 +    if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
  1.1612 +        MatchChunkAt((int32_t)fActiveStart, TRUE, status);   // TRUE here vs FALSE in lookingAt(); presumably "must match to end" -- confirm against MatchChunkAt
  1.1613 +    } else {
  1.1614 +        MatchAt(fActiveStart, TRUE, status);
  1.1615 +    }
  1.1616 +    return fMatch;
  1.1617 +}
  1.1618 +
  1.1619 +
  1.1620 +UBool RegexMatcher::matches(int64_t start, UErrorCode &status) {   // like matches(status), but after a full reset() and starting at `start`
  1.1621 +    if (U_FAILURE(status)) {
  1.1622 +        return FALSE;
  1.1623 +    }
  1.1624 +    if (U_FAILURE(fDeferredStatus)) {
  1.1625 +        status = fDeferredStatus;
  1.1626 +        return FALSE;
  1.1627 +    }
  1.1628 +    reset();   // runs before `start` is validated, so any previous region is dropped even on error
  1.1629 +    
  1.1630 +    if (start < 0) {
  1.1631 +        status = U_INDEX_OUTOFBOUNDS_ERROR;
  1.1632 +        return FALSE;
  1.1633 +    }
  1.1634 +
  1.1635 +    if (fInputUniStrMaybeMutable) {
  1.1636 +        if (compat_SyncMutableUTextContents(fInputText)) {   // input string length changed since the UText was set up
  1.1637 +        fInputLength = utext_nativeLength(fInputText);
  1.1638 +        reset();   // redo the reset with the updated fInputLength
  1.1639 +        }
  1.1640 +    }
  1.1641 +
  1.1642 +    int64_t nativeStart;
  1.1643 +    nativeStart = start;
  1.1644 +    if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {   // after reset() these bounds are 0 and fInputLength
  1.1645 +        status = U_INDEX_OUTOFBOUNDS_ERROR;
  1.1646 +        return FALSE;
  1.1647 +    }
  1.1648 +
  1.1649 +    if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
  1.1650 +        MatchChunkAt((int32_t)nativeStart, TRUE, status);
  1.1651 +    } else {
  1.1652 +        MatchAt(nativeStart, TRUE, status);
  1.1653 +    }
  1.1654 +    return fMatch;
  1.1655 +}
  1.1656 +
  1.1657 +
  1.1658 +
  1.1659 +//--------------------------------------------------------------------------------
  1.1660 +//
  1.1661 +//    pattern -- returns a reference to the RegexPattern this matcher points at (fPattern)
  1.1662 +//
  1.1663 +//--------------------------------------------------------------------------------
  1.1664 +const RegexPattern &RegexMatcher::pattern() const {
  1.1665 +    return *fPattern;   // dereferenced unconditionally
  1.1666 +}
  1.1667 +
  1.1668 +
  1.1669 +
  1.1670 +//--------------------------------------------------------------------------------
  1.1671 +//
  1.1672 +//    region -- sets the region and active bounds to regionStart/regionLimit.  A startIndex of -1 triggers a full reset(); any other value keeps match state reset only and stores startIndex in fMatchEnd.
  1.1673 +//
  1.1674 +//--------------------------------------------------------------------------------
  1.1675 +RegexMatcher &RegexMatcher::region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status) {
  1.1676 +    if (U_FAILURE(status)) {
  1.1677 +        return *this;
  1.1678 +    }
  1.1679 +    
  1.1680 +    if (regionStart>regionLimit || regionStart<0 || regionLimit<0) {
  1.1681 +        status = U_ILLEGAL_ARGUMENT_ERROR;   // NOTE(review): no return here; the fields below are still overwritten with the invalid values
  1.1682 +    }
  1.1683 +        
  1.1684 +    int64_t nativeStart = regionStart;
  1.1685 +    int64_t nativeLimit = regionLimit;
  1.1686 +    if (nativeStart > fInputLength || nativeLimit > fInputLength) {
  1.1687 +      status = U_ILLEGAL_ARGUMENT_ERROR;   // likewise recorded without returning
  1.1688 +    }
  1.1689 +
  1.1690 +    if (startIndex == -1)
  1.1691 +      this->reset();   // the 3-argument overload passes -1
  1.1692 +    else
  1.1693 +      resetPreserveRegion();    
  1.1694 +    
  1.1695 +    fRegionStart = nativeStart;
  1.1696 +    fRegionLimit = nativeLimit;
  1.1697 +    fActiveStart = nativeStart;
  1.1698 +    fActiveLimit = nativeLimit;
  1.1699 +
  1.1700 +    if (startIndex != -1) {
  1.1701 +      if (startIndex < fActiveStart || startIndex > fActiveLimit) {
  1.1702 +          status = U_INDEX_OUTOFBOUNDS_ERROR;
  1.1703 +      }
  1.1704 +      fMatchEnd = startIndex;  // stored even when the range check above failed
  1.1705 +    }
  1.1706 +
  1.1707 +    if (!fTransparentBounds) {   // look bounds follow the region only when bounds are not transparent
  1.1708 +        fLookStart = nativeStart;
  1.1709 +        fLookLimit = nativeLimit;
  1.1710 +    }
  1.1711 +    if (fAnchoringBounds) {   // anchor bounds follow the region only when anchoring bounds are enabled
  1.1712 +        fAnchorStart = nativeStart;
  1.1713 +        fAnchorLimit = nativeLimit;
  1.1714 +    }
  1.1715 +    return *this;
  1.1716 +}
  1.1717 +
  1.1718 +RegexMatcher &RegexMatcher::region(int64_t start, int64_t limit, UErrorCode &status) {   // convenience overload without a start index
  1.1719 +  return region(start, limit, -1, status);   // -1 makes the 4-argument overload call reset()
  1.1720 +}
  1.1721 +
  1.1722 +//--------------------------------------------------------------------------------
  1.1723 +//
  1.1724 +//    regionEnd -- returns fRegionLimit, cast to int32_t in the 32-bit variant
  1.1725 +//
  1.1726 +//--------------------------------------------------------------------------------
  1.1727 +int32_t RegexMatcher::regionEnd() const {
  1.1728 +    return (int32_t)fRegionLimit;   // plain cast of the 64-bit value; no range check
  1.1729 +}
  1.1730 +
  1.1731 +int64_t RegexMatcher::regionEnd64() const {   // 64-bit variant of regionEnd()
  1.1732 +    return fRegionLimit;
  1.1733 +}
  1.1734 +
  1.1735 +//--------------------------------------------------------------------------------
  1.1736 +//
  1.1737 +//    regionStart -- returns fRegionStart, cast to int32_t in the 32-bit variant
  1.1738 +//
  1.1739 +//--------------------------------------------------------------------------------
  1.1740 +int32_t RegexMatcher::regionStart() const {
  1.1741 +    return (int32_t)fRegionStart;   // plain cast of the 64-bit value; no range check
  1.1742 +}
  1.1743 +
  1.1744 +int64_t RegexMatcher::regionStart64() const {   // 64-bit variant of regionStart()
  1.1745 +    return fRegionStart;
  1.1746 +}
  1.1747 +
  1.1748 +
  1.1749 +//--------------------------------------------------------------------------------
  1.1750 +//
  1.1751 +//    replaceAll -- UnicodeString front end: wraps the arguments in stack UTexts and delegates to the UText overload
  1.1752 +//
  1.1753 +//--------------------------------------------------------------------------------
  1.1754 +UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) {
  1.1755 +    UText replacementText = UTEXT_INITIALIZER;
  1.1756 +    UText resultText = UTEXT_INITIALIZER;
  1.1757 +    UnicodeString resultString;   // returned by value; stays empty if status is already a failure
  1.1758 +    if (U_FAILURE(status)) {
  1.1759 +        return resultString;
  1.1760 +    }
  1.1761 +    
  1.1762 +    utext_openConstUnicodeString(&replacementText, &replacement, &status);
  1.1763 +    utext_openUnicodeString(&resultText, &resultString, &status);   // resultText wraps resultString, which is what gets returned
  1.1764 +        
  1.1765 +    replaceAll(&replacementText, &resultText, status);
  1.1766 +
  1.1767 +    utext_close(&resultText);
  1.1768 +    utext_close(&replacementText);
  1.1769 +    
  1.1770 +    return resultString;
  1.1771 +}
  1.1772 +
  1.1773 +
  1.1774 +//
  1.1775 +//    replaceAll, UText mode -- resets the matcher, calls appendReplacement() for every find(), then appendTail(); returns dest (created here when NULL is passed)
  1.1776 +//
  1.1777 +UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &status) {
  1.1778 +    if (U_FAILURE(status)) {
  1.1779 +        return dest;
  1.1780 +    }
  1.1781 +    if (U_FAILURE(fDeferredStatus)) {   // report an error stored on the matcher by an earlier call
  1.1782 +        status = fDeferredStatus;
  1.1783 +        return dest;
  1.1784 +    }
  1.1785 +    
  1.1786 +    if (dest == NULL) {   // no destination supplied: make one by cloning an empty UnicodeString-backed UText
  1.1787 +        UnicodeString emptyString;
  1.1788 +        UText empty = UTEXT_INITIALIZER;
  1.1789 +        
  1.1790 +        utext_openUnicodeString(&empty, &emptyString, &status);
  1.1791 +        dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);   // flags are the reverse of the (FALSE, TRUE) clones used elsewhere in this file; presumably the caller must close the result -- confirm against the API docs
  1.1792 +        utext_close(&empty);
  1.1793 +    }
  1.1794 +
  1.1795 +    if (U_SUCCESS(status)) {
  1.1796 +        reset();   // full reset: region becomes the whole input
  1.1797 +        while (find()) {
  1.1798 +            appendReplacement(dest, replacement, status);
  1.1799 +            if (U_FAILURE(status)) {
  1.1800 +                break;
  1.1801 +            }
  1.1802 +        }
  1.1803 +        appendTail(dest, status);   // also reached after a failed appendReplacement()
  1.1804 +    }
  1.1805 +    
  1.1806 +    return dest;
  1.1807 +}
  1.1808 +
  1.1809 +
  1.1810 +//--------------------------------------------------------------------------------
  1.1811 +//
  1.1812 +//    replaceFirst -- UnicodeString front end: wraps the arguments in stack UTexts and delegates to the UText overload
  1.1813 +//
  1.1814 +//--------------------------------------------------------------------------------
  1.1815 +UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) {
  1.1816 +    UText replacementText = UTEXT_INITIALIZER;
  1.1817 +    UText resultText = UTEXT_INITIALIZER;
  1.1818 +    UnicodeString resultString;   // NOTE(review): unlike replaceAll(const UnicodeString &, ...), there is no early return on U_FAILURE(status) here
  1.1819 +    
  1.1820 +    utext_openConstUnicodeString(&replacementText, &replacement, &status);
  1.1821 +    utext_openUnicodeString(&resultText, &resultString, &status);   // resultText wraps resultString, which is what gets returned
  1.1822 +    
  1.1823 +    replaceFirst(&replacementText, &resultText, status);
  1.1824 +    
  1.1825 +    utext_close(&resultText);
  1.1826 +    utext_close(&replacementText);
  1.1827 +    
  1.1828 +    return resultString;
  1.1829 +}
  1.1830 +
  1.1831 +//
  1.1832 +//    replaceFirst, UText mode -- resets the matcher and replaces only the first find(); with no match the result of getInput(dest, status) is returned
  1.1833 +//
  1.1834 +UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &status) {
  1.1835 +    if (U_FAILURE(status)) {
  1.1836 +        return dest;
  1.1837 +    }
  1.1838 +    if (U_FAILURE(fDeferredStatus)) {   // report an error stored on the matcher by an earlier call
  1.1839 +        status = fDeferredStatus;
  1.1840 +        return dest;
  1.1841 +    }
  1.1842 +
  1.1843 +    reset();   // full reset: region becomes the whole input
  1.1844 +    if (!find()) {
  1.1845 +        return getInput(dest, status);   // no match: hand back the input text itself
  1.1846 +    }
  1.1847 +    
  1.1848 +    if (dest == NULL) {   // no destination supplied: make one by cloning an empty UnicodeString-backed UText
  1.1849 +        UnicodeString emptyString;
  1.1850 +        UText empty = UTEXT_INITIALIZER;
  1.1851 +        
  1.1852 +        utext_openUnicodeString(&empty, &emptyString, &status);
  1.1853 +        dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);   // same construction as in replaceAll(UText *, UText *, ...)
  1.1854 +        utext_close(&empty);
  1.1855 +    }
  1.1856 +    
  1.1857 +    appendReplacement(dest, replacement, status);
  1.1858 +    appendTail(dest, status);
  1.1859 +    
  1.1860 +    return dest;
  1.1861 +}
  1.1862 +
  1.1863 +
  1.1864 +//--------------------------------------------------------------------------------
  1.1865 +//
  1.1866 +//     requireEnd -- returns the fRequireEnd flag (cleared by resetPreserveRegion())
  1.1867 +//
  1.1868 +//--------------------------------------------------------------------------------
  1.1869 +UBool RegexMatcher::requireEnd() const {
  1.1870 +    return fRequireEnd;
  1.1871 +}
  1.1872 +
  1.1873 +
  1.1874 +//--------------------------------------------------------------------------------
  1.1875 +//
  1.1876 +//     reset -- sets the region, active, anchor and look bounds to [0, fInputLength], then clears match state via resetPreserveRegion()
  1.1877 +//
  1.1878 +//--------------------------------------------------------------------------------
  1.1879 +RegexMatcher &RegexMatcher::reset() {
  1.1880 +    fRegionStart    = 0;
  1.1881 +    fRegionLimit    = fInputLength;
  1.1882 +    fActiveStart    = 0;
  1.1883 +    fActiveLimit    = fInputLength;
  1.1884 +    fAnchorStart    = 0;
  1.1885 +    fAnchorLimit    = fInputLength;
  1.1886 +    fLookStart      = 0;
  1.1887 +    fLookLimit      = fInputLength;
  1.1888 +    resetPreserveRegion();   // match state is cleared after the bounds are widened
  1.1889 +    return *this;
  1.1890 +}
  1.1891 +
  1.1892 +
  1.1893 +
  1.1894 +void RegexMatcher::resetPreserveRegion() {   // clears per-match state only; region/active/anchor/look bounds are not touched here
  1.1895 +    fMatchStart     = 0;
  1.1896 +    fMatchEnd       = 0;
  1.1897 +    fLastMatchEnd   = -1;   // -1 rather than 0, unlike the other positions
  1.1898 +    fAppendPosition = 0;
  1.1899 +    fMatch          = FALSE;
  1.1900 +    fHitEnd         = FALSE;
  1.1901 +    fRequireEnd     = FALSE;
  1.1902 +    fTime           = 0;
  1.1903 +    fTickCounter    = TIMER_INITIAL_VALUE;
  1.1904 +    //resetStack(); // more expensive than it looks...
  1.1905 +}
  1.1906 +
  1.1907 +
  1.1908 +RegexMatcher &RegexMatcher::reset(const UnicodeString &input) {   // switches the matcher to a new UnicodeString input; errors go to fDeferredStatus
  1.1909 +    fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStatus);   // the existing fInputText is passed in as the UText to (re)open
  1.1910 +    if (fPattern->fNeedsAltInput) {
  1.1911 +        fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
  1.1912 +    }
  1.1913 +    fInputLength = utext_nativeLength(fInputText);
  1.1914 +    
  1.1915 +    reset();   // uses the fInputLength just computed
  1.1916 +    delete fInput;   // drop the UnicodeString copy cached by input(), if any
  1.1917 +    fInput = NULL;
  1.1918 +
  1.1919 +    //  Do the following for any UnicodeString.
  1.1920 +    //  This is for compatibility for those clients who modify the input string "live" during regex operations.
  1.1921 +    fInputUniStrMaybeMutable = TRUE;    
  1.1922 +    
  1.1923 +    if (fWordBreakItr != NULL) {
  1.1924 +#if UCONFIG_NO_BREAK_ITERATION==0
  1.1925 +        UErrorCode status = U_ZERO_ERROR;   // local status; the setText() outcome is not checked
  1.1926 +        fWordBreakItr->setText(fInputText, status);
  1.1927 +#endif
  1.1928 +    }
  1.1929 +    return *this;
  1.1930 +}
  1.1931 +
  1.1932 +
  1.1933 +RegexMatcher &RegexMatcher::reset(UText *input) {   // switches the matcher to a new UText input; errors go to fDeferredStatus
  1.1934 +    if (fInputText != input) {   // re-clone only when handed a different UText object
  1.1935 +        fInputText = utext_clone(fInputText, input, FALSE, TRUE, &fDeferredStatus);
  1.1936 +        if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
  1.1937 +        fInputLength = utext_nativeLength(fInputText);
  1.1938 +    
  1.1939 +        delete fInput;   // drop the UnicodeString copy cached by input(), if any
  1.1940 +        fInput = NULL;
  1.1941 +        
  1.1942 +        if (fWordBreakItr != NULL) {
  1.1943 +#if UCONFIG_NO_BREAK_ITERATION==0
  1.1944 +            UErrorCode status = U_ZERO_ERROR;   // local status; the setText() outcome is not checked
  1.1945 +            fWordBreakItr->setText(input, status);   // NOTE(review): passes the caller's `input`, whereas reset(const UnicodeString &) passes fInputText
  1.1946 +#endif
  1.1947 +        }
  1.1948 +    }
  1.1949 +    reset();   // always runs, even when the same UText was passed again
  1.1950 +    fInputUniStrMaybeMutable = FALSE;
  1.1951 +
  1.1952 +    return *this;
  1.1953 +}
  1.1954 +
  1.1955 +/*RegexMatcher &RegexMatcher::reset(const UChar *) {
  1.1956 +    fDeferredStatus = U_INTERNAL_PROGRAM_ERROR;
  1.1957 +    return *this;
  1.1958 +}*/
  1.1959 +
  1.1960 +RegexMatcher &RegexMatcher::reset(int64_t position, UErrorCode &status) {   // full reset, then stores `position` in fMatchEnd
  1.1961 +    if (U_FAILURE(status)) {
  1.1962 +        return *this;
  1.1963 +    }
  1.1964 +    reset();       // Reset also resets the region to be the entire string.
  1.1965 +    
  1.1966 +    if (position < 0 || position > fActiveLimit) {   // fActiveLimit equals fInputLength after the reset() above
  1.1967 +        status = U_INDEX_OUTOFBOUNDS_ERROR;
  1.1968 +        return *this;   // the reset() above has already taken effect
  1.1969 +    }
  1.1970 +    fMatchEnd = position;
  1.1971 +    return *this;
  1.1972 +}
  1.1973 +
  1.1974 +
  1.1975 +//--------------------------------------------------------------------------------
  1.1976 +//
  1.1977 +//    refreshInputText -- re-clones fInputText (and fAltInputText, if present) from `input`, keeping each one's current native index; `input` must have the same native length as the current text
  1.1978 +//
  1.1979 +//--------------------------------------------------------------------------------
  1.1980 +RegexMatcher &RegexMatcher::refreshInputText(UText *input, UErrorCode &status) {
  1.1981 +    if (U_FAILURE(status)) {
  1.1982 +        return *this;
  1.1983 +    }
  1.1984 +    if (input == NULL) {
  1.1985 +        status = U_ILLEGAL_ARGUMENT_ERROR;
  1.1986 +        return *this;
  1.1987 +    }
  1.1988 +    if (utext_nativeLength(fInputText) != utext_nativeLength(input)) {   // only the lengths are compared, not the contents
  1.1989 +        status = U_ILLEGAL_ARGUMENT_ERROR;
  1.1990 +        return *this;
  1.1991 +    }
  1.1992 +    int64_t  pos = utext_getNativeIndex(fInputText);   // remember the position so it can be restored after the clone
  1.1993 +    //  Shallow read-only clone of the new UText into the existing input UText
  1.1994 +    fInputText = utext_clone(fInputText, input, FALSE, TRUE, &status);
  1.1995 +    if (U_FAILURE(status)) {
  1.1996 +        return *this;
  1.1997 +    }
  1.1998 +    utext_setNativeIndex(fInputText, pos);
  1.1999 +
  1.2000 +    if (fAltInputText != NULL) {   // same treatment for the alternate input, when one exists
  1.2001 +        pos = utext_getNativeIndex(fAltInputText);
  1.2002 +        fAltInputText = utext_clone(fAltInputText, input, FALSE, TRUE, &status);
  1.2003 +        if (U_FAILURE(status)) {
  1.2004 +            return *this;
  1.2005 +        }
  1.2006 +        utext_setNativeIndex(fAltInputText, pos);
  1.2007 +    }
  1.2008 +    return *this;
  1.2009 +}
  1.2010 +
  1.2011 +
  1.2012 +
  1.2013 +//--------------------------------------------------------------------------------
  1.2014 +//
  1.2015 +//    setTrace -- stores `state` in the fTraceDebug flag
  1.2016 +//
  1.2017 +//--------------------------------------------------------------------------------
  1.2018 +void RegexMatcher::setTrace(UBool state) {
  1.2019 +    fTraceDebug = state;
  1.2020 +}
  1.2021 +
  1.2022 +
  1.2023 +
  1.2024 +//---------------------------------------------------------------------
  1.2025 +//
  1.2026 +//   split -- UnicodeString front end: wraps input and each dest[i] in a UText, delegates to the UText overload, and returns its field count
  1.2027 +//
  1.2028 +//---------------------------------------------------------------------
  1.2029 +int32_t  RegexMatcher::split(const UnicodeString &input,
  1.2030 +        UnicodeString    dest[],
  1.2031 +        int32_t          destCapacity,
  1.2032 +        UErrorCode      &status)
  1.2033 +{
  1.2034 +    UText inputText = UTEXT_INITIALIZER;
  1.2035 +    utext_openConstUnicodeString(&inputText, &input, &status);
  1.2036 +    if (U_FAILURE(status)) {
  1.2037 +        return 0;
  1.2038 +    }
  1.2039 +
  1.2040 +    UText **destText = (UText **)uprv_malloc(sizeof(UText*)*destCapacity);   // NOTE(review): destCapacity is used here before the `< 1` check in the UText overload
  1.2041 +    if (destText == NULL) {
  1.2042 +        status = U_MEMORY_ALLOCATION_ERROR;
  1.2043 +        return 0;   // NOTE(review): returns without utext_close(&inputText)
  1.2044 +    }
  1.2045 +    int32_t i;
  1.2046 +    for (i = 0; i < destCapacity; i++) {
  1.2047 +        destText[i] = utext_openUnicodeString(NULL, &dest[i], &status);   // one UText per caller-supplied output string
  1.2048 +    }
  1.2049 +    
  1.2050 +    int32_t fieldCount = split(&inputText, destText, destCapacity, status);
  1.2051 +    
  1.2052 +    for (i = 0; i < destCapacity; i++) {
  1.2053 +        utext_close(destText[i]);   // closes every wrapper, not just the first fieldCount
  1.2054 +    }
  1.2055 +
  1.2056 +    uprv_free(destText);
  1.2057 +    utext_close(&inputText);
  1.2058 +    return fieldCount;
  1.2059 +}
  1.2060 +
  1.2061 +//
  1.2062 +//   split, UText mode
  1.2063 +//
  1.2064 +int32_t  RegexMatcher::split(UText *input,
  1.2065 +        UText           *dest[],
  1.2066 +        int32_t          destCapacity,
  1.2067 +        UErrorCode      &status)
  1.2068 +{
  1.2069 +    //
  1.2070 +    // Check arguments for validity
  1.2071 +    //
  1.2072 +    if (U_FAILURE(status)) {
  1.2073 +        return 0;
  1.2074 +    };
  1.2075 +
  1.2076 +    if (destCapacity < 1) {
  1.2077 +        status = U_ILLEGAL_ARGUMENT_ERROR;
  1.2078 +        return 0;
  1.2079 +    }
  1.2080 +
  1.2081 +    //
  1.2082 +    // Reset for the input text
  1.2083 +    //
  1.2084 +    reset(input);
  1.2085 +    int64_t   nextOutputStringStart = 0;
  1.2086 +    if (fActiveLimit == 0) {
  1.2087 +        return 0;
  1.2088 +    }
  1.2089 +
  1.2090 +    //
  1.2091 +    // Loop through the input text, searching for the delimiter pattern
  1.2092 +    //
  1.2093 +    int32_t i;
  1.2094 +    int32_t numCaptureGroups = fPattern->fGroupMap->size();
  1.2095 +    for (i=0; ; i++) {
  1.2096 +        if (i>=destCapacity-1) {
  1.2097 +            // There is one or zero output string left.
  1.2098 +            // Fill the last output string with whatever is left from the input, then exit the loop.
  1.2099 +            //  ( i will be == destCapacity if we filled the output array while processing
  1.2100 +            //    capture groups of the delimiter expression, in which case we will discard the
  1.2101 +            //    last capture group saved in favor of the unprocessed remainder of the
  1.2102 +            //    input string.)
  1.2103 +            i = destCapacity-1;
  1.2104 +            if (fActiveLimit > nextOutputStringStart) {
  1.2105 +                if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
  1.2106 +                    if (dest[i]) {
  1.2107 +                        utext_replace(dest[i], 0, utext_nativeLength(dest[i]), 
  1.2108 +                                      input->chunkContents+nextOutputStringStart, 
  1.2109 +                                      (int32_t)(fActiveLimit-nextOutputStringStart), &status);
  1.2110 +                    } else {
  1.2111 +                        UText remainingText = UTEXT_INITIALIZER;
  1.2112 +                        utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart, 
  1.2113 +                                         fActiveLimit-nextOutputStringStart, &status);
  1.2114 +                        dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
  1.2115 +                        utext_close(&remainingText);
  1.2116 +                    }
  1.2117 +                } else {
  1.2118 +                    UErrorCode lengthStatus = U_ZERO_ERROR;
  1.2119 +                    int32_t remaining16Length = 
  1.2120 +                        utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
  1.2121 +                    UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
  1.2122 +                    if (remainingChars == NULL) {
  1.2123 +                        status = U_MEMORY_ALLOCATION_ERROR;
  1.2124 +                        break;
  1.2125 +                    }
  1.2126 +
  1.2127 +                    utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
  1.2128 +                    if (dest[i]) {
  1.2129 +                        utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
  1.2130 +                    } else {
  1.2131 +                        UText remainingText = UTEXT_INITIALIZER;
  1.2132 +                        utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
  1.2133 +                        dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
  1.2134 +                        utext_close(&remainingText);
  1.2135 +                    }
  1.2136 +                    
  1.2137 +                    uprv_free(remainingChars);
  1.2138 +                }
  1.2139 +            }
  1.2140 +            break;
  1.2141 +        }
  1.2142 +        if (find()) {
  1.2143 +            // We found another delimiter.  Move everything from where we started looking
  1.2144 +            //  up until the start of the delimiter into the next output string.
  1.2145 +            if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
  1.2146 +                if (dest[i]) {
  1.2147 +                    utext_replace(dest[i], 0, utext_nativeLength(dest[i]), 
  1.2148 +                                  input->chunkContents+nextOutputStringStart, 
  1.2149 +                                  (int32_t)(fMatchStart-nextOutputStringStart), &status);
  1.2150 +                } else {
  1.2151 +                    UText remainingText = UTEXT_INITIALIZER;
  1.2152 +                    utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart, 
  1.2153 +                                      fMatchStart-nextOutputStringStart, &status);
  1.2154 +                    dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
  1.2155 +                    utext_close(&remainingText);
  1.2156 +                }
  1.2157 +            } else {
  1.2158 +                UErrorCode lengthStatus = U_ZERO_ERROR;
  1.2159 +                int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fMatchStart, NULL, 0, &lengthStatus);
  1.2160 +                UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
  1.2161 +                if (remainingChars == NULL) {
  1.2162 +                    status = U_MEMORY_ALLOCATION_ERROR;
  1.2163 +                    break;
  1.2164 +                }
  1.2165 +                utext_extract(input, nextOutputStringStart, fMatchStart, remainingChars, remaining16Length+1, &status);
  1.2166 +                if (dest[i]) {
  1.2167 +                    utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
  1.2168 +                } else {
  1.2169 +                    UText remainingText = UTEXT_INITIALIZER;
  1.2170 +                    utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
  1.2171 +                    dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
  1.2172 +                    utext_close(&remainingText);
  1.2173 +                }
  1.2174 +                
  1.2175 +                uprv_free(remainingChars);
  1.2176 +            }
  1.2177 +            nextOutputStringStart = fMatchEnd;
  1.2178 +
  1.2179 +            // If the delimiter pattern has capturing parentheses, the captured
  1.2180 +            //  text goes out into the next n destination strings.
  1.2181 +            int32_t groupNum;
  1.2182 +            for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
  1.2183 +                if (i >= destCapacity-2) {
  1.2184 +                    // Never fill the last available output string with capture group text.
  1.2185 +                    // It will be filled with the last field, the remainder of the
  1.2186 +                    //  unsplit input text.
  1.2187 +                    break;
  1.2188 +                }
  1.2189 +                i++;
  1.2190 +                dest[i] = group(groupNum, dest[i], status);
  1.2191 +            }
  1.2192 +
  1.2193 +            if (nextOutputStringStart == fActiveLimit) {
  1.2194 +                // The delimiter was at the end of the string.  We're done, but first
  1.2195 +                // we output one last empty string, for the empty field following
  1.2196 +                //   the delimiter at the end of input.
  1.2197 +                if (i+1 < destCapacity) {
  1.2198 +                    ++i;
  1.2199 +                    if (dest[i] == NULL) {
  1.2200 +                        dest[i] = utext_openUChars(NULL, NULL, 0, &status);
  1.2201 +                    } else {
  1.2202 +                        static UChar emptyString[] = {(UChar)0};
  1.2203 +                        utext_replace(dest[i], 0, utext_nativeLength(dest[i]), emptyString, 0, &status);
  1.2204 +                    }
  1.2205 +                }
  1.2206 +                break;
  1.2207 +            
  1.2208 +            } 
  1.2209 +        }
  1.2210 +        else
  1.2211 +        {
  1.2212 +            // We ran off the end of the input while looking for the next delimiter.
  1.2213 +            // All the remaining text goes into the current output string.
  1.2214 +            if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
  1.2215 +                if (dest[i]) {
  1.2216 +                    utext_replace(dest[i], 0, utext_nativeLength(dest[i]), 
  1.2217 +                                  input->chunkContents+nextOutputStringStart, 
  1.2218 +                                  (int32_t)(fActiveLimit-nextOutputStringStart), &status);
  1.2219 +                } else {
  1.2220 +                    UText remainingText = UTEXT_INITIALIZER;
  1.2221 +                    utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart, 
  1.2222 +                                     fActiveLimit-nextOutputStringStart, &status);
  1.2223 +                    dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
  1.2224 +                    utext_close(&remainingText);
  1.2225 +                }
  1.2226 +            } else {
  1.2227 +                UErrorCode lengthStatus = U_ZERO_ERROR;
  1.2228 +                int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
  1.2229 +                UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
  1.2230 +                if (remainingChars == NULL) {
  1.2231 +                    status = U_MEMORY_ALLOCATION_ERROR;
  1.2232 +                    break;
  1.2233 +                }
  1.2234 +                
  1.2235 +                utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
  1.2236 +                if (dest[i]) {
  1.2237 +                    utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
  1.2238 +                } else {
  1.2239 +                    UText remainingText = UTEXT_INITIALIZER;
  1.2240 +                    utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
  1.2241 +                    dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
  1.2242 +                    utext_close(&remainingText);
  1.2243 +                }
  1.2244 +                
  1.2245 +                uprv_free(remainingChars);
  1.2246 +            }
  1.2247 +            break;
  1.2248 +        }
  1.2249 +        if (U_FAILURE(status)) {
  1.2250 +            break;
  1.2251 +        }
  1.2252 +    }   // end of for loop
  1.2253 +    return i+1;
  1.2254 +}
  1.2255 +
  1.2256 +
  1.2257 +//--------------------------------------------------------------------------------
  1.2258 +//
  1.2259 +//     start      Start position of the whole match.  Forwards to
  1.2260 +//                start(0, status): group 0 stands for the entire match.
  1.2261 +//--------------------------------------------------------------------------------
  1.2262 +int32_t RegexMatcher::start(UErrorCode &status) const {
  1.2263 +    return start(0, status);
  1.2264 +}
  1.2265 +
  1.2266 +int64_t RegexMatcher::start64(UErrorCode &status) const {
  1.2267 +    return start64(0, status);    // group 0 == the whole match; 64-bit result
  1.2268 +}
  1.2269 +
  1.2270 +//--------------------------------------------------------------------------------
  1.2271 +//
  1.2272 +//     start64(int32_t group, UErrorCode &status)
  1.2273 +//         Start position of a capture group's text.  Group 0 is the whole match.
  1.2274 +//--------------------------------------------------------------------------------
  1.2275 +
  1.2276 +int64_t RegexMatcher::start64(int32_t group, UErrorCode &status) const {
  1.2277 +    // Every failure path returns -1.  An incoming failure status is left as it is.
  1.2278 +    if (U_FAILURE(status)) {
  1.2279 +        return -1;
  1.2280 +    }
  1.2281 +    if (U_FAILURE(fDeferredStatus)) {
  1.2282 +        status = fDeferredStatus;
  1.2283 +        return -1;
  1.2284 +    }
  1.2285 +    if (!fMatch) {
  1.2286 +        status = U_REGEX_INVALID_STATE;       // there is no current match to query
  1.2287 +        return -1;
  1.2288 +    }
  1.2289 +    if (!(group >= 0 && group <= fPattern->fGroupMap->size())) {
  1.2290 +        status = U_INDEX_OUTOFBOUNDS_ERROR;
  1.2291 +        return -1;
  1.2292 +    }
  1.2293 +
  1.2294 +    if (group == 0) {
  1.2295 +        return fMatchStart;
  1.2296 +    }
  1.2297 +    // Groups 1..n: the group map (indexed from zero) gives the position, within
  1.2298 +    //   the match frame's fExtra array, of the slot holding the group's start.
  1.2299 +    int32_t startSlot = fPattern->fGroupMap->elementAti(group-1);
  1.2300 +    U_ASSERT(startSlot < fPattern->fFrameSize);
  1.2301 +    U_ASSERT(startSlot >= 0);
  1.2302 +    return fFrame->fExtra[startSlot];
  1.2303 +}
  1.2304 +
  1.2305 +
  1.2306 +int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const {
  1.2307 +    return (int32_t)start64(group, status);   // 32-bit flavor: the 64-bit result is narrowed by the cast
  1.2308 +}
  1.2309 +
  1.2310 +//--------------------------------------------------------------------------------
  1.2311 +//
  1.2312 +//     useAnchoringBounds     fAnchorStart/fAnchorLimit follow the region when b is
  1.2313 +//                            set, and span 0 .. fInputLength when it is not.
  1.2314 +//--------------------------------------------------------------------------------
  1.2315 +RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) {
  1.2316 +    fAnchoringBounds = b;
  1.2317 +    if (fAnchoringBounds) { fAnchorStart = fRegionStart;  fAnchorLimit = fRegionLimit; }
  1.2318 +    else                  { fAnchorStart = 0;             fAnchorLimit = fInputLength; }
  1.2319 +    return *this;
  1.2320 +}
  1.2321 +
  1.2322 +
  1.2323 +//--------------------------------------------------------------------------------
  1.2324 +//
  1.2325 +//     useTransparentBounds   fLookStart/fLookLimit span 0 .. fInputLength when b is
  1.2326 +//                            set, and follow the region when it is not.
  1.2327 +//--------------------------------------------------------------------------------
  1.2328 +RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) {
  1.2329 +    fTransparentBounds = b;
  1.2330 +    if (fTransparentBounds) { fLookStart = 0;             fLookLimit = fInputLength; }
  1.2331 +    else                    { fLookStart = fRegionStart;  fLookLimit = fRegionLimit; }
  1.2332 +    return *this;
  1.2333 +}
  1.2334 +
  1.2335 +//--------------------------------------------------------------------------------
  1.2336 +//
  1.2337 +//     setTimeLimit     Store a new time limit.  Negative limits are rejected
  1.2338 +//                      with U_ILLEGAL_ARGUMENT_ERROR and change nothing.
  1.2339 +//--------------------------------------------------------------------------------
  1.2340 +void RegexMatcher::setTimeLimit(int32_t limit, UErrorCode &status) {
  1.2341 +    if (U_FAILURE(status)) {
  1.2342 +        return;
  1.2343 +    }
  1.2344 +    // From here on exactly one thing happens: a problem is reported through
  1.2345 +    //   status, or the limit is stored.
  1.2346 +    if (U_FAILURE(fDeferredStatus)) {
  1.2347 +        status = fDeferredStatus;
  1.2348 +    } else if (limit < 0) {
  1.2349 +        status = U_ILLEGAL_ARGUMENT_ERROR;
  1.2350 +    } else {
  1.2351 +        fTimeLimit = limit;
  1.2352 +    }
  1.2353 +}
  1.2354 +
  1.2355 +
  1.2356 +//--------------------------------------------------------------------------------
  1.2357 +//
  1.2358 +//     getTimeLimit     Return the stored time limit, fTimeLimit, unchanged.
  1.2359 +//
  1.2360 +//--------------------------------------------------------------------------------
  1.2361 +int32_t RegexMatcher::getTimeLimit() const {
  1.2362 +    return fTimeLimit;
  1.2363 +}
  1.2364 +
  1.2365 +
  1.2366 +//--------------------------------------------------------------------------------
  1.2367 +//
  1.2368 +//     setStackLimit    Set the limit, in bytes, for the state save stack (fStack).
  1.2369 +//                      Zero means no limit.  The matcher is reset as a side effect.
  1.2370 +//--------------------------------------------------------------------------------
  1.2371 +void RegexMatcher::setStackLimit(int32_t limit, UErrorCode &status) {
  1.2372 +    if (U_FAILURE(status)) {
  1.2373 +        return;
  1.2374 +    }
  1.2375 +    if (U_FAILURE(fDeferredStatus)) {
  1.2376 +        status = fDeferredStatus;
  1.2377 +        return;
  1.2378 +    }
  1.2379 +    if (limit < 0) {
  1.2380 +        status = U_ILLEGAL_ARGUMENT_ERROR;
  1.2381 +        return;
  1.2382 +    }
  1.2383 +
  1.2384 +    // Drop any current match first.  Its results live in the final stack frame
  1.2385 +    //    (the one fFrame points to), and shrinking the stack could discard
  1.2386 +    //    that frame.
  1.2387 +    reset();
  1.2388 +
  1.2389 +    // Work out the capacity to give the stack; zero asks for unlimited growth.
  1.2390 +    int32_t newCapacity = 0;
  1.2391 +    if (limit != 0) {
  1.2392 +        // The limit is in bytes: rescale it by sizeof(int32_t), then raise the
  1.2393 +        //   result if needed so that at least one stack frame for this
  1.2394 +        //   pattern will fit.
  1.2395 +        newCapacity = limit / sizeof(int32_t);
  1.2396 +        if (newCapacity < fPattern->fFrameSize) {
  1.2397 +            newCapacity = fPattern->fFrameSize;
  1.2398 +        }
  1.2399 +    }
  1.2400 +    fStack->setMaxCapacity(newCapacity);
  1.2401 +
  1.2402 +    fStackLimit = limit;
  1.2403 +}
  1.2404 +
  1.2405 +
  1.2406 +//--------------------------------------------------------------------------------
  1.2407 +//
  1.2408 +//     getStackLimit    Return fStackLimit.  setStackLimit() stores the caller's
  1.2409 +//                      own value there, not the adjusted stack capacity.
  1.2410 +//--------------------------------------------------------------------------------
  1.2411 +int32_t RegexMatcher::getStackLimit() const {
  1.2412 +    return fStackLimit;
  1.2413 +}
  1.2414 +
  1.2415 +
  1.2416 +//--------------------------------------------------------------------------------
  1.2417 +//
  1.2418 +//     setMatchCallback     Install the callback function that IncrementTime()
  1.2419 +//                          calls, along with the context pointer passed back to it.
  1.2420 +//--------------------------------------------------------------------------------
  1.2421 +void RegexMatcher::setMatchCallback(URegexMatchCallback     *callback,
  1.2422 +                                    const void              *context,
  1.2423 +                                    UErrorCode              &status) {
  1.2424 +    if (!U_FAILURE(status)) {
  1.2425 +        // No validation: a NULL callback is stored like any other value.
  1.2426 +        fCallbackContext = context;
  1.2427 +        fCallbackFn      = callback;
  1.2428 +    }
  1.2429 +}
  1.2430 +
  1.2431 +
  1.2432 +//--------------------------------------------------------------------------------
  1.2433 +//
  1.2434 +//     getMatchCallback     Hand back the stored match callback and its context.
  1.2435 +//                          Both outputs are left untouched if status is a failure.
  1.2436 +//--------------------------------------------------------------------------------
  1.2437 +void RegexMatcher::getMatchCallback(URegexMatchCallback   *&callback,
  1.2438 +                                  const void              *&context,
  1.2439 +                                  UErrorCode              &status) {
  1.2440 +    if (!U_FAILURE(status)) {
  1.2441 +        // Plain copy-out of the two stored values.
  1.2442 +        context  = fCallbackContext;
  1.2443 +        callback = fCallbackFn;
  1.2444 +    }
  1.2445 +}
  1.2446 +
  1.2447 +
  1.2448 +//--------------------------------------------------------------------------------
  1.2449 +//
  1.2450 +//     setFindProgressCallback    Install the callback that ReportFindProgress()
  1.2451 +//                                calls, along with the context pointer passed to it.
  1.2452 +//--------------------------------------------------------------------------------
  1.2453 +void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback      *callback,
  1.2454 +                                                const void                      *context,
  1.2455 +                                                UErrorCode                      &status) {
  1.2456 +    if (!U_FAILURE(status)) {
  1.2457 +        // No validation: a NULL callback is stored like any other value.
  1.2458 +        fFindProgressCallbackContext = context;
  1.2459 +        fFindProgressCallbackFn      = callback;
  1.2460 +    }
  1.2461 +}
  1.2462 +
  1.2463 +
  1.2464 +//--------------------------------------------------------------------------------
  1.2465 +//
  1.2466 +//     getFindProgressCallback    Hand back the stored find progress callback and its
  1.2467 +//                                context.  Outputs are untouched if status is a failure.
  1.2468 +//--------------------------------------------------------------------------------
  1.2469 +void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback    *&callback,
  1.2470 +                                                const void                    *&context,
  1.2471 +                                                UErrorCode                    &status) {
  1.2472 +    if (!U_FAILURE(status)) {
  1.2473 +        // Plain copy-out of the two stored values.
  1.2474 +        context  = fFindProgressCallbackContext;
  1.2475 +        callback = fFindProgressCallbackFn;
  1.2476 +    }
  1.2477 +}
  1.2478 +
  1.2479 +
  1.2480 +//================================================================================
  1.2481 +//
  1.2482 +//    Code following this point in this file is the internal
  1.2483 +//    Match Engine Implementation.
  1.2484 +//
  1.2485 +//================================================================================
  1.2486 +
  1.2487 +
  1.2488 +//--------------------------------------------------------------------------------
  1.2489 +//
  1.2490 +//   resetStack
  1.2491 +//           Empty the state save stack, then push a single fresh frame whose
  1.2492 +//           non-header slots all hold -1.  For capture group positions, -1
  1.2493 +//           means that the group has not yet matched anything.
  1.2494 +//--------------------------------------------------------------------------------
  1.2495 +REStackFrame *RegexMatcher::resetStack() {
  1.2496 +    // Start from an empty stack, then reserve room for exactly one frame.
  1.2497 +    fStack->removeAllElements();
  1.2498 +    REStackFrame *frame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus);
  1.2499 +    // Only the fExtra slots are filled in here; the frame's header fields
  1.2500 +    //  are not written by this function.
  1.2501 +    // NOTE(review): frame is used without a NULL check - confirm reserveBlock() cannot fail here.
  1.2502 +    const int32_t extraCount = fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT;
  1.2503 +    for (int32_t slot=0; slot<extraCount; slot++) {
  1.2504 +        frame->fExtra[slot] = -1;
  1.2505 +    }
  1.2506 +    return frame;
  1.2507 +}
  1.2508 +
  1.2509 +
  1.2510 +
  1.2511 +//--------------------------------------------------------------------------------
  1.2512 +//
  1.2513 +//   isWordBoundary
  1.2514 +//                     in perl, "xab..cd..", \b is true at positions 0,3,5,7
  1.2515 +//                     For us,
  1.2516 +//                       If the current char is a combining mark,
  1.2517 +//                          \b is FALSE.
  1.2518 +//                       Else Scan backwards to the first non-combining char.
  1.2519 +//                            We are at a boundary if that char and the original char are
  1.2520 +//                               opposite in membership in \w set
  1.2521 +//
  1.2522 +//          parameters:   pos   - the current position in the input buffer
  1.2523 +//          returns:      TRUE if pos is at a word boundary.  Also sets fHitEnd
  1.2524 +//                        when pos is at or beyond fLookLimit.
  1.2525 +//              TODO:  double-check edge cases at region boundaries.
  1.2526 +//--------------------------------------------------------------------------------
  1.2527 +UBool RegexMatcher::isWordBoundary(int64_t pos) {
  1.2528 +    UBool isBoundary = FALSE;
  1.2529 +    UBool cIsWord    = FALSE;
  1.2530 +
  1.2531 +    // Park the text iterator at pos; the backwards scan below always starts from there.
  1.2532 +    UTEXT_SETNATIVEINDEX(fInputText, pos);
  1.2533 +    if (pos >= fLookLimit) {
  1.2534 +        fHitEnd = TRUE;      // off the end of the text: behave as though not at a word char
  1.2535 +    } else {
  1.2536 +        // Determine whether char c at current position is a member of the word set of chars.
  1.2537 +        UChar32  c = UTEXT_CURRENT32(fInputText);
  1.2538 +        if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
  1.2539 +            // Current char is a combining one.  Not a boundary.
  1.2540 +            return FALSE;
  1.2541 +        }
  1.2542 +        cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
  1.2543 +    }
  1.2544 +
  1.2545 +    // Back up until we come to a non-combining char, determine whether
  1.2546 +    //  that char is a word char.  The scan never goes below fLookStart.
  1.2547 +    UBool prevCIsWord = FALSE;
  1.2548 +    for (;;) {
  1.2549 +        if (UTEXT_GETNATIVEINDEX(fInputText) <= fLookStart) {
  1.2550 +            break;
  1.2551 +        }
  1.2552 +        UChar32 prevChar = UTEXT_PREVIOUS32(fInputText);
  1.2553 +        if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
  1.2554 +              || u_charType(prevChar) == U_FORMAT_CHAR)) {
  1.2555 +            prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
  1.2556 +            break;
  1.2557 +        }
  1.2558 +    }
  1.2559 +    isBoundary = cIsWord ^ prevCIsWord;
  1.2560 +    return isBoundary;
  1.2561 +}
  1.2562 +
  1.2563 +UBool RegexMatcher::isChunkWordBoundary(int32_t pos) {
  1.2564 +    UBool isBoundary = FALSE;
  1.2565 +    UBool cIsWord    = FALSE;
  1.2566 +    // Same test as isWordBoundary(), but reading UTF-16 straight from the UText's chunk buffer; pos indexes that buffer.
  1.2567 +    const UChar *inputBuf = fInputText->chunkContents;
  1.2568 +    
  1.2569 +    if (pos >= fLookLimit) {
  1.2570 +        fHitEnd = TRUE;
  1.2571 +    } else {
  1.2572 +        // Determine whether char c at current position is a member of the word set of chars.
  1.2573 +        // If we're off the end of the string, behave as though we're not at a word char.
  1.2574 +        UChar32 c;
  1.2575 +        U16_GET(inputBuf, fLookStart, pos, fLookLimit, c);
  1.2576 +        if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
  1.2577 +            // Current char is a combining one.  Not a boundary.
  1.2578 +            return FALSE;
  1.2579 +        }
  1.2580 +        cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
  1.2581 +    }
  1.2582 +    
  1.2583 +    // Back up from pos until we come to a non-combining char, determine whether
  1.2584 +    //  that char is a word char.  The scan never goes below fLookStart.
  1.2585 +    UBool prevCIsWord = FALSE;
  1.2586 +    for (;;) {
  1.2587 +        if (pos <= fLookStart) {
  1.2588 +            break;
  1.2589 +        }
  1.2590 +        UChar32 prevChar;
  1.2591 +        U16_PREV(inputBuf, fLookStart, pos, prevChar);
  1.2592 +        if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
  1.2593 +              || u_charType(prevChar) == U_FORMAT_CHAR)) {
  1.2594 +            prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
  1.2595 +            break;
  1.2596 +        }
  1.2597 +    }
  1.2598 +    isBoundary = cIsWord ^ prevCIsWord;    // boundary when exactly one of the two chars is a word char
  1.2599 +    return isBoundary;
  1.2600 +}
  1.2601 +
  1.2602 +//--------------------------------------------------------------------------------
  1.2603 +//
  1.2604 +//   isUWordBoundary 
  1.2605 +//
  1.2606 +//         Test for a word boundary using RBBI word break.
  1.2607 +//         Always returns FALSE when UCONFIG_NO_BREAK_ITERATION is non-zero.
  1.2608 +//          parameters:   pos   - the current position in the input buffer
  1.2609 +//
  1.2610 +//--------------------------------------------------------------------------------
  1.2611 +UBool RegexMatcher::isUWordBoundary(int64_t pos) {
  1.2612 +    UBool       returnVal = FALSE;
  1.2613 +#if UCONFIG_NO_BREAK_ITERATION==0
  1.2614 +    
  1.2615 +    // If we haven't yet created a break iterator for this matcher, do it now.
  1.2616 +    if (fWordBreakItr == NULL) {
  1.2617 +        fWordBreakItr = 
  1.2618 +            (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), fDeferredStatus);
  1.2619 +        if (U_FAILURE(fDeferredStatus)) {
  1.2620 +            return FALSE;    // creation failed; the error code stays in fDeferredStatus
  1.2621 +        }
  1.2622 +        fWordBreakItr->setText(fInputText, fDeferredStatus);
  1.2623 +    }
  1.2624 +
  1.2625 +    if (pos >= fLookLimit) {
  1.2626 +        fHitEnd = TRUE;
  1.2627 +        returnVal = TRUE;   // With Unicode word rules, only positions within the interior of "real"
  1.2628 +                            //    words are not boundaries.  All non-word chars stand by themselves,
  1.2629 +                            //    with word boundaries on both sides.
  1.2630 +    } else {
  1.2631 +        if (!UTEXT_USES_U16(fInputText)) {
  1.2632 +            // Text is not UTF-16: replace pos by the UTF-16 length of the text in front of it.
  1.2633 +            UErrorCode status = U_ZERO_ERROR;    // local status; its value is not examined afterwards
  1.2634 +            pos = utext_extract(fInputText, 0, pos, NULL, 0, &status);   // NULL buffer: length only.  !!!: Would like a better way to do this!
  1.2635 +        }
  1.2636 +        returnVal = fWordBreakItr->isBoundary((int32_t)pos);
  1.2637 +    }
  1.2638 +#endif
  1.2639 +    return   returnVal;
  1.2640 +}
  1.2641 +
  1.2642 +//--------------------------------------------------------------------------------
  1.2643 +//
  1.2644 +//   IncrementTime     Called from StateSave() each time the tick counter runs down,
  1.2645 +//                     that is, once per TIMER_INITIAL_VALUE state saves.  Re-arms the
  1.2646 +//                     tick counter, bumps the "time" count, and calls the user's
  1.2647 +//                     match callback if one is installed.
  1.2648 +//
  1.2649 +//                     To abort the match - callback returned FALSE, or the time limit
  1.2650 +//                     was reached - only an error status is set here.  The engine
  1.2651 +//                     is expected to notice it and stop in its outer loop.
  1.2652 +//--------------------------------------------------------------------------------
  1.2653 +void RegexMatcher::IncrementTime(UErrorCode &status) {
  1.2654 +    fTickCounter = TIMER_INITIAL_VALUE;     // restart the countdown to the next call
  1.2655 +    ++fTime;
  1.2656 +    // The user's callback, if any, is consulted first; a FALSE return stops the match.
  1.2657 +    if (fCallbackFn != NULL && !(*fCallbackFn)(fCallbackContext, fTime)) {
  1.2658 +        status = U_REGEX_STOPPED_BY_CALLER;
  1.2659 +        return;
  1.2660 +    }
  1.2661 +    if (fTimeLimit <= 0 || fTime < fTimeLimit) {
  1.2662 +        return;                             // no limit in force, or not reached yet
  1.2663 +    }
  1.2664 +    status = U_REGEX_TIME_OUT;
  1.2665 +}
  1.2666 +
  1.2667 +//--------------------------------------------------------------------------------
  1.2668 +//
  1.2669 +//   ReportFindProgress     Called by the find() function as it advances through the
  1.2670 +//                          target string.  Passes the position on to the user's find
  1.2671 +//                          progress callback, if there is one installed.
  1.2672 +//
  1.2673 +//                          NOTE:  a callback's request to stop is reported through
  1.2674 +//                          the FALSE return value only.  status is set to
  1.2675 +//                          U_ZERO_ERROR on that path, not to an error code;
  1.2676 +//                          U_REGEX_STOPPED_BY_CALLER is commented out in the code.
  1.2677 +//
  1.2678 +//          returns:        TRUE to continue the find, FALSE to stop it.
  1.2679 +//--------------------------------------------------------------------------------
  1.2680 +UBool RegexMatcher::ReportFindProgress(int64_t matchIndex, UErrorCode &status) {
  1.2681 +    if (fFindProgressCallbackFn == NULL ||
  1.2682 +            (*fFindProgressCallbackFn)(fFindProgressCallbackContext, matchIndex)) {
  1.2683 +        return TRUE;                    // no callback, or it said to carry on
  1.2684 +    }
  1.2685 +    // The callback asked for a stop.
  1.2686 +    status = U_ZERO_ERROR /*U_REGEX_STOPPED_BY_CALLER*/;
  1.2687 +    return FALSE;
  1.2688 +}
  1.2689 +
  1.2690 +//--------------------------------------------------------------------------------
  1.2691 +//
  1.2692 +//   StateSave
  1.2693 +//       Make a new stack frame, initialized as a copy of the current stack frame.
  1.2694 +//       Set the pattern index in the original stack frame from the operand value
  1.2695 +//       in the opcode.  Execution of the engine continues with the state in
  1.2696 +//       the newly created stack frame.
  1.2697 +//
  1.2698 +//       Note that reserveBlock() may grow the stack, resulting in the
  1.2699 +//       whole thing being relocated in memory.
  1.2700 +//
  1.2701 +//    Parameters:
  1.2702 +//       fp           The top frame pointer when called.  At return, a new
  1.2703 +//                    frame will be present above it.
  1.2704 +//       savePatIdx   An index into the compiled pattern.  Goes into the original
  1.2705 +//                    (not new) frame.  If execution ever back-tracks out of the
  1.2706 +//                    new frame, this will be where we continue from in the pattern.
  1.2707 +//       status       Set to U_REGEX_STACK_OVERFLOW if the stack cannot be grown.
  1.2708 +//    Return
  1.2709 +//                    The new frame pointer, or fp itself after a stack overflow.
  1.2710 +//--------------------------------------------------------------------------------
  1.2711 +inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status) {
  1.2712 +    // Push storage for a new frame.
  1.2713 +    int64_t *newFP = fStack->reserveBlock(fFrameSize, status);
  1.2714 +    if (newFP == NULL) {
  1.2715 +        // Failure on attempted stack expansion.
  1.2716 +        //   Stack function set some other error code, change it to a more
  1.2717 +        //   specific one for regular expressions.
  1.2718 +        status = U_REGEX_STACK_OVERFLOW;
  1.2719 +        // We need to return a writable stack frame, so just return the
  1.2720 +        //    previous frame.  The match operation will stop quickly
  1.2721 +        //    because of the error status, after which the frame will never
  1.2722 +        //    be looked at again.
  1.2723 +        return fp;
  1.2724 +    }
  1.2725 +    fp = (REStackFrame *)(newFP - fFrameSize);  // Re-derive the old top frame, in case of realloc of stack.
  1.2726 +    
  1.2727 +    // New stack frame = copy of old top frame: every 64-bit word from fp up to, not including, newFP.
  1.2728 +    int64_t *source = (int64_t *)fp;
  1.2729 +    int64_t *dest   = newFP;
  1.2730 +    for (;;) {
  1.2731 +        *dest++ = *source++;
  1.2732 +        if (source == newFP) {
  1.2733 +            break;
  1.2734 +        }
  1.2735 +    }
  1.2736 +    // Count this state save; when the tick counter runs out, IncrementTime() does the time and callback checks.
  1.2737 +    fTickCounter--;
  1.2738 +    if (fTickCounter <= 0) {
  1.2739 +       IncrementTime(status);    // Re-initializes fTickCounter
  1.2740 +    }
  1.2741 +    fp->fPatIdx = savePatIdx;    // resume point stored in the old frame, used if we back-track into it
  1.2742 +    return (REStackFrame *)newFP;
  1.2743 +}
  1.2744 +
  1.2745 +
  1.2746 +//--------------------------------------------------------------------------------
  1.2747 +//
  1.2748 +//   MatchAt      This is the actual matching engine.
  1.2749 +//
  1.2750 +//                  startIdx:    begin matching at this index.
  1.2751 +//                  toEnd:       if true, match must extend to end of the input region
  1.2752 +//
  1.2753 +//--------------------------------------------------------------------------------
  1.2754 +void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
  1.2755 +    UBool       isMatch  = FALSE;      // True if the we have a match.
  1.2756 +    
  1.2757 +    int64_t     backSearchIndex = U_INT64_MAX; // used after greedy single-character matches for searching backwards
  1.2758 +
  1.2759 +    int32_t     op;                    // Operation from the compiled pattern, split into
  1.2760 +    int32_t     opType;                //    the opcode
  1.2761 +    int32_t     opValue;               //    and the operand value.
  1.2762 +        
  1.2763 +    #ifdef REGEX_RUN_DEBUG
  1.2764 +    if (fTraceDebug)
  1.2765 +    {
  1.2766 +        printf("MatchAt(startIdx=%ld)\n", startIdx);
  1.2767 +        printf("Original Pattern: ");
  1.2768 +        UChar32 c = utext_next32From(fPattern->fPattern, 0);
  1.2769 +        while (c != U_SENTINEL) {
  1.2770 +            if (c<32 || c>256) {
  1.2771 +                c = '.';
  1.2772 +            }
  1.2773 +            REGEX_DUMP_DEBUG_PRINTF(("%c", c));
  1.2774 +            
  1.2775 +            c = UTEXT_NEXT32(fPattern->fPattern);
  1.2776 +        }
  1.2777 +        printf("\n");
  1.2778 +        printf("Input String: ");
  1.2779 +        c = utext_next32From(fInputText, 0);
  1.2780 +        while (c != U_SENTINEL) {
  1.2781 +            if (c<32 || c>256) {
  1.2782 +                c = '.';
  1.2783 +            }
  1.2784 +            printf("%c", c);
  1.2785 +            
  1.2786 +            c = UTEXT_NEXT32(fInputText);
  1.2787 +        }
  1.2788 +        printf("\n");
  1.2789 +        printf("\n");
  1.2790 +    }
  1.2791 +    #endif
  1.2792 +
  1.2793 +    if (U_FAILURE(status)) {
  1.2794 +        return;
  1.2795 +    }
  1.2796 +
  1.2797 +    //  Cache frequently referenced items from the compiled pattern
  1.2798 +    //
  1.2799 +    int64_t             *pat           = fPattern->fCompiledPat->getBuffer();
  1.2800 +
  1.2801 +    const UChar         *litText       = fPattern->fLiteralText.getBuffer();
  1.2802 +    UVector             *sets          = fPattern->fSets;
  1.2803 +
  1.2804 +    fFrameSize = fPattern->fFrameSize;
  1.2805 +    REStackFrame        *fp            = resetStack();
  1.2806 +
  1.2807 +    fp->fPatIdx   = 0;
  1.2808 +    fp->fInputIdx = startIdx;
  1.2809 +
  1.2810 +    // Zero out the pattern's static data
  1.2811 +    int32_t i;
  1.2812 +    for (i = 0; i<fPattern->fDataSize; i++) {
  1.2813 +        fData[i] = 0;
  1.2814 +    }
  1.2815 +
  1.2816 +    //
  1.2817 +    //  Main loop for interpreting the compiled pattern.
  1.2818 +    //  One iteration of the loop per pattern operation performed.
  1.2819 +    //
  1.2820 +    for (;;) {
  1.2821 +#if 0
  1.2822 +        if (_heapchk() != _HEAPOK) {
  1.2823 +            fprintf(stderr, "Heap Trouble\n");
  1.2824 +        }
  1.2825 +#endif
  1.2826 +        
  1.2827 +        op      = (int32_t)pat[fp->fPatIdx];
  1.2828 +        opType  = URX_TYPE(op);
  1.2829 +        opValue = URX_VAL(op);
  1.2830 +        #ifdef REGEX_RUN_DEBUG
  1.2831 +        if (fTraceDebug) {
  1.2832 +            UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
  1.2833 +            printf("inputIdx=%ld   inputChar=%x   sp=%3ld   activeLimit=%ld  ", fp->fInputIdx,
  1.2834 +                UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
  1.2835 +            fPattern->dumpOp(fp->fPatIdx);
  1.2836 +        }
  1.2837 +        #endif
  1.2838 +        fp->fPatIdx++;
  1.2839 +        
  1.2840 +        switch (opType) {
  1.2841 +
  1.2842 +
  1.2843 +        case URX_NOP:
  1.2844 +            break;
  1.2845 +
  1.2846 +
  1.2847 +        case URX_BACKTRACK:
  1.2848 +            // Force a backtrack.  In some circumstances, the pattern compiler
  1.2849 +            //   will notice that the pattern can't possibly match anything, and will
  1.2850 +            //   emit one of these at that point.
  1.2851 +            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.2852 +            break;
  1.2853 +
  1.2854 +
  1.2855 +        case URX_ONECHAR:
  1.2856 +            if (fp->fInputIdx < fActiveLimit) {
  1.2857 +                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
  1.2858 +                UChar32 c = UTEXT_NEXT32(fInputText);
  1.2859 +                if (c == opValue) {
  1.2860 +                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
  1.2861 +                    break;
  1.2862 +                }
  1.2863 +            } else {
  1.2864 +                fHitEnd = TRUE;
  1.2865 +            }
  1.2866 +            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.2867 +            break;
  1.2868 +
  1.2869 +
  1.2870 +        case URX_STRING:
  1.2871 +            {
  1.2872 +                // Test input against a literal string.
  1.2873 +                // Strings require two slots in the compiled pattern, one for the
  1.2874 +                //   offset to the string text, and one for the length.
  1.2875 +
  1.2876 +                int32_t   stringStartIdx = opValue;
  1.2877 +                op      = (int32_t)pat[fp->fPatIdx];     // Fetch the second operand
  1.2878 +                fp->fPatIdx++;
  1.2879 +                opType    = URX_TYPE(op);
  1.2880 +                int32_t stringLen = URX_VAL(op);
  1.2881 +                U_ASSERT(opType == URX_STRING_LEN);
  1.2882 +                U_ASSERT(stringLen >= 2);
  1.2883 +                                
  1.2884 +                const UChar *patternString = litText+stringStartIdx;
  1.2885 +                int32_t patternStringIndex = 0;
  1.2886 +                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
  1.2887 +                UChar32 inputChar;
  1.2888 +                UChar32 patternChar;
  1.2889 +                UBool success = TRUE;
  1.2890 +                while (patternStringIndex < stringLen) {
  1.2891 +                    if (UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
  1.2892 +                        success = FALSE;
  1.2893 +                        fHitEnd = TRUE;
  1.2894 +                        break;
  1.2895 +                    }
  1.2896 +                    inputChar = UTEXT_NEXT32(fInputText);
  1.2897 +                    U16_NEXT(patternString, patternStringIndex, stringLen, patternChar);
  1.2898 +                    if (patternChar != inputChar) {
  1.2899 +                        success = FALSE;
  1.2900 +                        break;
  1.2901 +                    }
  1.2902 +                }
  1.2903 +                
  1.2904 +                if (success) {
  1.2905 +                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
  1.2906 +                } else {
  1.2907 +                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.2908 +                }
  1.2909 +            }
  1.2910 +            break;
  1.2911 +
  1.2912 +
  1.2913 +        case URX_STATE_SAVE:
  1.2914 +            fp = StateSave(fp, opValue, status);
  1.2915 +            break;
  1.2916 +
  1.2917 +
  1.2918 +        case URX_END:
  1.2919 +            // The match loop will exit via this path on a successful match,
  1.2920 +            //   when we reach the end of the pattern.
  1.2921 +            if (toEnd && fp->fInputIdx != fActiveLimit) {
  1.2922 +                // The pattern matched, but not to the end of input.  Try some more.
  1.2923 +                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.2924 +                break;
  1.2925 +            }
  1.2926 +            isMatch = TRUE;
  1.2927 +            goto  breakFromLoop;
  1.2928 +
  1.2929 +        // Start and End Capture stack frame variables are laid out like this:
  1.2930 +            //  fp->fExtra[opValue]  - The start of a completed capture group
  1.2931 +            //             opValue+1 - The end   of a completed capture group
  1.2932 +            //             opValue+2 - the start of a capture group whose end
  1.2933 +            //                          has not yet been reached (and might not ever be).
  1.2934 +        case URX_START_CAPTURE:
  1.2935 +            U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
  1.2936 +            fp->fExtra[opValue+2] = fp->fInputIdx;
  1.2937 +            break;
  1.2938 +
  1.2939 +
  1.2940 +        case URX_END_CAPTURE:
  1.2941 +            U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
  1.2942 +            U_ASSERT(fp->fExtra[opValue+2] >= 0);            // Start pos for this group must be set.
  1.2943 +            fp->fExtra[opValue]   = fp->fExtra[opValue+2];   // Tentative start becomes real.
  1.2944 +            fp->fExtra[opValue+1] = fp->fInputIdx;           // End position
  1.2945 +            U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
  1.2946 +            break;
  1.2947 +
  1.2948 +
  1.2949 +        case URX_DOLLAR:                   //  $, test for End of line
  1.2950 +                                           //     or for position before new line at end of input
  1.2951 +            {
  1.2952 +                if (fp->fInputIdx >= fAnchorLimit) {
  1.2953 +                    // We really are at the end of input.  Success.
  1.2954 +                    fHitEnd = TRUE;
  1.2955 +                    fRequireEnd = TRUE;
  1.2956 +                    break;
  1.2957 +                }
  1.2958 +                
  1.2959 +                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
  1.2960 +                
  1.2961 +                // If we are positioned just before a new-line that is located at the
  1.2962 +                //   end of input, succeed.
  1.2963 +                UChar32 c = UTEXT_NEXT32(fInputText);
  1.2964 +                if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
  1.2965 +                    if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) {
  1.2966 +                        // If not in the middle of a CR/LF sequence
  1.2967 +                      if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) {
  1.2968 +                            // At new-line at end of input. Success
  1.2969 +                            fHitEnd = TRUE;
  1.2970 +                            fRequireEnd = TRUE;
  1.2971 +                            
  1.2972 +                            break;
  1.2973 +                        }
  1.2974 +                    }
  1.2975 +                } else {
  1.2976 +                    UChar32 nextC = UTEXT_NEXT32(fInputText);
  1.2977 +                    if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
  1.2978 +                        fHitEnd = TRUE;
  1.2979 +                        fRequireEnd = TRUE;
  1.2980 +                        break;                         // At CR/LF at end of input.  Success
  1.2981 +                    }
  1.2982 +                }
  1.2983 +
  1.2984 +                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.2985 +            }
  1.2986 +            break;
  1.2987 +
  1.2988 +
  1.2989 +         case URX_DOLLAR_D:                   //  $, test for End of Line, in UNIX_LINES mode.
  1.2990 +            if (fp->fInputIdx >= fAnchorLimit) {
  1.2991 +                // Off the end of input.  Success.
  1.2992 +                fHitEnd = TRUE;
  1.2993 +                fRequireEnd = TRUE;
  1.2994 +                break;
  1.2995 +            } else {
  1.2996 +                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
  1.2997 +                UChar32 c = UTEXT_NEXT32(fInputText);
  1.2998 +                // Either at the last character of input, or off the end.
  1.2999 +                if (c == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) == fAnchorLimit) {
  1.3000 +                    fHitEnd = TRUE;
  1.3001 +                    fRequireEnd = TRUE;
  1.3002 +                    break;
  1.3003 +                }
  1.3004 +            }
  1.3005 +
  1.3006 +            // Not at end of input.  Back-track out.
  1.3007 +            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.3008 +            break;
  1.3009 +
  1.3010 +
  1.3011 +         case URX_DOLLAR_M:                //  $, test for End of line in multi-line mode
  1.3012 +             {
  1.3013 +                 if (fp->fInputIdx >= fAnchorLimit) {
  1.3014 +                     // We really are at the end of input.  Success.
  1.3015 +                     fHitEnd = TRUE;
  1.3016 +                     fRequireEnd = TRUE;
  1.3017 +                     break;
  1.3018 +                 }
  1.3019 +                 // If we are positioned just before a new-line, succeed.
  1.3020 +                 // It makes no difference where the new-line is within the input.
  1.3021 +                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
  1.3022 +                 UChar32 c = UTEXT_CURRENT32(fInputText);
  1.3023 +                 if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) {
  1.3024 +                     // At a line end, except for the odd chance of  being in the middle of a CR/LF sequence
  1.3025 +                     //  In multi-line mode, hitting a new-line just before the end of input does not
  1.3026 +                     //   set the hitEnd or requireEnd flags
  1.3027 +                     if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && UTEXT_PREVIOUS32(fInputText)==0x0d)) {
  1.3028 +                        break;
  1.3029 +                     }
  1.3030 +                 }
  1.3031 +                 // not at a new line.  Fail.
  1.3032 +                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.3033 +             }
  1.3034 +             break;
  1.3035 +
  1.3036 +
  1.3037 +         case URX_DOLLAR_MD:                //  $, test for End of line in multi-line and UNIX_LINES mode
  1.3038 +             {
  1.3039 +                 if (fp->fInputIdx >= fAnchorLimit) {
  1.3040 +                     // We really are at the end of input.  Success.
  1.3041 +                     fHitEnd = TRUE;
  1.3042 +                     fRequireEnd = TRUE;  // Java set requireEnd in this case, even though
  1.3043 +                     break;               //   adding a new-line would not lose the match.
  1.3044 +                 }
  1.3045 +                 // If we are not positioned just before a new-line, the test fails; backtrack out.
  1.3046 +                 // It makes no difference where the new-line is within the input.
  1.3047 +                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
  1.3048 +                 if (UTEXT_CURRENT32(fInputText) != 0x0a) {
  1.3049 +                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.3050 +                 }
  1.3051 +             }
  1.3052 +             break;
  1.3053 +
  1.3054 +
  1.3055 +       case URX_CARET:                    //  ^, test for start of line
  1.3056 +            if (fp->fInputIdx != fAnchorStart) {
  1.3057 +                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.3058 +            }
  1.3059 +            break;
  1.3060 +
  1.3061 +
  1.3062 +       case URX_CARET_M:                   //  ^, test for start of line in multi-line mode
  1.3063 +           {
  1.3064 +               if (fp->fInputIdx == fAnchorStart) {
  1.3065 +                   // We are at the start of input.  Success.
  1.3066 +                   break;
  1.3067 +               }
  1.3068 +               // Check whether character just before the current pos is a new-line
  1.3069 +               //   unless we are at the end of input
  1.3070 +               UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
  1.3071 +               UChar32  c = UTEXT_PREVIOUS32(fInputText); 
  1.3072 +               if ((fp->fInputIdx < fAnchorLimit) && 
  1.3073 +                   ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
  1.3074 +                   //  It's a new-line.  ^ is true.  Success.
  1.3075 +                   //  TODO:  what should be done with positions between a CR and LF?
  1.3076 +                   break;
  1.3077 +               }
  1.3078 +               // Not at the start of a line.  Fail.
  1.3079 +               fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.3080 +           }
  1.3081 +           break;
  1.3082 +
  1.3083 +
  1.3084 +       case URX_CARET_M_UNIX:       //  ^, test for start of line in multi-line + Unix-line mode
  1.3085 +           {
  1.3086 +               U_ASSERT(fp->fInputIdx >= fAnchorStart);
  1.3087 +               if (fp->fInputIdx <= fAnchorStart) {
  1.3088 +                   // We are at the start of input.  Success.
  1.3089 +                   break;
  1.3090 +               }
  1.3091 +               // Check whether character just before the current pos is a new-line
  1.3092 +               U_ASSERT(fp->fInputIdx <= fAnchorLimit);
  1.3093 +               UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
  1.3094 +               UChar32  c = UTEXT_PREVIOUS32(fInputText);
  1.3095 +               if (c != 0x0a) {
  1.3096 +                   // Not at the start of a line.  Back-track out.
  1.3097 +                   fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.3098 +               }
  1.3099 +           }
  1.3100 +           break;
  1.3101 +
  1.3102 +        case URX_BACKSLASH_B:          // Test for word boundaries
  1.3103 +            {
  1.3104 +                UBool success = isWordBoundary(fp->fInputIdx);
  1.3105 +                success ^= (UBool)(opValue != 0);     // flip sense for \B
  1.3106 +                if (!success) {
  1.3107 +                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.3108 +                }
  1.3109 +            }
  1.3110 +            break;
  1.3111 +
  1.3112 +
  1.3113 +        case URX_BACKSLASH_BU:          // Test for word boundaries, Unicode-style
  1.3114 +            {
  1.3115 +                UBool success = isUWordBoundary(fp->fInputIdx);
  1.3116 +                success ^= (UBool)(opValue != 0);     // flip sense for \B
  1.3117 +                if (!success) {
  1.3118 +                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.3119 +                }
  1.3120 +            }
  1.3121 +            break;
  1.3122 +
  1.3123 +
  1.3124 +        case URX_BACKSLASH_D:            // Test for decimal digit
  1.3125 +            {
  1.3126 +                if (fp->fInputIdx >= fActiveLimit) {
  1.3127 +                    fHitEnd = TRUE;
  1.3128 +                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.3129 +                    break;
  1.3130 +                }
  1.3131 +
  1.3132 +                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
  1.3133 +
  1.3134 +                UChar32 c = UTEXT_NEXT32(fInputText);
  1.3135 +                int8_t ctype = u_charType(c);     // TODO:  make a unicode set for this.  Will be faster.
  1.3136 +                UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
  1.3137 +                success ^= (UBool)(opValue != 0);        // flip sense for \D
  1.3138 +                if (success) {
  1.3139 +                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
  1.3140 +                } else {
  1.3141 +                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.3142 +                }
  1.3143 +            }
  1.3144 +            break;
  1.3145 +
  1.3146 +
  1.3147 +        case URX_BACKSLASH_G:          // Test for position at end of previous match
  1.3148 +            if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
  1.3149 +                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.3150 +            }
  1.3151 +            break;
  1.3152 +
  1.3153 +
  1.3154 +        case URX_BACKSLASH_X:     
  1.3155 +            //  Match a Grapheme, as defined by Unicode TR 29.
  1.3156 +            //  Differs slightly from Perl, which consumes combining marks independently
  1.3157 +            //    of context.
  1.3158 +            {
  1.3159 +
  1.3160 +                // Fail if at end of input
  1.3161 +                if (fp->fInputIdx >= fActiveLimit) {
  1.3162 +                    fHitEnd = TRUE;
  1.3163 +                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.3164 +                    break;
  1.3165 +                }
  1.3166 +                
  1.3167 +                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
  1.3168 +
  1.3169 +                // Examine (and consume) the current char.
  1.3170 +                //   Dispatch into a little state machine, based on the char.
  1.3171 +                UChar32  c;
  1.3172 +                c = UTEXT_NEXT32(fInputText);
  1.3173 +                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
  1.3174 +                UnicodeSet **sets = fPattern->fStaticSets;
  1.3175 +                if (sets[URX_GC_NORMAL]->contains(c))  goto GC_Extend;
  1.3176 +                if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
  1.3177 +                if (sets[URX_GC_L]->contains(c))       goto GC_L;
  1.3178 +                if (sets[URX_GC_LV]->contains(c))      goto GC_V;
  1.3179 +                if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
  1.3180 +                if (sets[URX_GC_V]->contains(c))       goto GC_V;
  1.3181 +                if (sets[URX_GC_T]->contains(c))       goto GC_T;
  1.3182 +                goto GC_Extend;
  1.3183 +
  1.3184 +
  1.3185 +
  1.3186 +GC_L:
  1.3187 +                if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
  1.3188 +                c = UTEXT_NEXT32(fInputText);
  1.3189 +                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
  1.3190 +                if (sets[URX_GC_L]->contains(c))       goto GC_L;
  1.3191 +                if (sets[URX_GC_LV]->contains(c))      goto GC_V;
  1.3192 +                if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
  1.3193 +                if (sets[URX_GC_V]->contains(c))       goto GC_V;
  1.3194 +                (void)UTEXT_PREVIOUS32(fInputText);
  1.3195 +                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
  1.3196 +                goto GC_Extend;
  1.3197 +
  1.3198 +GC_V:
  1.3199 +                if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
  1.3200 +                c = UTEXT_NEXT32(fInputText);
  1.3201 +                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
  1.3202 +                if (sets[URX_GC_V]->contains(c))       goto GC_V;
  1.3203 +                if (sets[URX_GC_T]->contains(c))       goto GC_T;
  1.3204 +                (void)UTEXT_PREVIOUS32(fInputText);
  1.3205 +                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
  1.3206 +                goto GC_Extend;
  1.3207 +
  1.3208 +GC_T:
  1.3209 +                if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
  1.3210 +                c = UTEXT_NEXT32(fInputText);
  1.3211 +                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
  1.3212 +                if (sets[URX_GC_T]->contains(c))       goto GC_T;
  1.3213 +                (void)UTEXT_PREVIOUS32(fInputText);
  1.3214 +                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
  1.3215 +                goto GC_Extend;
  1.3216 +
  1.3217 +GC_Extend:
  1.3218 +                // Combining characters are consumed here
  1.3219 +                for (;;) {
  1.3220 +                    if (fp->fInputIdx >= fActiveLimit) {
  1.3221 +                        break;
  1.3222 +                    }
  1.3223 +                    c = UTEXT_CURRENT32(fInputText);
  1.3224 +                    if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
  1.3225 +                        break;
  1.3226 +                    }
  1.3227 +                    (void)UTEXT_NEXT32(fInputText);
  1.3228 +                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
  1.3229 +                }
  1.3230 +                goto GC_Done;
  1.3231 +
  1.3232 +GC_Control:
  1.3233 +                // Most control chars stand alone (don't combine with combining chars),  
  1.3234 +                //   except that a CR/LF sequence is a single grapheme cluster.
  1.3235 +                if (c == 0x0d && fp->fInputIdx < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
  1.3236 +                    c = UTEXT_NEXT32(fInputText);
  1.3237 +                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
  1.3238 +                }
  1.3239 +
  1.3240 +GC_Done:
  1.3241 +                if (fp->fInputIdx >= fActiveLimit) {
  1.3242 +                    fHitEnd = TRUE;
  1.3243 +                }
  1.3244 +                break;
  1.3245 +            }
  1.3246 +            
  1.3247 +
  1.3248 +
  1.3249 +
  1.3250 +        case URX_BACKSLASH_Z:          // Test for end of Input
  1.3251 +            if (fp->fInputIdx < fAnchorLimit) {
  1.3252 +                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.3253 +            } else {
  1.3254 +                fHitEnd = TRUE;
  1.3255 +                fRequireEnd = TRUE;
  1.3256 +            }
  1.3257 +            break;
  1.3258 +
  1.3259 +
  1.3260 +
  1.3261 +        case URX_STATIC_SETREF:
  1.3262 +            {
  1.3263 +                // Test input character against one of the predefined sets
  1.3264 +                //    (Word Characters, for example)
  1.3265 +                // The high bit of the op value is a flag for the match polarity.
  1.3266 +                //    0:   success if input char is in set.
  1.3267 +                //    1:   success if input char is not in set.
  1.3268 +                if (fp->fInputIdx >= fActiveLimit) {
  1.3269 +                    fHitEnd = TRUE;
  1.3270 +                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.3271 +                    break;
  1.3272 +                }
  1.3273 +
  1.3274 +                UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);  
  1.3275 +                opValue &= ~URX_NEG_SET;
  1.3276 +                U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
  1.3277 +
  1.3278 +                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
  1.3279 +                UChar32 c = UTEXT_NEXT32(fInputText);
  1.3280 +                if (c < 256) {
  1.3281 +                    Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
  1.3282 +                    if (s8->contains(c)) {
  1.3283 +                        success = !success;
  1.3284 +                    }
  1.3285 +                } else {
  1.3286 +                    const UnicodeSet *s = fPattern->fStaticSets[opValue];
  1.3287 +                    if (s->contains(c)) {
  1.3288 +                        success = !success;
  1.3289 +                    }
  1.3290 +                }
  1.3291 +                if (success) {
  1.3292 +                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
  1.3293 +                } else {
  1.3294 +                    // The set test failed (after applying the polarity flag).  Backtrack.
  1.3295 +                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.3296 +                }
  1.3297 +            }
  1.3298 +            break;
  1.3299 +            
  1.3300 +
  1.3301 +        case URX_STAT_SETREF_N:
  1.3302 +            {
  1.3303 +                // Test input character for NOT being a member of  one of 
  1.3304 +                //    the predefined sets (Word Characters, for example)
  1.3305 +                if (fp->fInputIdx >= fActiveLimit) {
  1.3306 +                    fHitEnd = TRUE;
  1.3307 +                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.3308 +                    break;
  1.3309 +                }
  1.3310 +
  1.3311 +                U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
  1.3312 +
  1.3313 +                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
  1.3314 +                
  1.3315 +                UChar32 c = UTEXT_NEXT32(fInputText);
  1.3316 +                if (c < 256) {
  1.3317 +                    Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
  1.3318 +                    if (s8->contains(c) == FALSE) {
  1.3319 +                        fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
  1.3320 +                        break;
  1.3321 +                    }
  1.3322 +                } else {
  1.3323 +                    const UnicodeSet *s = fPattern->fStaticSets[opValue];
  1.3324 +                    if (s->contains(c) == FALSE) {
  1.3325 +                        fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
  1.3326 +                        break;
  1.3327 +                    }
  1.3328 +                }
  1.3329 +                // The character was in the set, so this negated test fails.  Backtrack.
  1.3330 +                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.3331 +            }
  1.3332 +            break;
  1.3333 +            
  1.3334 +
  1.3335 +        case URX_SETREF:
  1.3336 +            if (fp->fInputIdx >= fActiveLimit) {
  1.3337 +                fHitEnd = TRUE;
  1.3338 +                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.3339 +                break;
  1.3340 +            } else {
  1.3341 +                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
  1.3342 +                    
  1.3343 +                // There is input left.  Pick up one char and test it for set membership.
  1.3344 +                UChar32 c = UTEXT_NEXT32(fInputText);
  1.3345 +                U_ASSERT(opValue > 0 && opValue < sets->size());
  1.3346 +                if (c<256) {
  1.3347 +                    Regex8BitSet *s8 = &fPattern->fSets8[opValue];
  1.3348 +                    if (s8->contains(c)) {
  1.3349 +                        fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
  1.3350 +                        break;
  1.3351 +                    }
  1.3352 +                } else {
  1.3353 +                    UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
  1.3354 +                    if (s->contains(c)) {
  1.3355 +                        // The character is in the set.  A Match.
  1.3356 +                        fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
  1.3357 +                        break;
  1.3358 +                    }
  1.3359 +                }
  1.3360 +                
  1.3361 +                // the character wasn't in the set.
  1.3362 +                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.3363 +            }
  1.3364 +            break;
  1.3365 +
  1.3366 +
  1.3367 +        case URX_DOTANY:
  1.3368 +            {
  1.3369 +                // . matches anything, but stops at end-of-line.
  1.3370 +                if (fp->fInputIdx >= fActiveLimit) {
  1.3371 +                    // At end of input.  Match failed.  Backtrack out.
  1.3372 +                    fHitEnd = TRUE;
  1.3373 +                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.3374 +                    break;
  1.3375 +                }
  1.3376 +                
  1.3377 +                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
  1.3378 +                
  1.3379 +                // There is input left.  Advance over one char, unless we've hit end-of-line
  1.3380 +                UChar32 c = UTEXT_NEXT32(fInputText);
  1.3381 +                if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
  1.3382 +                    ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
  1.3383 +                    // End of line in normal mode.   . does not match.
  1.3384 +                        fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.3385 +                    break;
  1.3386 +                }
  1.3387 +                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
  1.3388 +            }
  1.3389 +            break;
  1.3390 +
  1.3391 +
  1.3392 +        case URX_DOTANY_ALL:
  1.3393 +            {
  1.3394 +                // ., in dot-matches-all (including new lines) mode
  1.3395 +                if (fp->fInputIdx >= fActiveLimit) {
  1.3396 +                    // At end of input.  Match failed.  Backtrack out.
  1.3397 +                    fHitEnd = TRUE;
  1.3398 +                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.3399 +                    break;
  1.3400 +                }
  1.3401 +                
  1.3402 +                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
  1.3403 +                
  1.3404 +                // There is input left.  Advance over one char, except if we are
  1.3405 +                //   at a cr/lf, advance over both of them.
  1.3406 +                UChar32 c; 
  1.3407 +                c = UTEXT_NEXT32(fInputText);
  1.3408 +                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
  1.3409 +                if (c==0x0d && fp->fInputIdx < fActiveLimit) {
  1.3410 +                    // In the case of a CR/LF, we need to advance over both.
  1.3411 +                    UChar32 nextc = UTEXT_CURRENT32(fInputText);
  1.3412 +                    if (nextc == 0x0a) {
  1.3413 +                        (void)UTEXT_NEXT32(fInputText);
  1.3414 +                        fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
  1.3415 +                    }
  1.3416 +                }
  1.3417 +            }
  1.3418 +            break;
  1.3419 +
  1.3420 +
  1.3421 +        case URX_DOTANY_UNIX:
  1.3422 +            {
  1.3423 +                // '.' operator, matches all, but stops at end-of-line.
  1.3424 +                //   UNIX_LINES mode, so 0x0a is the only recognized line ending.
  1.3425 +                if (fp->fInputIdx >= fActiveLimit) {
  1.3426 +                    // At end of input.  Match failed.  Backtrack out.
  1.3427 +                    fHitEnd = TRUE;
  1.3428 +                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.3429 +                    break;
  1.3430 +                }
  1.3431 +
  1.3432 +                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
  1.3433 +                
  1.3434 +                // There is input left.  Advance over one char, unless we've hit end-of-line
  1.3435 +                UChar32 c = UTEXT_NEXT32(fInputText);
  1.3436 +                if (c == 0x0a) {
  1.3437 +                    // End of line in UNIX_LINES mode.   '.' does not match the \n
  1.3438 +                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.3439 +                } else {
  1.3440 +                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
  1.3441 +                }
  1.3442 +            }
  1.3443 +            break;
  1.3444 +
  1.3445 +
  1.3446 +        case URX_JMP:
  1.3447 +            fp->fPatIdx = opValue;
  1.3448 +            break;
  1.3449 +
  1.3450 +        case URX_FAIL:
  1.3451 +            isMatch = FALSE;
  1.3452 +            goto breakFromLoop;
  1.3453 +
  1.3454 +        case URX_JMP_SAV:
  1.3455 +            U_ASSERT(opValue < fPattern->fCompiledPat->size());
  1.3456 +            fp = StateSave(fp, fp->fPatIdx, status);       // State save to loc following current
  1.3457 +            fp->fPatIdx = opValue;                         // Then JMP.
  1.3458 +            break;
  1.3459 +
  1.3460 +        case URX_JMP_SAV_X:
  1.3461 +            // This opcode is used with (x)+, when x can match a zero length string.
  1.3462 +            // Same as JMP_SAV, except conditional on the match having made forward progress.
  1.3463 +            // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
  1.3464 +            //   data address of the input position at the start of the loop.
  1.3465 +            {
  1.3466 +                U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
  1.3467 +                int32_t  stoOp = (int32_t)pat[opValue-1];
  1.3468 +                U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
  1.3469 +                int32_t  frameLoc = URX_VAL(stoOp);
  1.3470 +                U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
  1.3471 +                int64_t prevInputIdx = fp->fExtra[frameLoc];
  1.3472 +                U_ASSERT(prevInputIdx <= fp->fInputIdx);
  1.3473 +                if (prevInputIdx < fp->fInputIdx) {
  1.3474 +                    // The match did make progress.  Repeat the loop.
  1.3475 +                    fp = StateSave(fp, fp->fPatIdx, status);  // State save to loc following current
  1.3476 +                    fp->fPatIdx = opValue;
  1.3477 +                    fp->fExtra[frameLoc] = fp->fInputIdx;
  1.3478 +                } 
  1.3479 +                // If the input position did not advance, we do nothing here,
  1.3480 +                //   execution will fall out of the loop.
  1.3481 +            }
  1.3482 +            break;
  1.3483 +
  1.3484 +        case URX_CTR_INIT:
  1.3485 +            {
  1.3486 +                U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
  1.3487 +                fp->fExtra[opValue] = 0;                 //  Set the loop counter variable to zero
  1.3488 +
  1.3489 +                // Pick up the three extra operands that CTR_INIT has, and
  1.3490 +                //    skip the pattern location counter past them.
  1.3491 +                int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
  1.3492 +                fp->fPatIdx += 3;
  1.3493 +                int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
  1.3494 +                int32_t minCount = (int32_t)pat[instrOperandLoc+1];
  1.3495 +                int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
  1.3496 +                U_ASSERT(minCount>=0);
  1.3497 +                U_ASSERT(maxCount>=minCount || maxCount==-1);
  1.3498 +                U_ASSERT(loopLoc>=fp->fPatIdx);
  1.3499 +
  1.3500 +                if (minCount == 0) {
  1.3501 +                    fp = StateSave(fp, loopLoc+1, status);
  1.3502 +                }
  1.3503 +                if (maxCount == -1) {
  1.3504 +                    fp->fExtra[opValue+1] = fp->fInputIdx;   //  For loop breaking.
  1.3505 +                } else if (maxCount == 0) {
  1.3506 +                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.3507 +                }
  1.3508 +            }
  1.3509 +            break;
  1.3510 +
  1.3511 +        case URX_CTR_LOOP:
  1.3512 +            {
  1.3513 +                U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
  1.3514 +                int32_t initOp = (int32_t)pat[opValue];
  1.3515 +                U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
  1.3516 +                int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
  1.3517 +                int32_t minCount  = (int32_t)pat[opValue+2];
  1.3518 +                int32_t maxCount  = (int32_t)pat[opValue+3];
  1.3519 +                (*pCounter)++;
  1.3520 +                if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
  1.3521 +                    U_ASSERT(*pCounter == maxCount);
  1.3522 +                    break;
  1.3523 +                }
  1.3524 +                if (*pCounter >= minCount) {
  1.3525 +                    if (maxCount == -1) {
  1.3526 +                        // Loop has no hard upper bound.
  1.3527 +                        // Check that it is progressing through the input, break if it is not.
  1.3528 +                        int64_t *pLastInputIdx =  &fp->fExtra[URX_VAL(initOp) + 1];
  1.3529 +                        if (fp->fInputIdx == *pLastInputIdx) {
  1.3530 +                            break;
  1.3531 +                        } else {
  1.3532 +                            *pLastInputIdx = fp->fInputIdx;
  1.3533 +                        }
  1.3534 +                    }
  1.3535 +                    fp = StateSave(fp, fp->fPatIdx, status);
  1.3536 +                }
  1.3537 +                fp->fPatIdx = opValue + 4;    // Loop back.
  1.3538 +            }
  1.3539 +            break;
  1.3540 +
  1.3541 +        case URX_CTR_INIT_NG:
  1.3542 +            {
  1.3543 +                // Initialize a non-greedy loop
  1.3544 +                U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
  1.3545 +                fp->fExtra[opValue] = 0;                 //  Set the loop counter variable to zero
  1.3546 +
  1.3547 +                // Pick up the three extra operands that CTR_INIT_NG has, and
  1.3548 +                //    skip the pattern location counter past them.
  1.3549 +                int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
  1.3550 +                fp->fPatIdx += 3;
  1.3551 +                int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
  1.3552 +                int32_t minCount = (int32_t)pat[instrOperandLoc+1];
  1.3553 +                int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
  1.3554 +                U_ASSERT(minCount>=0);
  1.3555 +                U_ASSERT(maxCount>=minCount || maxCount==-1);
  1.3556 +                U_ASSERT(loopLoc>fp->fPatIdx);
  1.3557 +                if (maxCount == -1) {
  1.3558 +                    fp->fExtra[opValue+1] = fp->fInputIdx;   //  Save initial input index for loop breaking.
  1.3559 +                }
  1.3560 +
  1.3561 +                if (minCount == 0) {
  1.3562 +                    if (maxCount != 0) {
  1.3563 +                        fp = StateSave(fp, fp->fPatIdx, status);
  1.3564 +                    }
  1.3565 +                    fp->fPatIdx = loopLoc+1;   // Continue with stuff after repeated block
  1.3566 +                } 
  1.3567 +            }
  1.3568 +            break;
  1.3569 +
  1.3570 +        case URX_CTR_LOOP_NG:
  1.3571 +            {
  1.3572 +                // Non-greedy {min, max} loops
  1.3573 +                U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
  1.3574 +                int32_t initOp = (int32_t)pat[opValue];
  1.3575 +                U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
  1.3576 +                int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
  1.3577 +                int32_t minCount  = (int32_t)pat[opValue+2];
  1.3578 +                int32_t maxCount  = (int32_t)pat[opValue+3];
  1.3579 +
  1.3580 +                (*pCounter)++;
  1.3581 +                if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
  1.3582 +                    // The loop has matched the maximum permitted number of times.
  1.3583 +                    //   Break out of here with no action.  Matching will
  1.3584 +                    //   continue with the following pattern.
  1.3585 +                    U_ASSERT(*pCounter == maxCount);
  1.3586 +                    break;
  1.3587 +                }
  1.3588 +
  1.3589 +                if (*pCounter < minCount) {
  1.3590 +                    // We haven't met the minimum number of matches yet.
  1.3591 +                    //   Loop back for another one.
  1.3592 +                    fp->fPatIdx = opValue + 4;    // Loop back.
  1.3593 +                } else {
  1.3594 +                    // We do have the minimum number of matches.
  1.3595 +
  1.3596 +                    // If there is no upper bound on the loop iterations, check that the input index
  1.3597 +                    // is progressing, and stop the loop if it is not.
  1.3598 +                    if (maxCount == -1) {
  1.3599 +                        int64_t *pLastInputIdx =  &fp->fExtra[URX_VAL(initOp) + 1];
  1.3600 +                        if (fp->fInputIdx == *pLastInputIdx) {
  1.3601 +                            break;
  1.3602 +                        }
  1.3603 +                        *pLastInputIdx = fp->fInputIdx;
  1.3604 +                    }
  1.3605 +
  1.3606 +                    // Loop Continuation: we will fall into the pattern following the loop
  1.3607 +                    //   (non-greedy, don't execute loop body first), but first do
  1.3608 +                    //   a state save to the top of the loop, so that a match failure
  1.3609 +                    //   in the following pattern will try another iteration of the loop.
  1.3610 +                    fp = StateSave(fp, opValue + 4, status);
  1.3611 +                }
  1.3612 +            }
  1.3613 +            break;
  1.3614 +
  1.3615 +        case URX_STO_SP:
  1.3616 +            U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
  1.3617 +            fData[opValue] = fStack->size();
  1.3618 +            break;
  1.3619 +
  1.3620 +        case URX_LD_SP:
  1.3621 +            {
  1.3622 +                U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
  1.3623 +                int32_t newStackSize = (int32_t)fData[opValue];
  1.3624 +                U_ASSERT(newStackSize <= fStack->size());
  1.3625 +                int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
  1.3626 +                if (newFP == (int64_t *)fp) {
  1.3627 +                    break;
  1.3628 +                }
  1.3629 +                int32_t i;
  1.3630 +                for (i=0; i<fFrameSize; i++) {
  1.3631 +                    newFP[i] = ((int64_t *)fp)[i];
  1.3632 +                }
  1.3633 +                fp = (REStackFrame *)newFP;
  1.3634 +                fStack->setSize(newStackSize);
  1.3635 +            }
  1.3636 +            break;
  1.3637 +
  1.3638 +        case URX_BACKREF:
  1.3639 +            {
  1.3640 +                U_ASSERT(opValue < fFrameSize);
  1.3641 +                int64_t groupStartIdx = fp->fExtra[opValue];
  1.3642 +                int64_t groupEndIdx   = fp->fExtra[opValue+1];
  1.3643 +                U_ASSERT(groupStartIdx <= groupEndIdx);
  1.3644 +                if (groupStartIdx < 0) {
  1.3645 +                    // This capture group has not participated in the match thus far,
  1.3646 +                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
  1.3647 +                    break;
  1.3648 +                }
  1.3649 +                UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx);
  1.3650 +                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
  1.3651 +
  1.3652 +                //   Note: if the capture group match was of an empty string the backref
  1.3653 +                //         match succeeds.  Verified by testing:  Perl matches succeed 
  1.3654 +                //         in this case, so we do too.
  1.3655 +                
  1.3656 +                UBool success = TRUE;
  1.3657 +                for (;;) {
  1.3658 +                    if (utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
  1.3659 +                        success = TRUE;
  1.3660 +                        break;
  1.3661 +                    }
  1.3662 +                    if (utext_getNativeIndex(fInputText) >= fActiveLimit) {
  1.3663 +                        success = FALSE;
  1.3664 +                        fHitEnd = TRUE;
  1.3665 +                        break;
  1.3666 +                    }
  1.3667 +                    UChar32 captureGroupChar = utext_next32(fAltInputText);
  1.3668 +                    UChar32 inputChar = utext_next32(fInputText);
  1.3669 +                    if (inputChar != captureGroupChar) {
  1.3670 +                        success = FALSE;
  1.3671 +                        break;
  1.3672 +                    }
  1.3673 +                }
  1.3674 +
  1.3675 +                if (success) {
  1.3676 +                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
  1.3677 +                } else {
  1.3678 +                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.3679 +                }
  1.3680 +            }
  1.3681 +            break;
  1.3682 +
  1.3683 +
  1.3684 +
  1.3685 +        case URX_BACKREF_I:
  1.3686 +            {
  1.3687 +                U_ASSERT(opValue < fFrameSize);
  1.3688 +                int64_t groupStartIdx = fp->fExtra[opValue];
  1.3689 +                int64_t groupEndIdx   = fp->fExtra[opValue+1];
  1.3690 +                U_ASSERT(groupStartIdx <= groupEndIdx);
  1.3691 +                if (groupStartIdx < 0) {
  1.3692 +                    // This capture group has not participated in the match thus far,
  1.3693 +                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
  1.3694 +                    break;
  1.3695 +                }
  1.3696 +                utext_setNativeIndex(fAltInputText, groupStartIdx);
  1.3697 +                utext_setNativeIndex(fInputText, fp->fInputIdx);
  1.3698 +                CaseFoldingUTextIterator captureGroupItr(*fAltInputText);
  1.3699 +                CaseFoldingUTextIterator inputItr(*fInputText);
  1.3700 +
  1.3701 +                //   Note: if the capture group match was of an empty string the backref
  1.3702 +                //         match succeeds.  Verified by testing:  Perl matches succeed 
  1.3703 +                //         in this case, so we do too.
  1.3704 +                
  1.3705 +                UBool success = TRUE;
  1.3706 +                for (;;) {
  1.3707 +                    if (!captureGroupItr.inExpansion() && utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
  1.3708 +                        success = TRUE;
  1.3709 +                        break;
  1.3710 +                    }
  1.3711 +                    if (!inputItr.inExpansion() && utext_getNativeIndex(fInputText) >= fActiveLimit) {
  1.3712 +                        success = FALSE;
  1.3713 +                        fHitEnd = TRUE;
  1.3714 +                        break;
  1.3715 +                    }
  1.3716 +                    UChar32 captureGroupChar = captureGroupItr.next();
  1.3717 +                    UChar32 inputChar = inputItr.next();
  1.3718 +                    if (inputChar != captureGroupChar) {
  1.3719 +                        success = FALSE;
  1.3720 +                        break;
  1.3721 +                    }
  1.3722 +                }
  1.3723 +
  1.3724 +                if (success && inputItr.inExpansion()) {
  1.3725 +                    // We obtained a match by consuming part of a string obtained from 
  1.3726 +                    // case-folding a single code point of the input text.  
  1.3727 +                    // This does not count as an overall match.
  1.3728 +                    success = FALSE;
  1.3729 +                }
  1.3730 +
  1.3731 +                if (success) {
  1.3732 +                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
  1.3733 +                } else {
  1.3734 +                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.3735 +                }
  1.3736 + 
  1.3737 +            }
  1.3738 +            break;
  1.3739 +                
  1.3740 +        case URX_STO_INP_LOC:
  1.3741 +            {
  1.3742 +                U_ASSERT(opValue >= 0 && opValue < fFrameSize);
  1.3743 +                fp->fExtra[opValue] = fp->fInputIdx;
  1.3744 +            }
  1.3745 +            break;
  1.3746 +
  1.3747 +        case URX_JMPX:
  1.3748 +            {
  1.3749 +                int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
  1.3750 +                fp->fPatIdx += 1;
  1.3751 +                int32_t dataLoc  = URX_VAL(pat[instrOperandLoc]);
  1.3752 +                U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
  1.3753 +                int64_t savedInputIdx = fp->fExtra[dataLoc];
  1.3754 +                U_ASSERT(savedInputIdx <= fp->fInputIdx);
  1.3755 +                if (savedInputIdx < fp->fInputIdx) {
  1.3756 +                    fp->fPatIdx = opValue;                               // JMP
  1.3757 +                } else {
  1.3758 +                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no progress in loop.
  1.3759 +                }
  1.3760 +            }
  1.3761 +            break;
  1.3762 +
  1.3763 +        case URX_LA_START:
  1.3764 +            {
  1.3765 +                // Entering a lookahead block.
  1.3766 +                // Save Stack Ptr, Input Pos.
  1.3767 +                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
  1.3768 +                fData[opValue]   = fStack->size();
  1.3769 +                fData[opValue+1] = fp->fInputIdx;
  1.3770 +                fActiveStart     = fLookStart;          // Set the match region change for
  1.3771 +                fActiveLimit     = fLookLimit;          //   transparent bounds.
  1.3772 +            }
  1.3773 +            break;
  1.3774 +
  1.3775 +        case URX_LA_END:
  1.3776 +            {
  1.3777 +                // Leaving a look-ahead block.
  1.3778 +                //  restore Stack Ptr, Input Pos to positions they had on entry to block.
  1.3779 +                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
  1.3780 +                int32_t stackSize = fStack->size();
  1.3781 +                int32_t newStackSize =(int32_t)fData[opValue];
  1.3782 +                U_ASSERT(stackSize >= newStackSize);
  1.3783 +                if (stackSize > newStackSize) {
  1.3784 +                    // Copy the current top frame back to the new (cut back) top frame.
  1.3785 +                    //   This makes the capture groups from within the look-ahead
  1.3786 +                    //   expression available.
  1.3787 +                    int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
  1.3788 +                    int32_t i;
  1.3789 +                    for (i=0; i<fFrameSize; i++) {
  1.3790 +                        newFP[i] = ((int64_t *)fp)[i];
  1.3791 +                    }
  1.3792 +                    fp = (REStackFrame *)newFP;
  1.3793 +                    fStack->setSize(newStackSize);
  1.3794 +                }
  1.3795 +                fp->fInputIdx = fData[opValue+1];
  1.3796 +
  1.3797 +                // Restore the active region bounds in the input string; they may have
  1.3798 +                //    been changed because of transparent bounds on a Region.
  1.3799 +                fActiveStart = fRegionStart;
  1.3800 +                fActiveLimit = fRegionLimit;
  1.3801 +            }
  1.3802 +            break;
  1.3803 +
  1.3804 +        case URX_ONECHAR_I:
  1.3805 +            // Case insensitive one char.  The char from the pattern is already case folded.
  1.3806 +            // Input text is not, but case folding the input can not reduce two or more code
  1.3807 +            // points to one.
  1.3808 +            if (fp->fInputIdx < fActiveLimit) {
  1.3809 +                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
  1.3810 +
  1.3811 +                UChar32 c = UTEXT_NEXT32(fInputText);
  1.3812 +                if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
  1.3813 +                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
  1.3814 +                    break;
  1.3815 +                }
  1.3816 +            } else {
  1.3817 +                fHitEnd = TRUE;
  1.3818 +            }
  1.3819 +            
  1.3820 +            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.3821 +            break;
  1.3822 +
  1.3823 +        case URX_STRING_I:
  1.3824 +            {
  1.3825 +                // Case-insensitive test input against a literal string.
  1.3826 +                // Strings require two slots in the compiled pattern, one for the
  1.3827 +                //   offset to the string text, and one for the length.
  1.3828 +                //   The compiled string has already been case folded.
  1.3829 +                {
  1.3830 +                    const UChar *patternString = litText + opValue;
  1.3831 +                    int32_t      patternStringIdx  = 0;
  1.3832 +
  1.3833 +                    op      = (int32_t)pat[fp->fPatIdx];
  1.3834 +                    fp->fPatIdx++;
  1.3835 +                    opType  = URX_TYPE(op);
  1.3836 +                    opValue = URX_VAL(op);
  1.3837 +                    U_ASSERT(opType == URX_STRING_LEN);
  1.3838 +                    int32_t patternStringLen = opValue;  // Length of the string from the pattern.
  1.3839 +                
  1.3840 +                    
  1.3841 +                    UChar32   cPattern;
  1.3842 +                    UChar32   cText;
  1.3843 +                    UBool     success = TRUE;
  1.3844 +
  1.3845 +                    UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
  1.3846 +                    CaseFoldingUTextIterator inputIterator(*fInputText);
  1.3847 +                    while (patternStringIdx < patternStringLen) {
  1.3848 +                        if (!inputIterator.inExpansion() && UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
  1.3849 +                            success = FALSE;
  1.3850 +                            fHitEnd = TRUE;
  1.3851 +                            break;
  1.3852 +                        }
  1.3853 +                        U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
  1.3854 +                        cText = inputIterator.next();
  1.3855 +                        if (cText != cPattern) {
  1.3856 +                            success = FALSE;
  1.3857 +                            break;
  1.3858 +                        }
  1.3859 +                    }
  1.3860 +                    if (inputIterator.inExpansion()) {
  1.3861 +                        success = FALSE;
  1.3862 +                    }
  1.3863 +
  1.3864 +                    if (success) {
  1.3865 +                        fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
  1.3866 +                    } else {
  1.3867 +                        fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.3868 +                    }
  1.3869 +                }
  1.3870 +            }
  1.3871 +            break;
  1.3872 +
  1.3873 +        case URX_LB_START:
  1.3874 +            {
  1.3875 +                // Entering a look-behind block.
  1.3876 +                // Save Stack Ptr, Input Pos.
  1.3877 +                //   TODO:  implement transparent bounds.  Ticket #6067
  1.3878 +                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
  1.3879 +                fData[opValue]   = fStack->size();
  1.3880 +                fData[opValue+1] = fp->fInputIdx;
  1.3881 +                // Init the variable containing the start index for attempted matches.
  1.3882 +                fData[opValue+2] = -1;
  1.3883 +                // Save input string length, then reset to pin any matches to end at
  1.3884 +                //   the current position.
  1.3885 +                fData[opValue+3] = fActiveLimit;
  1.3886 +                fActiveLimit     = fp->fInputIdx;
  1.3887 +            }
  1.3888 +            break;
  1.3889 +
  1.3890 +
  1.3891 +        case URX_LB_CONT:
  1.3892 +            {
  1.3893 +                // Positive Look-Behind, at top of loop checking for matches of LB expression
  1.3894 +                //    at all possible input starting positions.
  1.3895 +
  1.3896 +                // Fetch the min and max possible match lengths.  They are the operands
  1.3897 +                //   of this op in the pattern.
  1.3898 +                int32_t minML = (int32_t)pat[fp->fPatIdx++];
  1.3899 +                int32_t maxML = (int32_t)pat[fp->fPatIdx++];
  1.3900 +                U_ASSERT(minML <= maxML);
  1.3901 +                U_ASSERT(minML >= 0);
  1.3902 +
  1.3903 +                // Fetch (from data) the last input index where a match was attempted.
  1.3904 +                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
  1.3905 +                int64_t  *lbStartIdx = &fData[opValue+2];
  1.3906 +                if (*lbStartIdx < 0) {
  1.3907 +                    // First time through loop.
  1.3908 +                    *lbStartIdx = fp->fInputIdx - minML;
  1.3909 +                } else {
  1.3910 +                    // 2nd through nth time through the loop.
  1.3911 +                    // Back up start position for match by one.
  1.3912 +                    if (*lbStartIdx == 0) {
  1.3913 +                        (*lbStartIdx)--;
  1.3914 +                    } else {
  1.3915 +                        UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx);
  1.3916 +                        (void)UTEXT_PREVIOUS32(fInputText);
  1.3917 +                        *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
  1.3918 +                    }
  1.3919 +                }
  1.3920 +
  1.3921 +                if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
  1.3922 +                    // We have tried all potential match starting points without
  1.3923 +                    //  getting a match.  Backtrack out, and out of the
  1.3924 +                    //   Look Behind altogether.
  1.3925 +                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.3926 +                    int64_t restoreInputLen = fData[opValue+3];
  1.3927 +                    U_ASSERT(restoreInputLen >= fActiveLimit);
  1.3928 +                    U_ASSERT(restoreInputLen <= fInputLength);
  1.3929 +                    fActiveLimit = restoreInputLen;
  1.3930 +                    break;
  1.3931 +                }
  1.3932 +
  1.3933 +                //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
  1.3934 +                //      (successful match will fall off the end of the loop.)
  1.3935 +                fp = StateSave(fp, fp->fPatIdx-3, status);
  1.3936 +                fp->fInputIdx = *lbStartIdx;
  1.3937 +            }
  1.3938 +            break;
  1.3939 +
  1.3940 +        case URX_LB_END:
  1.3941 +            // End of a look-behind block, after a successful match.
  1.3942 +            {
  1.3943 +                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
  1.3944 +                if (fp->fInputIdx != fActiveLimit) {
  1.3945 +                    //  The look-behind expression matched, but the match did not
  1.3946 +                    //    extend all the way to the point that we are looking behind from.
  1.3947 +                    //  FAIL out of here, which will take us back to the LB_CONT, which
  1.3948 +                    //     will retry the match starting at another position or fail
  1.3949 +                    //     the look-behind altogether, whichever is appropriate.
  1.3950 +                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.3951 +                    break;
  1.3952 +                }
  1.3953 +
  1.3954 +                // Look-behind match is good.  Restore the original input string length,
  1.3955 +                //   which had been truncated to pin the end of the lookbehind match to the 
  1.3956 +                //   position being looked-behind.
  1.3957 +                int64_t originalInputLen = fData[opValue+3];
  1.3958 +                U_ASSERT(originalInputLen >= fActiveLimit);
  1.3959 +                U_ASSERT(originalInputLen <= fInputLength);
  1.3960 +                fActiveLimit = originalInputLen;
  1.3961 +            }
  1.3962 +            break;
  1.3963 +
  1.3964 +
  1.3965 +        case URX_LBN_CONT:
  1.3966 +            {
  1.3967 +                // Negative Look-Behind, at top of loop checking for matches of LB expression
  1.3968 +                //    at all possible input starting positions.
  1.3969 +
  1.3970 +                // Fetch the extra parameters of this op.
  1.3971 +                int32_t minML       = (int32_t)pat[fp->fPatIdx++];
  1.3972 +                int32_t maxML       = (int32_t)pat[fp->fPatIdx++];
  1.3973 +                int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
  1.3974 +                        continueLoc = URX_VAL(continueLoc);
  1.3975 +                U_ASSERT(minML <= maxML);
  1.3976 +                U_ASSERT(minML >= 0);
  1.3977 +                U_ASSERT(continueLoc > fp->fPatIdx);
  1.3978 +
  1.3979 +                // Fetch (from data) the last input index where a match was attempted.
  1.3980 +                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
  1.3981 +                int64_t  *lbStartIdx = &fData[opValue+2];
  1.3982 +                if (*lbStartIdx < 0) {
  1.3983 +                    // First time through loop.
  1.3984 +                    *lbStartIdx = fp->fInputIdx - minML;
  1.3985 +                } else {
  1.3986 +                    // 2nd through nth time through the loop.
  1.3987 +                    // Back up start position for match by one.
  1.3988 +                    if (*lbStartIdx == 0) {
  1.3989 +                        (*lbStartIdx)--;
  1.3990 +                    } else {
  1.3991 +                        UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx);
  1.3992 +                        (void)UTEXT_PREVIOUS32(fInputText);
  1.3993 +                        *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
  1.3994 +                    }
  1.3995 +                }
  1.3996 +
  1.3997 +                if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
  1.3998 +                    // We have tried all potential match starting points without
  1.3999 +                    //  getting a match, which means that the negative lookbehind as
  1.4000 +                    //  a whole has succeeded.  Jump forward to the continue location
  1.4001 +                    int64_t restoreInputLen = fData[opValue+3];
  1.4002 +                    U_ASSERT(restoreInputLen >= fActiveLimit);
  1.4003 +                    U_ASSERT(restoreInputLen <= fInputLength);
  1.4004 +                    fActiveLimit = restoreInputLen;
  1.4005 +                    fp->fPatIdx = continueLoc;
  1.4006 +                    break;
  1.4007 +                }
  1.4008 +
  1.4009 +                //    Save state to this URX_LBN_CONT op, so failure to match will repeat the loop.
  1.4010 +                //      (successful match will cause a FAIL out of the loop altogether.)
  1.4011 +                fp = StateSave(fp, fp->fPatIdx-4, status);
  1.4012 +                fp->fInputIdx = *lbStartIdx;
  1.4013 +            }
  1.4014 +            break;
  1.4015 +
  1.4016 +        case URX_LBN_END:
  1.4017 +            // End of a negative look-behind block, after a successful match.
  1.4018 +            {
  1.4019 +                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
  1.4020 +                if (fp->fInputIdx != fActiveLimit) {
  1.4021 +                    //  The look-behind expression matched, but the match did not
  1.4022 +                    //    extend all the way to the point that we are looking behind from.
  1.4023 +                    //  FAIL out of here, which will take us back to the LBN_CONT, which
  1.4024 +                    //     will retry the match starting at another position or succeed
  1.4025 +                    //     the look-behind altogether, whichever is appropriate.
  1.4026 +                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.4027 +                    break;
  1.4028 +                }
  1.4029 +
  1.4030 +                // Look-behind expression matched, which means look-behind test as
  1.4031 +                //   a whole Fails
  1.4032 +                
  1.4033 +                //   Restore the original input string length, which had been truncated 
  1.4034 +                //   in order to pin the end of the lookbehind match  
  1.4035 +                //   to the position being looked-behind.
  1.4036 +                int64_t originalInputLen = fData[opValue+3];
  1.4037 +                U_ASSERT(originalInputLen >= fActiveLimit);
  1.4038 +                U_ASSERT(originalInputLen <= fInputLength);
  1.4039 +                fActiveLimit = originalInputLen;
  1.4040 +
  1.4041 +                // Restore original stack position, discarding any state saved
  1.4042 +                //   by the successful pattern match.
  1.4043 +                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
  1.4044 +                int32_t newStackSize = (int32_t)fData[opValue];
  1.4045 +                U_ASSERT(fStack->size() > newStackSize);
  1.4046 +                fStack->setSize(newStackSize);
  1.4047 +                
  1.4048 +                //  FAIL, which will take control back to someplace 
  1.4049 +                //  prior to entering the look-behind test.
  1.4050 +                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.4051 +            }
  1.4052 +            break;
  1.4053 +
  1.4054 +
  1.4055 +        case URX_LOOP_SR_I:
  1.4056 +            // Loop Initialization for the optimized implementation of
  1.4057 +            //     [some character set]*
  1.4058 +            //   This op scans through all matching input.
  1.4059 +            //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
  1.4060 +            {
  1.4061 +                U_ASSERT(opValue > 0 && opValue < sets->size());
  1.4062 +                Regex8BitSet *s8 = &fPattern->fSets8[opValue];
  1.4063 +                UnicodeSet   *s  = (UnicodeSet *)sets->elementAt(opValue);
  1.4064 +
  1.4065 +                // Loop through input, until either the input is exhausted or
  1.4066 +                //   we reach a character that is not a member of the set.
  1.4067 +                int64_t ix = fp->fInputIdx;
  1.4068 +                UTEXT_SETNATIVEINDEX(fInputText, ix);
  1.4069 +                for (;;) {
  1.4070 +                    if (ix >= fActiveLimit) {
  1.4071 +                        fHitEnd = TRUE;
  1.4072 +                        break;
  1.4073 +                    }
  1.4074 +                    UChar32 c = UTEXT_NEXT32(fInputText);
  1.4075 +                    if (c<256) {
  1.4076 +                        if (s8->contains(c) == FALSE) {
  1.4077 +                            break;
  1.4078 +                        }
  1.4079 +                    } else {
  1.4080 +                        if (s->contains(c) == FALSE) {
  1.4081 +                            break;
  1.4082 +                        }
  1.4083 +                    }
  1.4084 +                    ix = UTEXT_GETNATIVEINDEX(fInputText);
  1.4085 +                }
  1.4086 +
  1.4087 +                // If there were no matching characters, skip over the loop altogether.
  1.4088 +                //   The loop doesn't run at all, a * op always succeeds.
  1.4089 +                if (ix == fp->fInputIdx) {
  1.4090 +                    fp->fPatIdx++;   // skip the URX_LOOP_C op.
  1.4091 +                    break;
  1.4092 +                }
  1.4093 +
  1.4094 +                // Peek ahead in the compiled pattern, to the URX_LOOP_C that
  1.4095 +                //   must follow.  It's operand is the stack location
  1.4096 +                //   that holds the starting input index for the match of this [set]*
  1.4097 +                int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
  1.4098 +                U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
  1.4099 +                int32_t stackLoc = URX_VAL(loopcOp);
  1.4100 +                U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
  1.4101 +                fp->fExtra[stackLoc] = fp->fInputIdx;
  1.4102 +                fp->fInputIdx = ix;
  1.4103 +
  1.4104 +                // Save State to the URX_LOOP_C op that follows this one,
  1.4105 +                //   so that match failures in the following code will return to there.
  1.4106 +                //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
  1.4107 +                fp = StateSave(fp, fp->fPatIdx, status);
  1.4108 +                fp->fPatIdx++;
  1.4109 +            }
  1.4110 +            break;
  1.4111 +
  1.4112 +
  1.4113 +        case URX_LOOP_DOT_I:
  1.4114 +            // Loop Initialization for the optimized implementation of .*
  1.4115 +            //   This op scans through all remaining input.
  1.4116 +            //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
  1.4117 +            {
  1.4118 +                // Loop through input until the input is exhausted (we reach an end-of-line)
  1.4119 +                // In DOTALL mode, we can just go straight to the end of the input.
  1.4120 +                int64_t ix;
  1.4121 +                if ((opValue & 1) == 1) {
  1.4122 +                    // Dot-matches-All mode.  Jump straight to the end of the string.
  1.4123 +                    ix = fActiveLimit;
  1.4124 +                    fHitEnd = TRUE;
  1.4125 +                } else {
  1.4126 +                    // NOT DOT ALL mode.  Line endings do not match '.'
  1.4127 +                    // Scan forward until a line ending or end of input.
  1.4128 +                    ix = fp->fInputIdx;
  1.4129 +                    UTEXT_SETNATIVEINDEX(fInputText, ix);
  1.4130 +                    for (;;) {
  1.4131 +                        if (ix >= fActiveLimit) {
  1.4132 +                            fHitEnd = TRUE;
  1.4133 +                            break;
  1.4134 +                        }
  1.4135 +                        UChar32 c = UTEXT_NEXT32(fInputText);
  1.4136 +                        if ((c & 0x7f) <= 0x29) {          // Fast filter of non-new-line-s
  1.4137 +                            if ((c == 0x0a) ||             //  0x0a is newline in both modes.
  1.4138 +                               (((opValue & 2) == 0) &&    // IF not UNIX_LINES mode
  1.4139 +                                    (c<=0x0d && c>=0x0a)) || c==0x85 ||c==0x2028 || c==0x2029) {
  1.4140 +                                //  char is a line ending.  Exit the scanning loop.
  1.4141 +                                break;
  1.4142 +                            }
  1.4143 +                        }
  1.4144 +                        ix = UTEXT_GETNATIVEINDEX(fInputText);
  1.4145 +                    }
  1.4146 +                }
  1.4147 +
  1.4148 +                // If there were no matching characters, skip over the loop altogether.
  1.4149 +                //   The loop doesn't run at all, a * op always succeeds.
  1.4150 +                if (ix == fp->fInputIdx) {
  1.4151 +                    fp->fPatIdx++;   // skip the URX_LOOP_C op.
  1.4152 +                    break;
  1.4153 +                }
  1.4154 +
  1.4155 +                // Peek ahead in the compiled pattern, to the URX_LOOP_C that
  1.4156 +                //   must follow.  It's operand is the stack location
  1.4157 +                //   that holds the starting input index for the match of this .*
  1.4158 +                int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
  1.4159 +                U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
  1.4160 +                int32_t stackLoc = URX_VAL(loopcOp);
  1.4161 +                U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
  1.4162 +                fp->fExtra[stackLoc] = fp->fInputIdx;
  1.4163 +                fp->fInputIdx = ix;
  1.4164 +
  1.4165 +                // Save State to the URX_LOOP_C op that follows this one,
  1.4166 +                //   so that match failures in the following code will return to there.
  1.4167 +                //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
  1.4168 +                fp = StateSave(fp, fp->fPatIdx, status);
  1.4169 +                fp->fPatIdx++;
  1.4170 +            }
  1.4171 +            break;
  1.4172 +
  1.4173 +
  1.4174 +        case URX_LOOP_C:
  1.4175 +            {
  1.4176 +                U_ASSERT(opValue>=0 && opValue<fFrameSize);
  1.4177 +                backSearchIndex = fp->fExtra[opValue];
  1.4178 +                U_ASSERT(backSearchIndex <= fp->fInputIdx);
  1.4179 +                if (backSearchIndex == fp->fInputIdx) {
  1.4180 +                    // We've backed up the input idx to the point that the loop started.
  1.4181 +                    // The loop is done.  Leave here without saving state.   
  1.4182 +                    //  Subsequent failures won't come back here.
  1.4183 +                    break;
  1.4184 +                }
  1.4185 +                // Set up for the next iteration of the loop, with input index
  1.4186 +                //   backed up by one from the last time through,
  1.4187 +                //   and a state save to this instruction in case the following code fails again.
  1.4188 +                //   (We're going backwards because this loop emulates stack unwinding, not
  1.4189 +                //    the initial scan forward.)
  1.4190 +                U_ASSERT(fp->fInputIdx > 0);
  1.4191 +                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
  1.4192 +                UChar32 prevC = UTEXT_PREVIOUS32(fInputText);
  1.4193 +                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
  1.4194 +                
  1.4195 +                UChar32 twoPrevC = UTEXT_PREVIOUS32(fInputText);
  1.4196 +                if (prevC == 0x0a && 
  1.4197 +                    fp->fInputIdx > backSearchIndex &&
  1.4198 +                    twoPrevC == 0x0d) {
  1.4199 +                    int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
  1.4200 +                    if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
  1.4201 +                        // .*, stepping back over CRLF pair.
  1.4202 +                        fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
  1.4203 +                    }
  1.4204 +                }
  1.4205 +
  1.4206 +
  1.4207 +                fp = StateSave(fp, fp->fPatIdx-1, status);
  1.4208 +            }
  1.4209 +            break;
  1.4210 +
  1.4211 +
  1.4212 +
  1.4213 +        default:
  1.4214 +            // Trouble.  The compiled pattern contains an entry with an
  1.4215 +            //           unrecognized type tag.
  1.4216 +            U_ASSERT(FALSE);
  1.4217 +        }
  1.4218 +
  1.4219 +        if (U_FAILURE(status)) {
  1.4220 +            isMatch = FALSE;
  1.4221 +            break;
  1.4222 +        }
  1.4223 +    }
  1.4224 +    
  1.4225 +breakFromLoop:
  1.4226 +    fMatch = isMatch;
  1.4227 +    if (isMatch) {
  1.4228 +        fLastMatchEnd = fMatchEnd;
  1.4229 +        fMatchStart   = startIdx;
  1.4230 +        fMatchEnd     = fp->fInputIdx;
  1.4231 +        if (fTraceDebug) {
  1.4232 +            REGEX_RUN_DEBUG_PRINTF(("Match.  start=%ld   end=%ld\n\n", fMatchStart, fMatchEnd));
  1.4233 +        }
  1.4234 +    }
  1.4235 +    else
  1.4236 +    {
  1.4237 +        if (fTraceDebug) {
  1.4238 +            REGEX_RUN_DEBUG_PRINTF(("No match\n\n"));
  1.4239 +        }
  1.4240 +    }
  1.4241 +
  1.4242 +    fFrame = fp;                // The active stack frame when the engine stopped.
  1.4243 +                                //   Contains the capture group results that we need to
  1.4244 +                                //    access later.
  1.4245 +    return;
  1.4246 +}
  1.4247 +
  1.4248 +
  1.4249 +//--------------------------------------------------------------------------------
  1.4250 +//
  1.4251 +//   MatchChunkAt   This is the actual matching engine. Like MatchAt, but with the
  1.4252 +//                  assumption that the entire string is available in the UText's
  1.4253 +//                  chunk buffer. For now, that means we can use int32_t indexes,
  1.4254 +//                  except for anything that needs to be saved (like group starts
  1.4255 +//                  and ends).
  1.4256 +//
  1.4257 +//                  startIdx:    begin matching at this index.
  1.4258 +//                  toEnd:       if true, match must extend to end of the input region
  1.4259 +//
  1.4260 +//--------------------------------------------------------------------------------
  1.4261 +void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
  1.4262 +    UBool       isMatch  = FALSE;      // True if the we have a match.
  1.4263 +    
  1.4264 +    int32_t     backSearchIndex = INT32_MAX; // used after greedy single-character matches for searching backwards
  1.4265 +
  1.4266 +    int32_t     op;                    // Operation from the compiled pattern, split into
  1.4267 +    int32_t     opType;                //    the opcode
  1.4268 +    int32_t     opValue;               //    and the operand value.
  1.4269 +    
  1.4270 +#ifdef REGEX_RUN_DEBUG
  1.4271 +    if (fTraceDebug)
  1.4272 +    {
  1.4273 +        printf("MatchAt(startIdx=%d)\n", startIdx);
  1.4274 +        printf("Original Pattern: ");
  1.4275 +        UChar32 c = utext_next32From(fPattern->fPattern, 0);
  1.4276 +        while (c != U_SENTINEL) {
  1.4277 +            if (c<32 || c>256) {
  1.4278 +                c = '.';
  1.4279 +            }
  1.4280 +            REGEX_DUMP_DEBUG_PRINTF(("%c", c));
  1.4281 +            
  1.4282 +            c = UTEXT_NEXT32(fPattern->fPattern);
  1.4283 +        }
  1.4284 +        printf("\n");
  1.4285 +        printf("Input String: ");
  1.4286 +        c = utext_next32From(fInputText, 0);
  1.4287 +        while (c != U_SENTINEL) {
  1.4288 +            if (c<32 || c>256) {
  1.4289 +                c = '.';
  1.4290 +            }
  1.4291 +            printf("%c", c);
  1.4292 +            
  1.4293 +            c = UTEXT_NEXT32(fInputText);
  1.4294 +        }
  1.4295 +        printf("\n");
  1.4296 +        printf("\n");
  1.4297 +    }
  1.4298 +#endif
  1.4299 +    
  1.4300 +    if (U_FAILURE(status)) {
  1.4301 +        return;
  1.4302 +    }
  1.4303 +    
  1.4304 +    //  Cache frequently referenced items from the compiled pattern
  1.4305 +    //
  1.4306 +    int64_t             *pat           = fPattern->fCompiledPat->getBuffer();
  1.4307 +    
  1.4308 +    const UChar         *litText       = fPattern->fLiteralText.getBuffer();
  1.4309 +    UVector             *sets          = fPattern->fSets;
  1.4310 +    
  1.4311 +    const UChar         *inputBuf      = fInputText->chunkContents;
  1.4312 +    
  1.4313 +    fFrameSize = fPattern->fFrameSize;
  1.4314 +    REStackFrame        *fp            = resetStack();
  1.4315 +    
  1.4316 +    fp->fPatIdx   = 0;
  1.4317 +    fp->fInputIdx = startIdx;
  1.4318 +    
  1.4319 +    // Zero out the pattern's static data
  1.4320 +    int32_t i;
  1.4321 +    for (i = 0; i<fPattern->fDataSize; i++) {
  1.4322 +        fData[i] = 0;
  1.4323 +    }
  1.4324 +    
  1.4325 +    //
  1.4326 +    //  Main loop for interpreting the compiled pattern.
  1.4327 +    //  One iteration of the loop per pattern operation performed.
  1.4328 +    //
  1.4329 +    for (;;) {
  1.4330 +#if 0
  1.4331 +        if (_heapchk() != _HEAPOK) {
  1.4332 +            fprintf(stderr, "Heap Trouble\n");
  1.4333 +        }
  1.4334 +#endif
  1.4335 +        
  1.4336 +        op      = (int32_t)pat[fp->fPatIdx];
  1.4337 +        opType  = URX_TYPE(op);
  1.4338 +        opValue = URX_VAL(op);
  1.4339 +#ifdef REGEX_RUN_DEBUG
  1.4340 +        if (fTraceDebug) {
  1.4341 +            UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
  1.4342 +            printf("inputIdx=%ld   inputChar=%x   sp=%3ld   activeLimit=%ld  ", fp->fInputIdx,
  1.4343 +                   UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
  1.4344 +            fPattern->dumpOp(fp->fPatIdx);
  1.4345 +        }
  1.4346 +#endif
  1.4347 +        fp->fPatIdx++;
  1.4348 +        
  1.4349 +        switch (opType) {
  1.4350 +                
  1.4351 +                
  1.4352 +        case URX_NOP:
  1.4353 +            break;
  1.4354 +            
  1.4355 +            
  1.4356 +        case URX_BACKTRACK:
  1.4357 +            // Force a backtrack.  In some circumstances, the pattern compiler
  1.4358 +            //   will notice that the pattern can't possibly match anything, and will
  1.4359 +            //   emit one of these at that point.
  1.4360 +            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.4361 +            break;
  1.4362 +            
  1.4363 +            
  1.4364 +        case URX_ONECHAR:
  1.4365 +            if (fp->fInputIdx < fActiveLimit) {
  1.4366 +                UChar32 c;
  1.4367 +                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
  1.4368 +                if (c == opValue) {
  1.4369 +                    break;
  1.4370 +                }
  1.4371 +            } else {
  1.4372 +                fHitEnd = TRUE;
  1.4373 +            }
  1.4374 +            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.4375 +            break;
  1.4376 +            
  1.4377 +            
  1.4378 +        case URX_STRING:
  1.4379 +            {
  1.4380 +                // Test input against a literal string.
  1.4381 +                // Strings require two slots in the compiled pattern, one for the
  1.4382 +                //   offset to the string text, and one for the length.
  1.4383 +                int32_t   stringStartIdx = opValue;
  1.4384 +                int32_t   stringLen;
  1.4385 +                
  1.4386 +                op      = (int32_t)pat[fp->fPatIdx];     // Fetch the second operand
  1.4387 +                fp->fPatIdx++;
  1.4388 +                opType    = URX_TYPE(op);
  1.4389 +                stringLen = URX_VAL(op);
  1.4390 +                U_ASSERT(opType == URX_STRING_LEN);
  1.4391 +                U_ASSERT(stringLen >= 2);
  1.4392 +                
  1.4393 +                const UChar * pInp = inputBuf + fp->fInputIdx;
  1.4394 +                const UChar * pInpLimit = inputBuf + fActiveLimit;
  1.4395 +                const UChar * pPat = litText+stringStartIdx;
  1.4396 +                const UChar * pEnd = pInp + stringLen;
  1.4397 +                UBool success = TRUE;
  1.4398 +                while (pInp < pEnd) {
  1.4399 +                    if (pInp >= pInpLimit) {
  1.4400 +                        fHitEnd = TRUE;
  1.4401 +                        success = FALSE;
  1.4402 +                        break;
  1.4403 +                    }
  1.4404 +                    if (*pInp++ != *pPat++) {
  1.4405 +                        success = FALSE;
  1.4406 +                        break;
  1.4407 +                    }
  1.4408 +                }
  1.4409 +                
  1.4410 +                if (success) {
  1.4411 +                    fp->fInputIdx += stringLen;
  1.4412 +                } else {
  1.4413 +                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.4414 +                }
  1.4415 +            }
  1.4416 +            break;
  1.4417 +            
  1.4418 +            
  1.4419 +        case URX_STATE_SAVE:
  1.4420 +            fp = StateSave(fp, opValue, status);
  1.4421 +            break;
  1.4422 +            
  1.4423 +            
  1.4424 +        case URX_END:
  1.4425 +            // The match loop will exit via this path on a successful match,
  1.4426 +            //   when we reach the end of the pattern.
  1.4427 +            if (toEnd && fp->fInputIdx != fActiveLimit) {
  1.4428 +                // The pattern matched, but not to the end of input.  Try some more.
  1.4429 +                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.4430 +                break;
  1.4431 +            }
  1.4432 +            isMatch = TRUE;
  1.4433 +            goto  breakFromLoop;
  1.4434 +            
  1.4435 +            // Start and End Capture stack frame variables are laid out out like this:
  1.4436 +            //  fp->fExtra[opValue]  - The start of a completed capture group
  1.4437 +            //             opValue+1 - The end   of a completed capture group
  1.4438 +            //             opValue+2 - the start of a capture group whose end
  1.4439 +            //                          has not yet been reached (and might not ever be).
  1.4440 +        case URX_START_CAPTURE:
  1.4441 +            U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
  1.4442 +            fp->fExtra[opValue+2] = fp->fInputIdx;
  1.4443 +            break;
  1.4444 +            
  1.4445 +            
  1.4446 +        case URX_END_CAPTURE:
  1.4447 +            U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
  1.4448 +            U_ASSERT(fp->fExtra[opValue+2] >= 0);            // Start pos for this group must be set.
  1.4449 +            fp->fExtra[opValue]   = fp->fExtra[opValue+2];   // Tentative start becomes real.
  1.4450 +            fp->fExtra[opValue+1] = fp->fInputIdx;           // End position
  1.4451 +            U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
  1.4452 +            break;
  1.4453 +            
  1.4454 +            
  1.4455 +        case URX_DOLLAR:                   //  $, test for End of line
  1.4456 +            //     or for position before new line at end of input
  1.4457 +            if (fp->fInputIdx < fAnchorLimit-2) {
  1.4458 +                // We are no where near the end of input.  Fail.
  1.4459 +                //   This is the common case.  Keep it first.
  1.4460 +                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.4461 +                break;
  1.4462 +            }
  1.4463 +            if (fp->fInputIdx >= fAnchorLimit) {
  1.4464 +                // We really are at the end of input.  Success.
  1.4465 +                fHitEnd = TRUE;
  1.4466 +                fRequireEnd = TRUE;
  1.4467 +                break;
  1.4468 +            }
  1.4469 +            
  1.4470 +            // If we are positioned just before a new-line that is located at the
  1.4471 +            //   end of input, succeed.
  1.4472 +            if (fp->fInputIdx == fAnchorLimit-1) {
  1.4473 +                UChar32 c;
  1.4474 +                U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c);
  1.4475 +                
  1.4476 +                if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) {
  1.4477 +                    if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
  1.4478 +                        // At new-line at end of input. Success
  1.4479 +                        fHitEnd = TRUE;
  1.4480 +                        fRequireEnd = TRUE;
  1.4481 +                        break;
  1.4482 +                    }
  1.4483 +                }
  1.4484 +            } else if (fp->fInputIdx == fAnchorLimit-2 &&
  1.4485 +                inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a) {
  1.4486 +                    fHitEnd = TRUE;
  1.4487 +                    fRequireEnd = TRUE;
  1.4488 +                    break;                         // At CR/LF at end of input.  Success
  1.4489 +            }
  1.4490 +            
  1.4491 +            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.4492 +            
  1.4493 +            break;
  1.4494 +            
  1.4495 +            
  1.4496 +        case URX_DOLLAR_D:                   //  $, test for End of Line, in UNIX_LINES mode.
  1.4497 +            if (fp->fInputIdx >= fAnchorLimit-1) {
  1.4498 +                // Either at the last character of input, or off the end.
  1.4499 +                if (fp->fInputIdx == fAnchorLimit-1) {
  1.4500 +                    // At last char of input.  Success if it's a new line.
  1.4501 +                    if (inputBuf[fp->fInputIdx] == 0x0a) {
  1.4502 +                        fHitEnd = TRUE;
  1.4503 +                        fRequireEnd = TRUE;
  1.4504 +                        break;
  1.4505 +                    }
  1.4506 +                } else {
  1.4507 +                    // Off the end of input.  Success.
  1.4508 +                    fHitEnd = TRUE;
  1.4509 +                    fRequireEnd = TRUE;
  1.4510 +                    break;
  1.4511 +                }
  1.4512 +            }
  1.4513 +            
  1.4514 +            // Not at end of input.  Back-track out.
  1.4515 +            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.4516 +            break;
  1.4517 +            
  1.4518 +            
  1.4519 +        case URX_DOLLAR_M:                //  $, test for End of line in multi-line mode
  1.4520 +            {
  1.4521 +                if (fp->fInputIdx >= fAnchorLimit) {
  1.4522 +                    // We really are at the end of input.  Success.
  1.4523 +                    fHitEnd = TRUE;
  1.4524 +                    fRequireEnd = TRUE;
  1.4525 +                    break;
  1.4526 +                }
  1.4527 +                // If we are positioned just before a new-line, succeed.
  1.4528 +                // It makes no difference where the new-line is within the input.
  1.4529 +                UChar32 c = inputBuf[fp->fInputIdx];
  1.4530 +                if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) {
  1.4531 +                    // At a line end, except for the odd chance of  being in the middle of a CR/LF sequence
  1.4532 +                    //  In multi-line mode, hitting a new-line just before the end of input does not
  1.4533 +                    //   set the hitEnd or requireEnd flags
  1.4534 +                    if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
  1.4535 +                        break;
  1.4536 +                    }
  1.4537 +                }
  1.4538 +                // not at a new line.  Fail.
  1.4539 +                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.4540 +            }
  1.4541 +            break;
  1.4542 +            
  1.4543 +            
  1.4544 +        case URX_DOLLAR_MD:                //  $, test for End of line in multi-line and UNIX_LINES mode
  1.4545 +            {
  1.4546 +                if (fp->fInputIdx >= fAnchorLimit) {
  1.4547 +                    // We really are at the end of input.  Success.
  1.4548 +                    fHitEnd = TRUE;
  1.4549 +                    fRequireEnd = TRUE;  // Java set requireEnd in this case, even though
  1.4550 +                    break;               //   adding a new-line would not lose the match.
  1.4551 +                }
  1.4552 +                // If we are not positioned just before a new-line, the test fails; backtrack out.
  1.4553 +                // It makes no difference where the new-line is within the input.
  1.4554 +                if (inputBuf[fp->fInputIdx] != 0x0a) {
  1.4555 +                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.4556 +                }
  1.4557 +            }
  1.4558 +            break;
  1.4559 +            
  1.4560 +            
  1.4561 +        case URX_CARET:                    //  ^, test for start of line
  1.4562 +            if (fp->fInputIdx != fAnchorStart) {
  1.4563 +                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.4564 +            }
  1.4565 +            break;
  1.4566 +            
  1.4567 +            
  1.4568 +        case URX_CARET_M:                   //  ^, test for start of line in mulit-line mode
  1.4569 +            {
  1.4570 +                if (fp->fInputIdx == fAnchorStart) {
  1.4571 +                    // We are at the start input.  Success.
  1.4572 +                    break;
  1.4573 +                }
  1.4574 +                // Check whether character just before the current pos is a new-line
  1.4575 +                //   unless we are at the end of input
  1.4576 +                UChar  c = inputBuf[fp->fInputIdx - 1]; 
  1.4577 +                if ((fp->fInputIdx < fAnchorLimit) && 
  1.4578 +                    ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
  1.4579 +                    //  It's a new-line.  ^ is true.  Success.
  1.4580 +                    //  TODO:  what should be done with positions between a CR and LF?
  1.4581 +                    break;
  1.4582 +                }
  1.4583 +                // Not at the start of a line.  Fail.
  1.4584 +                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.4585 +            }
  1.4586 +            break;
  1.4587 +            
  1.4588 +            
  1.4589 +        case URX_CARET_M_UNIX:       //  ^, test for start of line in mulit-line + Unix-line mode
  1.4590 +            {
  1.4591 +                U_ASSERT(fp->fInputIdx >= fAnchorStart);
  1.4592 +                if (fp->fInputIdx <= fAnchorStart) {
  1.4593 +                    // We are at the start input.  Success.
  1.4594 +                    break;
  1.4595 +                }
  1.4596 +                // Check whether character just before the current pos is a new-line
  1.4597 +                U_ASSERT(fp->fInputIdx <= fAnchorLimit);
  1.4598 +                UChar  c = inputBuf[fp->fInputIdx - 1]; 
  1.4599 +                if (c != 0x0a) {
  1.4600 +                    // Not at the start of a line.  Back-track out.
  1.4601 +                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.4602 +                }
  1.4603 +            }
  1.4604 +            break;
  1.4605 +            
  1.4606 +        case URX_BACKSLASH_B:          // Test for word boundaries
  1.4607 +            {
  1.4608 +                UBool success = isChunkWordBoundary((int32_t)fp->fInputIdx);
  1.4609 +                success ^= (UBool)(opValue != 0);     // flip sense for \B
  1.4610 +                if (!success) {
  1.4611 +                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.4612 +                }
  1.4613 +            }
  1.4614 +            break;
  1.4615 +            
  1.4616 +            
  1.4617 +        case URX_BACKSLASH_BU:          // Test for word boundaries, Unicode-style
  1.4618 +            {
  1.4619 +                UBool success = isUWordBoundary(fp->fInputIdx);
  1.4620 +                success ^= (UBool)(opValue != 0);     // flip sense for \B
  1.4621 +                if (!success) {
  1.4622 +                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.4623 +                }
  1.4624 +            }
  1.4625 +            break;
  1.4626 +            
  1.4627 +            
  1.4628 +        case URX_BACKSLASH_D:            // Test for decimal digit
  1.4629 +            {
  1.4630 +                if (fp->fInputIdx >= fActiveLimit) {
  1.4631 +                    fHitEnd = TRUE;
  1.4632 +                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.4633 +                    break;
  1.4634 +                }
  1.4635 +                
  1.4636 +                UChar32 c;
  1.4637 +                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
  1.4638 +                int8_t ctype = u_charType(c);     // TODO:  make a unicode set for this.  Will be faster.
  1.4639 +                UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
  1.4640 +                success ^= (UBool)(opValue != 0);        // flip sense for \D
  1.4641 +                if (!success) {
  1.4642 +                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.4643 +                }
  1.4644 +            }
  1.4645 +            break;
  1.4646 +            
  1.4647 +            
  1.4648 +        case URX_BACKSLASH_G:          // Test for position at end of previous match
  1.4649 +            if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
  1.4650 +                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.4651 +            }
  1.4652 +            break;
  1.4653 +            
  1.4654 +            
  1.4655 +        case URX_BACKSLASH_X:     
  1.4656 +        //  Match a Grapheme, as defined by Unicode TR 29.
  1.4657 +        //  Differs slightly from Perl, which consumes combining marks independently
  1.4658 +        //    of context.
  1.4659 +        {
  1.4660 +
  1.4661 +            // Fail if at end of input
  1.4662 +            if (fp->fInputIdx >= fActiveLimit) {
  1.4663 +                fHitEnd = TRUE;
  1.4664 +                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.4665 +                break;
  1.4666 +            }
  1.4667 +
  1.4668 +            // Examine (and consume) the current char.
  1.4669 +            //   Dispatch into a little state machine, based on the char.
  1.4670 +            UChar32  c;
  1.4671 +            U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
  1.4672 +            UnicodeSet **sets = fPattern->fStaticSets;
  1.4673 +            if (sets[URX_GC_NORMAL]->contains(c))  goto GC_Extend;
  1.4674 +            if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
  1.4675 +            if (sets[URX_GC_L]->contains(c))       goto GC_L;
  1.4676 +            if (sets[URX_GC_LV]->contains(c))      goto GC_V;
  1.4677 +            if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
  1.4678 +            if (sets[URX_GC_V]->contains(c))       goto GC_V;
  1.4679 +            if (sets[URX_GC_T]->contains(c))       goto GC_T;
  1.4680 +            goto GC_Extend;
  1.4681 +
  1.4682 +
  1.4683 +
  1.4684 +GC_L:
  1.4685 +            if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
  1.4686 +            U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
  1.4687 +            if (sets[URX_GC_L]->contains(c))       goto GC_L;
  1.4688 +            if (sets[URX_GC_LV]->contains(c))      goto GC_V;
  1.4689 +            if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
  1.4690 +            if (sets[URX_GC_V]->contains(c))       goto GC_V;
  1.4691 +            U16_PREV(inputBuf, 0, fp->fInputIdx, c);
  1.4692 +            goto GC_Extend;
  1.4693 +
  1.4694 +GC_V:
  1.4695 +            if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
  1.4696 +            U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
  1.4697 +            if (sets[URX_GC_V]->contains(c))       goto GC_V;
  1.4698 +            if (sets[URX_GC_T]->contains(c))       goto GC_T;
  1.4699 +            U16_PREV(inputBuf, 0, fp->fInputIdx, c);
  1.4700 +            goto GC_Extend;
  1.4701 +
  1.4702 +GC_T:
  1.4703 +            if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
  1.4704 +            U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
  1.4705 +            if (sets[URX_GC_T]->contains(c))       goto GC_T;
  1.4706 +            U16_PREV(inputBuf, 0, fp->fInputIdx, c);
  1.4707 +            goto GC_Extend;
  1.4708 +
  1.4709 +GC_Extend:
  1.4710 +            // Combining characters are consumed here
  1.4711 +            for (;;) {
  1.4712 +                if (fp->fInputIdx >= fActiveLimit) {
  1.4713 +                    break;
  1.4714 +                }
  1.4715 +                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
  1.4716 +                if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
  1.4717 +                    U16_BACK_1(inputBuf, 0, fp->fInputIdx);
  1.4718 +                    break;
  1.4719 +                }
  1.4720 +            }
  1.4721 +            goto GC_Done;
  1.4722 +
  1.4723 +GC_Control:
  1.4724 +            // Most control chars stand alone (don't combine with combining chars),  
  1.4725 +            //   except that a CR/LF sequence is a single grapheme cluster.
  1.4726 +            if (c == 0x0d && fp->fInputIdx < fActiveLimit && inputBuf[fp->fInputIdx] == 0x0a) {
  1.4727 +                fp->fInputIdx++;
  1.4728 +            }
  1.4729 +
  1.4730 +GC_Done:
  1.4731 +            if (fp->fInputIdx >= fActiveLimit) {
  1.4732 +                fHitEnd = TRUE;
  1.4733 +            }
  1.4734 +            break;
  1.4735 +        }
  1.4736 +            
  1.4737 +            
  1.4738 +            
  1.4739 +            
  1.4740 +        case URX_BACKSLASH_Z:          // Test for end of Input
  1.4741 +            if (fp->fInputIdx < fAnchorLimit) {
  1.4742 +                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.4743 +            } else {
  1.4744 +                fHitEnd = TRUE;
  1.4745 +                fRequireEnd = TRUE;
  1.4746 +            }
  1.4747 +            break;
  1.4748 +            
  1.4749 +            
  1.4750 +            
  1.4751 +        case URX_STATIC_SETREF:
  1.4752 +            {
  1.4753 +                // Test input character against one of the predefined sets
  1.4754 +                //    (Word Characters, for example)
  1.4755 +                // The high bit of the op value is a flag for the match polarity.
  1.4756 +                //    0:   success if input char is in set.
  1.4757 +                //    1:   success if input char is not in set.
  1.4758 +                if (fp->fInputIdx >= fActiveLimit) {
  1.4759 +                    fHitEnd = TRUE;
  1.4760 +                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.4761 +                    break;
  1.4762 +                }
  1.4763 +                
  1.4764 +                UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);  
  1.4765 +                opValue &= ~URX_NEG_SET;
  1.4766 +                U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
  1.4767 +                
  1.4768 +                UChar32 c;
  1.4769 +                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
  1.4770 +                if (c < 256) {
  1.4771 +                    Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
  1.4772 +                    if (s8->contains(c)) {
  1.4773 +                        success = !success;
  1.4774 +                    }
  1.4775 +                } else {
  1.4776 +                    const UnicodeSet *s = fPattern->fStaticSets[opValue];
  1.4777 +                    if (s->contains(c)) {
  1.4778 +                        success = !success;
  1.4779 +                    }
  1.4780 +                }
  1.4781 +                if (!success) {
  1.4782 +                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.4783 +                }
  1.4784 +            }
  1.4785 +            break;
  1.4786 +            
  1.4787 +            
  1.4788 +        case URX_STAT_SETREF_N:
  1.4789 +            {
  1.4790 +                // Test input character for NOT being a member of  one of 
  1.4791 +                //    the predefined sets (Word Characters, for example)
  1.4792 +                if (fp->fInputIdx >= fActiveLimit) {
  1.4793 +                    fHitEnd = TRUE;
  1.4794 +                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.4795 +                    break;
  1.4796 +                }
  1.4797 +                
  1.4798 +                U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
  1.4799 +                
  1.4800 +                UChar32  c;
  1.4801 +                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
  1.4802 +                if (c < 256) {
  1.4803 +                    Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
  1.4804 +                    if (s8->contains(c) == FALSE) {
  1.4805 +                        break;
  1.4806 +                    }
  1.4807 +                } else {
  1.4808 +                    const UnicodeSet *s = fPattern->fStaticSets[opValue];
  1.4809 +                    if (s->contains(c) == FALSE) {
  1.4810 +                        break;
  1.4811 +                    }
  1.4812 +                }
  1.4813 +                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.4814 +            }
  1.4815 +            break;
  1.4816 +            
  1.4817 +            
  1.4818 +        case URX_SETREF:
  1.4819 +            {
  1.4820 +                if (fp->fInputIdx >= fActiveLimit) {
  1.4821 +                    fHitEnd = TRUE;
  1.4822 +                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.4823 +                    break;
  1.4824 +                }
  1.4825 +                
  1.4826 +                U_ASSERT(opValue > 0 && opValue < sets->size());
  1.4827 +
  1.4828 +                // There is input left.  Pick up one char and test it for set membership.
  1.4829 +                UChar32  c;
  1.4830 +                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
  1.4831 +                if (c<256) {
  1.4832 +                    Regex8BitSet *s8 = &fPattern->fSets8[opValue];
  1.4833 +                    if (s8->contains(c)) {
  1.4834 +                        // The character is in the set.  A Match.
  1.4835 +                        break;
  1.4836 +                    }
  1.4837 +                } else {
  1.4838 +                    UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
  1.4839 +                    if (s->contains(c)) {
  1.4840 +                        // The character is in the set.  A Match.
  1.4841 +                        break;
  1.4842 +                    }
  1.4843 +                }
  1.4844 +                
  1.4845 +                // the character wasn't in the set.
  1.4846 +                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.4847 +            }
  1.4848 +            break;
  1.4849 +            
  1.4850 +            
  1.4851 +        case URX_DOTANY:
  1.4852 +            {
  1.4853 +                // . matches anything, but stops at end-of-line.
  1.4854 +                if (fp->fInputIdx >= fActiveLimit) {
  1.4855 +                    // At end of input.  Match failed.  Backtrack out.
  1.4856 +                    fHitEnd = TRUE;
  1.4857 +                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.4858 +                    break;
  1.4859 +                }
  1.4860 +                
  1.4861 +                // There is input left.  Advance over one char, unless we've hit end-of-line
  1.4862 +                UChar32  c;
  1.4863 +                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
  1.4864 +                if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
  1.4865 +                    ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
  1.4866 +                    // End of line in normal mode.   . does not match.
  1.4867 +                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.4868 +                    break;
  1.4869 +                }
  1.4870 +            }
  1.4871 +            break;
  1.4872 +            
  1.4873 +            
  1.4874 +        case URX_DOTANY_ALL:
  1.4875 +            {
  1.4876 +                // . in dot-matches-all (including new lines) mode
  1.4877 +                if (fp->fInputIdx >= fActiveLimit) {
  1.4878 +                    // At end of input.  Match failed.  Backtrack out.
  1.4879 +                    fHitEnd = TRUE;
  1.4880 +                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.4881 +                    break;
  1.4882 +                }
  1.4883 +                
  1.4884 +                // There is input left.  Advance over one char, except if we are
  1.4885 +                //   at a cr/lf, advance over both of them.
  1.4886 +                UChar32 c; 
  1.4887 +                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
  1.4888 +                if (c==0x0d && fp->fInputIdx < fActiveLimit) {
  1.4889 +                    // In the case of a CR/LF, we need to advance over both.
  1.4890 +                    if (inputBuf[fp->fInputIdx] == 0x0a) {
  1.4891 +                        U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit);
  1.4892 +                    }
  1.4893 +                }
  1.4894 +            }
  1.4895 +            break;
  1.4896 +            
  1.4897 +            
  1.4898 +        case URX_DOTANY_UNIX:
  1.4899 +            {
  1.4900 +                // '.' operator, matches all, but stops at end-of-line.
  1.4901 +                //   UNIX_LINES mode, so 0x0a is the only recognized line ending.
  1.4902 +                if (fp->fInputIdx >= fActiveLimit) {
  1.4903 +                    // At end of input.  Match failed.  Backtrack out.
  1.4904 +                    fHitEnd = TRUE;
  1.4905 +                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.4906 +                    break;
  1.4907 +                }
  1.4908 +                
  1.4909 +                // There is input left.  Advance over one char, unless we've hit end-of-line
  1.4910 +                UChar32 c; 
  1.4911 +                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
  1.4912 +                if (c == 0x0a) {
  1.4913 +                    // End of line in UNIX_LINES mode.   '.' does not match the \n
  1.4914 +                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.4915 +                }
  1.4916 +            }
  1.4917 +            break;
  1.4918 +            
  1.4919 +            
  1.4920 +        case URX_JMP:
  1.4921 +            fp->fPatIdx = opValue;
  1.4922 +            break;
  1.4923 +            
  1.4924 +        case URX_FAIL:
  1.4925 +            isMatch = FALSE;
  1.4926 +            goto breakFromLoop;
  1.4927 +            
  1.4928 +        case URX_JMP_SAV:
  1.4929 +            U_ASSERT(opValue < fPattern->fCompiledPat->size());
  1.4930 +            fp = StateSave(fp, fp->fPatIdx, status);       // State save to loc following current
  1.4931 +            fp->fPatIdx = opValue;                         // Then JMP.
  1.4932 +            break;
  1.4933 +            
  1.4934 +        case URX_JMP_SAV_X:
  1.4935 +            // This opcode is used with (x)+, when x can match a zero length string.
  1.4936 +            // Same as JMP_SAV, except conditional on the match having made forward progress.
  1.4937 +            // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
  1.4938 +            //   data address of the input position at the start of the loop.
  1.4939 +            {
  1.4940 +                U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
  1.4941 +                int32_t  stoOp = (int32_t)pat[opValue-1];
  1.4942 +                U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
  1.4943 +                int32_t  frameLoc = URX_VAL(stoOp);
  1.4944 +                U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
  1.4945 +                int32_t prevInputIdx = (int32_t)fp->fExtra[frameLoc];
  1.4946 +                U_ASSERT(prevInputIdx <= fp->fInputIdx);
  1.4947 +                if (prevInputIdx < fp->fInputIdx) {
  1.4948 +                    // The match did make progress.  Repeat the loop.
  1.4949 +                    fp = StateSave(fp, fp->fPatIdx, status);  // State save to loc following current
  1.4950 +                    fp->fPatIdx = opValue;
  1.4951 +                    fp->fExtra[frameLoc] = fp->fInputIdx;
  1.4952 +                } 
  1.4953 +                // If the input position did not advance, we do nothing here,
  1.4954 +                //   execution will fall out of the loop.
  1.4955 +            }
  1.4956 +            break;
  1.4957 +            
  1.4958 +        case URX_CTR_INIT:
  1.4959 +            {
  1.4960 +                U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
  1.4961 +                fp->fExtra[opValue] = 0;                 //  Set the loop counter variable to zero
  1.4962 +                
  1.4963 +                // Pick up the three extra operands that CTR_INIT has, and
  1.4964 +                //    skip the pattern location counter past them.
  1.4965 +                int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
  1.4966 +                fp->fPatIdx += 3;
  1.4967 +                int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
  1.4968 +                int32_t minCount = (int32_t)pat[instrOperandLoc+1];
  1.4969 +                int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
  1.4970 +                U_ASSERT(minCount>=0);
  1.4971 +                U_ASSERT(maxCount>=minCount || maxCount==-1);
  1.4972 +                U_ASSERT(loopLoc>=fp->fPatIdx);
  1.4973 +                
  1.4974 +                if (minCount == 0) {
  1.4975 +                    fp = StateSave(fp, loopLoc+1, status);
  1.4976 +                }
  1.4977 +                if (maxCount == -1) {
  1.4978 +                    fp->fExtra[opValue+1] = fp->fInputIdx;   //  For loop breaking.
  1.4979 +                } else if (maxCount == 0) {
  1.4980 +                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.4981 +                }
  1.4982 +            }
  1.4983 +            break;
  1.4984 +            
  1.4985 +        case URX_CTR_LOOP:
  1.4986 +            {
  1.4987 +                U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
  1.4988 +                int32_t initOp = (int32_t)pat[opValue];
  1.4989 +                U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
  1.4990 +                int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
  1.4991 +                int32_t minCount  = (int32_t)pat[opValue+2];
  1.4992 +                int32_t maxCount  = (int32_t)pat[opValue+3];
  1.4993 +                (*pCounter)++;
  1.4994 +                if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
  1.4995 +                    U_ASSERT(*pCounter == maxCount);
  1.4996 +                    break;
  1.4997 +                }
  1.4998 +                if (*pCounter >= minCount) {
  1.4999 +                    if (maxCount == -1) {
  1.5000 +                        // Loop has no hard upper bound.
  1.5001 +                        // Check that it is progressing through the input, break if it is not.
  1.5002 +                        int64_t *pLastInputIdx =  &fp->fExtra[URX_VAL(initOp) + 1];
  1.5003 +                        if (fp->fInputIdx == *pLastInputIdx) {
  1.5004 +                            break;
  1.5005 +                        } else {
  1.5006 +                            *pLastInputIdx = fp->fInputIdx;
  1.5007 +                        }
  1.5008 +                    }
  1.5009 +                    fp = StateSave(fp, fp->fPatIdx, status);
  1.5010 +                }
  1.5011 +                fp->fPatIdx = opValue + 4;    // Loop back.
  1.5012 +            }
  1.5013 +            break;
  1.5014 +            
  1.5015 +        case URX_CTR_INIT_NG:
  1.5016 +            {
  1.5017 +                // Initialize a non-greedy loop
  1.5018 +                U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
  1.5019 +                fp->fExtra[opValue] = 0;                 //  Set the loop counter variable to zero
  1.5020 +                
  1.5021 +                // Pick up the three extra operands that CTR_INIT_NG has, and
  1.5022 +                //    skip the pattern location counter past them.
  1.5023 +                int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
  1.5024 +                fp->fPatIdx += 3;
  1.5025 +                int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
  1.5026 +                int32_t minCount = (int32_t)pat[instrOperandLoc+1];
  1.5027 +                int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
  1.5028 +                U_ASSERT(minCount>=0);
  1.5029 +                U_ASSERT(maxCount>=minCount || maxCount==-1);
  1.5030 +                U_ASSERT(loopLoc>fp->fPatIdx);
  1.5031 +                if (maxCount == -1) {
  1.5032 +                    fp->fExtra[opValue+1] = fp->fInputIdx;   //  Save initial input index for loop breaking.
  1.5033 +                }
  1.5034 +                
  1.5035 +                if (minCount == 0) {
  1.5036 +                    if (maxCount != 0) {
  1.5037 +                        fp = StateSave(fp, fp->fPatIdx, status);
  1.5038 +                    }
  1.5039 +                    fp->fPatIdx = loopLoc+1;   // Continue with stuff after repeated block
  1.5040 +                } 
  1.5041 +            }
  1.5042 +            break;
  1.5043 +            
  1.5044 +        case URX_CTR_LOOP_NG:
  1.5045 +            {
  1.5046 +                // Non-greedy {min, max} loops
  1.5047 +                U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
  1.5048 +                int32_t initOp = (int32_t)pat[opValue];
  1.5049 +                U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
  1.5050 +                int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
  1.5051 +                int32_t minCount  = (int32_t)pat[opValue+2];
  1.5052 +                int32_t maxCount  = (int32_t)pat[opValue+3];
  1.5053 +
  1.5054 +                (*pCounter)++;
  1.5055 +                if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
  1.5056 +                    // The loop has matched the maximum permitted number of times.
  1.5057 +                    //   Break out of here with no action.  Matching will
  1.5058 +                    //   continue with the following pattern.
  1.5059 +                    U_ASSERT(*pCounter == maxCount);
  1.5060 +                    break;
  1.5061 +                }
  1.5062 +                
  1.5063 +                if (*pCounter < minCount) {
  1.5064 +                    // We haven't met the minimum number of matches yet.
  1.5065 +                    //   Loop back for another one.
  1.5066 +                    fp->fPatIdx = opValue + 4;    // Loop back.
  1.5067 +                } else {
  1.5068 +                    // We do have the minimum number of matches.
  1.5069 +
  1.5070 +                    // If there is no upper bound on the loop iterations, check that the input index
  1.5071 +                    // is progressing, and stop the loop if it is not.
  1.5072 +                    if (maxCount == -1) {
  1.5073 +                        int64_t *pLastInputIdx =  &fp->fExtra[URX_VAL(initOp) + 1];
  1.5074 +                        if (fp->fInputIdx == *pLastInputIdx) {
  1.5075 +                            break;
  1.5076 +                        }
  1.5077 +                        *pLastInputIdx = fp->fInputIdx;
  1.5078 +                    }
  1.5079 +
  1.5080 +                    // Loop Continuation: we will fall into the pattern following the loop
  1.5081 +                    //   (non-greedy, don't execute loop body first), but first do
  1.5082 +                    //   a state save to the top of the loop, so that a match failure
  1.5083 +                    //   in the following pattern will try another iteration of the loop.
  1.5084 +                    fp = StateSave(fp, opValue + 4, status);
  1.5085 +                }
  1.5086 +            }
  1.5087 +            break;
  1.5088 +            
  1.5089 +        case URX_STO_SP:
  1.5090 +            U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
  1.5091 +            fData[opValue] = fStack->size();
  1.5092 +            break;
  1.5093 +            
  1.5094 +        case URX_LD_SP:
  1.5095 +            {
  1.5096 +                U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
  1.5097 +                int32_t newStackSize = (int32_t)fData[opValue];
  1.5098 +                U_ASSERT(newStackSize <= fStack->size());
  1.5099 +                int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
  1.5100 +                if (newFP == (int64_t *)fp) {
  1.5101 +                    break;
  1.5102 +                }
  1.5103 +                int32_t i;
  1.5104 +                for (i=0; i<fFrameSize; i++) {
  1.5105 +                    newFP[i] = ((int64_t *)fp)[i];
  1.5106 +                }
  1.5107 +                fp = (REStackFrame *)newFP;
  1.5108 +                fStack->setSize(newStackSize);
  1.5109 +            }
  1.5110 +            break;
  1.5111 +            
  1.5112 +        case URX_BACKREF:
  1.5113 +            {
  1.5114 +                U_ASSERT(opValue < fFrameSize);
  1.5115 +                int64_t groupStartIdx = fp->fExtra[opValue];
  1.5116 +                int64_t groupEndIdx   = fp->fExtra[opValue+1];
  1.5117 +                U_ASSERT(groupStartIdx <= groupEndIdx);
  1.5118 +                int64_t inputIndex = fp->fInputIdx;
  1.5119 +                if (groupStartIdx < 0) {
  1.5120 +                    // This capture group has not participated in the match thus far,
  1.5121 +                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
  1.5122 +                    break;
  1.5123 +                }
  1.5124 +                UBool success = TRUE;
  1.5125 +                for (int64_t groupIndex = groupStartIdx; groupIndex < groupEndIdx; ++groupIndex,++inputIndex) {
  1.5126 +                    if (inputIndex >= fActiveLimit) {
  1.5127 +                        success = FALSE;
  1.5128 +                        fHitEnd = TRUE;
  1.5129 +                        break;
  1.5130 +                    }
  1.5131 +                    if (inputBuf[groupIndex] != inputBuf[inputIndex]) {
  1.5132 +                        success = FALSE;
  1.5133 +                        break;
  1.5134 +                    }
  1.5135 +                }
  1.5136 +                if (success) {
  1.5137 +                    fp->fInputIdx = inputIndex;
  1.5138 +                } else {
  1.5139 +                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.5140 +                }
  1.5141 +            }
  1.5142 +            break;
  1.5143 +            
  1.5144 +        case URX_BACKREF_I:
  1.5145 +            {
  1.5146 +                U_ASSERT(opValue < fFrameSize);
  1.5147 +                int64_t groupStartIdx = fp->fExtra[opValue];
  1.5148 +                int64_t groupEndIdx   = fp->fExtra[opValue+1];
  1.5149 +                U_ASSERT(groupStartIdx <= groupEndIdx);
  1.5150 +                if (groupStartIdx < 0) {
  1.5151 +                    // This capture group has not participated in the match thus far,
  1.5152 +                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
  1.5153 +                    break;
  1.5154 +                }
  1.5155 +                CaseFoldingUCharIterator captureGroupItr(inputBuf, groupStartIdx, groupEndIdx);
  1.5156 +                CaseFoldingUCharIterator inputItr(inputBuf, fp->fInputIdx, fActiveLimit);
  1.5157 +
  1.5158 +                //   Note: if the capture group match was of an empty string the backref
  1.5159 +                //         match succeeds.  Verified by testing:  Perl matches succeed 
  1.5160 +                //         in this case, so we do too.
  1.5161 +                
  1.5162 +                UBool success = TRUE;
  1.5163 +                for (;;) {
  1.5164 +                    UChar32 captureGroupChar = captureGroupItr.next();
  1.5165 +                    if (captureGroupChar == U_SENTINEL) {
  1.5166 +                        success = TRUE;
  1.5167 +                        break;
  1.5168 +                    }
  1.5169 +                    UChar32 inputChar = inputItr.next();
  1.5170 +                    if (inputChar == U_SENTINEL) {
  1.5171 +                        success = FALSE;
  1.5172 +                        fHitEnd = TRUE;
  1.5173 +                        break;
  1.5174 +                    }
  1.5175 +                    if (inputChar != captureGroupChar) {
  1.5176 +                        success = FALSE;
  1.5177 +                        break;
  1.5178 +                    }
  1.5179 +                }
  1.5180 +
  1.5181 +                if (success && inputItr.inExpansion()) {
  1.5182 +                    // We obtained a match by consuming part of a string obtained from 
  1.5183 +                    // case-folding a single code point of the input text.  
  1.5184 +                    // This does not count as an overall match.
  1.5185 +                    success = FALSE;
  1.5186 +                }
  1.5187 +
  1.5188 +                if (success) {
  1.5189 +                    fp->fInputIdx = inputItr.getIndex();
  1.5190 +                } else {
  1.5191 +                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.5192 +                }
  1.5193 +            }
  1.5194 +            break;
  1.5195 +
  1.5196 +        case URX_STO_INP_LOC:
  1.5197 +            {
  1.5198 +                U_ASSERT(opValue >= 0 && opValue < fFrameSize);
  1.5199 +                fp->fExtra[opValue] = fp->fInputIdx;
  1.5200 +            }
  1.5201 +            break;
  1.5202 +            
  1.5203 +        case URX_JMPX:
  1.5204 +            {
  1.5205 +                int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
  1.5206 +                fp->fPatIdx += 1;
  1.5207 +                int32_t dataLoc  = URX_VAL(pat[instrOperandLoc]);
  1.5208 +                U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
  1.5209 +                int32_t savedInputIdx = (int32_t)fp->fExtra[dataLoc];
  1.5210 +                U_ASSERT(savedInputIdx <= fp->fInputIdx);
  1.5211 +                if (savedInputIdx < fp->fInputIdx) {
  1.5212 +                    fp->fPatIdx = opValue;                               // JMP
  1.5213 +                } else {
  1.5214 +                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no progress in loop.
  1.5215 +                }
  1.5216 +            }
  1.5217 +            break;
  1.5218 +            
  1.5219 +        case URX_LA_START:
  1.5220 +            {
  1.5221 +                // Entering a lookahead block.
  1.5222 +                // Save Stack Ptr, Input Pos.
  1.5223 +                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
  1.5224 +                fData[opValue]   = fStack->size();
  1.5225 +                fData[opValue+1] = fp->fInputIdx;
  1.5226 +                fActiveStart     = fLookStart;          // Set the match region change for
  1.5227 +                fActiveLimit     = fLookLimit;          //   transparent bounds.
  1.5228 +            }
  1.5229 +            break;
  1.5230 +            
  1.5231 +        case URX_LA_END:
  1.5232 +            {
  1.5233 +                // Leaving a look-ahead block.
  1.5234 +                //  restore Stack Ptr, Input Pos to positions they had on entry to block.
  1.5235 +                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
  1.5236 +                int32_t stackSize = fStack->size();
  1.5237 +                int32_t newStackSize = (int32_t)fData[opValue];
  1.5238 +                U_ASSERT(stackSize >= newStackSize);
  1.5239 +                if (stackSize > newStackSize) {
  1.5240 +                    // Copy the current top frame back to the new (cut back) top frame.
  1.5241 +                    //   This makes the capture groups from within the look-ahead
  1.5242 +                    //   expression available.
  1.5243 +                    int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
  1.5244 +                    int32_t i;
  1.5245 +                    for (i=0; i<fFrameSize; i++) {
  1.5246 +                        newFP[i] = ((int64_t *)fp)[i];
  1.5247 +                    }
  1.5248 +                    fp = (REStackFrame *)newFP;
  1.5249 +                    fStack->setSize(newStackSize);
  1.5250 +                }
  1.5251 +                fp->fInputIdx = fData[opValue+1];
  1.5252 +                
  1.5253 +                // Restore the active region bounds in the input string; they may have
  1.5254 +                //    been changed because of transparent bounds on a Region.
  1.5255 +                fActiveStart = fRegionStart;
  1.5256 +                fActiveLimit = fRegionLimit;
  1.5257 +            }
  1.5258 +            break;
  1.5259 +            
  1.5260 +        case URX_ONECHAR_I:
  1.5261 +            if (fp->fInputIdx < fActiveLimit) {
  1.5262 +                UChar32 c; 
  1.5263 +                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
  1.5264 +                if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
  1.5265 +                    break;
  1.5266 +                }
  1.5267 +            } else {
  1.5268 +                fHitEnd = TRUE;
  1.5269 +            }
  1.5270 +            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.5271 +            break;
  1.5272 +            
  1.5273 +        case URX_STRING_I:
  1.5274 +            // Case-insensitive test input against a literal string.
  1.5275 +            // Strings require two slots in the compiled pattern, one for the
  1.5276 +            //   offset to the string text, and one for the length.
  1.5277 +            //   The compiled string has already been case folded.
  1.5278 +            {
  1.5279 +                const UChar *patternString = litText + opValue;
  1.5280 +
  1.5281 +                op      = (int32_t)pat[fp->fPatIdx];
  1.5282 +                fp->fPatIdx++;
  1.5283 +                opType  = URX_TYPE(op);
  1.5284 +                opValue = URX_VAL(op);
  1.5285 +                U_ASSERT(opType == URX_STRING_LEN);
  1.5286 +                int32_t patternStringLen = opValue;  // Length of the string from the pattern.
  1.5287 +            
  1.5288 +                UChar32      cText;
  1.5289 +                UChar32      cPattern;
  1.5290 +                UBool        success = TRUE;
  1.5291 +                int32_t      patternStringIdx  = 0;
  1.5292 +                CaseFoldingUCharIterator inputIterator(inputBuf, fp->fInputIdx, fActiveLimit);
  1.5293 +                while (patternStringIdx < patternStringLen) {
  1.5294 +                    U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
  1.5295 +                    cText = inputIterator.next();
  1.5296 +                    if (cText != cPattern) {
  1.5297 +                        success = FALSE;
  1.5298 +                        if (cText == U_SENTINEL) {
  1.5299 +                            fHitEnd = TRUE;
  1.5300 +                        }
  1.5301 +                        break;
  1.5302 +                    }
  1.5303 +                }
  1.5304 +                if (inputIterator.inExpansion()) {
  1.5305 +                    success = FALSE;
  1.5306 +                }
  1.5307 +
  1.5308 +                if (success) {
  1.5309 +                    fp->fInputIdx = inputIterator.getIndex();
  1.5310 +                } else {
  1.5311 +                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.5312 +                }
  1.5313 +            }
  1.5314 +            break;
  1.5315 +
  1.5316 +        case URX_LB_START:
  1.5317 +            {
  1.5318 +                // Entering a look-behind block.
  1.5319 +                // Save Stack Ptr, Input Pos.
  1.5320 +                //   TODO:  implement transparent bounds.  Ticket #6067
  1.5321 +                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
  1.5322 +                fData[opValue]   = fStack->size();
  1.5323 +                fData[opValue+1] = fp->fInputIdx;
  1.5324 +                // Init the variable containing the start index for attempted matches.
  1.5325 +                fData[opValue+2] = -1;
  1.5326 +                // Save input string length, then reset to pin any matches to end at
  1.5327 +                //   the current position.
  1.5328 +                fData[opValue+3] = fActiveLimit;
  1.5329 +                fActiveLimit     = fp->fInputIdx;
  1.5330 +            }
  1.5331 +            break;
  1.5332 +            
  1.5333 +            
  1.5334 +        case URX_LB_CONT:
  1.5335 +            {
  1.5336 +                // Positive Look-Behind, at top of loop checking for matches of LB expression
  1.5337 +                //    at all possible input starting positions.
  1.5338 +                
  1.5339 +                // Fetch the min and max possible match lengths.  They are the operands
  1.5340 +                //   of this op in the pattern.
  1.5341 +                int32_t minML = (int32_t)pat[fp->fPatIdx++];
  1.5342 +                int32_t maxML = (int32_t)pat[fp->fPatIdx++];
  1.5343 +                U_ASSERT(minML <= maxML);
  1.5344 +                U_ASSERT(minML >= 0);
  1.5345 +                
  1.5346 +                // Fetch (from data) the last input index where a match was attempted.
  1.5347 +                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
  1.5348 +                int64_t  *lbStartIdx = &fData[opValue+2];
  1.5349 +                if (*lbStartIdx < 0) {
  1.5350 +                    // First time through loop.
  1.5351 +                    *lbStartIdx = fp->fInputIdx - minML;
  1.5352 +                } else {
  1.5353 +                    // 2nd through nth time through the loop.
  1.5354 +                    // Back up start position for match by one.
  1.5355 +                    if (*lbStartIdx == 0) {
  1.5356 +                        (*lbStartIdx)--;
  1.5357 +                    } else {
  1.5358 +                        U16_BACK_1(inputBuf, 0, *lbStartIdx);
  1.5359 +                    }
  1.5360 +                }
  1.5361 +                
  1.5362 +                if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
  1.5363 +                    // We have tried all potential match starting points without
  1.5364 +                    //  getting a match.  Backtrack out, and out of the
  1.5365 +                    //   Look Behind altogether.
  1.5366 +                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.5367 +                    int64_t restoreInputLen = fData[opValue+3];
  1.5368 +                    U_ASSERT(restoreInputLen >= fActiveLimit);
  1.5369 +                    U_ASSERT(restoreInputLen <= fInputLength);
  1.5370 +                    fActiveLimit = restoreInputLen;
  1.5371 +                    break;
  1.5372 +                }
  1.5373 +                
  1.5374 +                //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
  1.5375 +                //      (successful match will fall off the end of the loop.)
  1.5376 +                fp = StateSave(fp, fp->fPatIdx-3, status);
  1.5377 +                fp->fInputIdx =  *lbStartIdx;
  1.5378 +            }
  1.5379 +            break;
  1.5380 +            
  1.5381 +        case URX_LB_END:
  1.5382 +            // End of a look-behind block, after a successful match.
  1.5383 +            {
  1.5384 +                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
  1.5385 +                if (fp->fInputIdx != fActiveLimit) {
  1.5386 +                    //  The look-behind expression matched, but the match did not
  1.5387 +                    //    extend all the way to the point that we are looking behind from.
  1.5388 +                    //  FAIL out of here, which will take us back to the LB_CONT, which
  1.5389 +                    //     will retry the match starting at another position or fail
  1.5390 +                    //     the look-behind altogether, whichever is appropriate.
  1.5391 +                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.5392 +                    break;
  1.5393 +                }
  1.5394 +                
  1.5395 +                // Look-behind match is good.  Restore the original input string length,
  1.5396 +                //   which had been truncated to pin the end of the lookbehind match to the 
  1.5397 +                //   position being looked-behind.
  1.5398 +                int64_t originalInputLen = fData[opValue+3];
  1.5399 +                U_ASSERT(originalInputLen >= fActiveLimit);
  1.5400 +                U_ASSERT(originalInputLen <= fInputLength);
  1.5401 +                fActiveLimit = originalInputLen;
  1.5402 +            }
  1.5403 +            break;
  1.5404 +            
  1.5405 +            
  1.5406 +        case URX_LBN_CONT:
  1.5407 +            {
  1.5408 +                // Negative Look-Behind, at top of loop checking for matches of LB expression
  1.5409 +                //    at all possible input starting positions.
  1.5410 +                
  1.5411 +                // Fetch the extra parameters of this op.
  1.5412 +                int32_t minML       = (int32_t)pat[fp->fPatIdx++];
  1.5413 +                int32_t maxML       = (int32_t)pat[fp->fPatIdx++];
  1.5414 +                int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
  1.5415 +                continueLoc = URX_VAL(continueLoc);
  1.5416 +                U_ASSERT(minML <= maxML);
  1.5417 +                U_ASSERT(minML >= 0);
  1.5418 +                U_ASSERT(continueLoc > fp->fPatIdx);
  1.5419 +                
  1.5420 +                // Fetch (from data) the last input index where a match was attempted.
  1.5421 +                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
  1.5422 +                int64_t  *lbStartIdx = &fData[opValue+2];
  1.5423 +                if (*lbStartIdx < 0) {
  1.5424 +                    // First time through loop.
  1.5425 +                    *lbStartIdx = fp->fInputIdx - minML;
  1.5426 +                } else {
  1.5427 +                    // 2nd through nth time through the loop.
  1.5428 +                    // Back up start position for match by one.
  1.5429 +                    if (*lbStartIdx == 0) {
  1.5430 +                        (*lbStartIdx)--;   // Because U16_BACK is unsafe starting at 0.
  1.5431 +                    } else {
  1.5432 +                        U16_BACK_1(inputBuf, 0, *lbStartIdx);
  1.5433 +                    }
  1.5434 +                }
  1.5435 +                
  1.5436 +                if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
  1.5437 +                    // We have tried all potential match starting points without
  1.5438 +                    //  getting a match, which means that the negative lookbehind as
  1.5439 +                    //  a whole has succeeded.  Jump forward to the continue location
  1.5440 +                    int64_t restoreInputLen = fData[opValue+3];
  1.5441 +                    U_ASSERT(restoreInputLen >= fActiveLimit);
  1.5442 +                    U_ASSERT(restoreInputLen <= fInputLength);
  1.5443 +                    fActiveLimit = restoreInputLen;
  1.5444 +                    fp->fPatIdx = continueLoc;
  1.5445 +                    break;
  1.5446 +                }
  1.5447 +                
  1.5448 +                //    Save state to this URX_LBN_CONT op, so failure to match will repeat the loop.
  1.5449 +                //      (successful match will cause a FAIL out of the loop altogether.)
  1.5450 +                fp = StateSave(fp, fp->fPatIdx-4, status);
  1.5451 +                fp->fInputIdx =  *lbStartIdx;
  1.5452 +            }
  1.5453 +            break;
  1.5454 +            
  1.5455 +        case URX_LBN_END:
  1.5456 +            // End of a negative look-behind block, after a successful match.
  1.5457 +            {
  1.5458 +                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
  1.5459 +                if (fp->fInputIdx != fActiveLimit) {
  1.5460 +                    //  The look-behind expression matched, but the match did not
  1.5461 +                    //    extend all the way to the point that we are looking behind from.
  1.5462 +                    //  FAIL out of here, which will take us back to the LB_CONT, which
  1.5463 +                    //     will retry the match starting at another position or succeed
  1.5464 +                    //     the look-behind altogether, whichever is appropriate.
  1.5465 +                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.5466 +                    break;
  1.5467 +                }
  1.5468 +                
  1.5469 +                // Look-behind expression matched, which means look-behind test as
  1.5470 +                //   a whole Fails
  1.5471 +                
  1.5472 +                //   Restore the original input string length, which had been truncated 
  1.5473 +                //   in order to pin the end of the lookbehind match  
  1.5474 +                //   to the position being looked-behind.
  1.5475 +                int64_t originalInputLen = fData[opValue+3];
  1.5476 +                U_ASSERT(originalInputLen >= fActiveLimit);
  1.5477 +                U_ASSERT(originalInputLen <= fInputLength);
  1.5478 +                fActiveLimit = originalInputLen;
  1.5479 +                
  1.5480 +                // Restore original stack position, discarding any state saved
  1.5481 +                //   by the successful pattern match.
  1.5482 +                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
  1.5483 +                int32_t newStackSize = (int32_t)fData[opValue];
  1.5484 +                U_ASSERT(fStack->size() > newStackSize);
  1.5485 +                fStack->setSize(newStackSize);
  1.5486 +                
  1.5487 +                //  FAIL, which will take control back to someplace 
  1.5488 +                //  prior to entering the look-behind test.
  1.5489 +                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
  1.5490 +            }
  1.5491 +            break;
  1.5492 +            
  1.5493 +            
  1.5494 +        case URX_LOOP_SR_I:
  1.5495 +            // Loop Initialization for the optimized implementation of
  1.5496 +            //     [some character set]*
  1.5497 +            //   This op scans through all matching input.
  1.5498 +            //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
  1.5499 +            {
  1.5500 +                U_ASSERT(opValue > 0 && opValue < sets->size());
  1.5501 +                Regex8BitSet *s8 = &fPattern->fSets8[opValue];
  1.5502 +                UnicodeSet   *s  = (UnicodeSet *)sets->elementAt(opValue);
  1.5503 +                
  1.5504 +                // Loop through input, until either the input is exhausted or
  1.5505 +                //   we reach a character that is not a member of the set.
  1.5506 +                int32_t ix = (int32_t)fp->fInputIdx;
  1.5507 +                for (;;) {
  1.5508 +                    if (ix >= fActiveLimit) {
  1.5509 +                        fHitEnd = TRUE;
  1.5510 +                        break;
  1.5511 +                    }
  1.5512 +                    UChar32   c;
  1.5513 +                    U16_NEXT(inputBuf, ix, fActiveLimit, c);
  1.5514 +                    if (c<256) {
  1.5515 +                        if (s8->contains(c) == FALSE) {
  1.5516 +                            U16_BACK_1(inputBuf, 0, ix);
  1.5517 +                            break;
  1.5518 +                        }
  1.5519 +                    } else {
  1.5520 +                        if (s->contains(c) == FALSE) {
  1.5521 +                            U16_BACK_1(inputBuf, 0, ix);
  1.5522 +                            break;
  1.5523 +                        }
  1.5524 +                    }
  1.5525 +                }
  1.5526 +                
  1.5527 +                // If there were no matching characters, skip over the loop altogether.
  1.5528 +                //   The loop doesn't run at all, a * op always succeeds.
  1.5529 +                if (ix == fp->fInputIdx) {
  1.5530 +                    fp->fPatIdx++;   // skip the URX_LOOP_C op.
  1.5531 +                    break;
  1.5532 +                }
  1.5533 +                
  1.5534 +                // Peek ahead in the compiled pattern, to the URX_LOOP_C that
  1.5535 +                //   must follow.  Its operand is the stack location
  1.5536 +                //   that holds the starting input index for the match of this [set]*
  1.5537 +                int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
  1.5538 +                U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
  1.5539 +                int32_t stackLoc = URX_VAL(loopcOp);
  1.5540 +                U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
  1.5541 +                fp->fExtra[stackLoc] = fp->fInputIdx;
  1.5542 +                fp->fInputIdx = ix;
  1.5543 +                
  1.5544 +                // Save State to the URX_LOOP_C op that follows this one,
  1.5545 +                //   so that match failures in the following code will return to there.
  1.5546 +                //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
  1.5547 +                fp = StateSave(fp, fp->fPatIdx, status);
  1.5548 +                fp->fPatIdx++;
  1.5549 +            }
  1.5550 +            break;
  1.5551 +            
  1.5552 +            
  1.5553 +        case URX_LOOP_DOT_I:
  1.5554 +            // Loop Initialization for the optimized implementation of .*
  1.5555 +            //   This op scans through all remaining input.
  1.5556 +            //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
  1.5557 +            {
  1.5558 +                // Loop through input until the input is exhausted (we reach an end-of-line)
  1.5559 +                // In DOTALL mode, we can just go straight to the end of the input.
  1.5560 +                int32_t ix;
  1.5561 +                if ((opValue & 1) == 1) {
  1.5562 +                    // Dot-matches-All mode.  Jump straight to the end of the string.
  1.5563 +                    ix = (int32_t)fActiveLimit;
  1.5564 +                    fHitEnd = TRUE;
  1.5565 +                } else {
  1.5566 +                    // NOT DOT ALL mode.  Line endings do not match '.'
  1.5567 +                    // Scan forward until a line ending or end of input.
  1.5568 +                    ix = (int32_t)fp->fInputIdx;
  1.5569 +                    for (;;) {
  1.5570 +                        if (ix >= fActiveLimit) {
  1.5571 +                            fHitEnd = TRUE;
  1.5572 +                            break;
  1.5573 +                        }
  1.5574 +                        UChar32   c;
  1.5575 +                        U16_NEXT(inputBuf, ix, fActiveLimit, c);   // c = inputBuf[ix++]
  1.5576 +                        if ((c & 0x7f) <= 0x29) {          // Fast filter of non-new-line-s
  1.5577 +                            if ((c == 0x0a) ||             //  0x0a is newline in both modes.
  1.5578 +                                (((opValue & 2) == 0) &&    // IF not UNIX_LINES mode
  1.5579 +                                   ((c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029))) {
  1.5580 +                                //  char is a line ending.  Put the input pos back to the
  1.5581 +                                //    line ending char, and exit the scanning loop.
  1.5582 +                                U16_BACK_1(inputBuf, 0, ix);
  1.5583 +                                break;
  1.5584 +                            }
  1.5585 +                        }
  1.5586 +                    }
  1.5587 +                }
  1.5588 +                
  1.5589 +                // If there were no matching characters, skip over the loop altogether.
  1.5590 +                //   The loop doesn't run at all, a * op always succeeds.
  1.5591 +                if (ix == fp->fInputIdx) {
  1.5592 +                    fp->fPatIdx++;   // skip the URX_LOOP_C op.
  1.5593 +                    break;
  1.5594 +                }
  1.5595 +                
  1.5596 +                // Peek ahead in the compiled pattern, to the URX_LOOP_C that
  1.5597 +                //   must follow.  Its operand is the stack location
  1.5598 +                //   that holds the starting input index for the match of this .*
  1.5599 +                int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
  1.5600 +                U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
  1.5601 +                int32_t stackLoc = URX_VAL(loopcOp);
  1.5602 +                U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
  1.5603 +                fp->fExtra[stackLoc] = fp->fInputIdx;
  1.5604 +                fp->fInputIdx = ix;
  1.5605 +                
  1.5606 +                // Save State to the URX_LOOP_C op that follows this one,
  1.5607 +                //   so that match failures in the following code will return to there.
  1.5608 +                //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
  1.5609 +                fp = StateSave(fp, fp->fPatIdx, status);
  1.5610 +                fp->fPatIdx++;
  1.5611 +            }
  1.5612 +            break;
  1.5613 +            
  1.5614 +            
  1.5615 +        case URX_LOOP_C:
  1.5616 +            {
  1.5617 +                U_ASSERT(opValue>=0 && opValue<fFrameSize);
  1.5618 +                backSearchIndex = (int32_t)fp->fExtra[opValue];
  1.5619 +                U_ASSERT(backSearchIndex <= fp->fInputIdx);
  1.5620 +                if (backSearchIndex == fp->fInputIdx) {
  1.5621 +                    // We've backed up the input idx to the point that the loop started.
  1.5622 +                    // The loop is done.  Leave here without saving state.   
  1.5623 +                    //  Subsequent failures won't come back here.
  1.5624 +                    break;
  1.5625 +                }
  1.5626 +                // Set up for the next iteration of the loop, with input index
  1.5627 +                //   backed up by one from the last time through,
  1.5628 +                //   and a state save to this instruction in case the following code fails again.
  1.5629 +                //   (We're going backwards because this loop emulates stack unwinding, not
  1.5630 +                //    the initial scan forward.)
  1.5631 +                U_ASSERT(fp->fInputIdx > 0);
  1.5632 +                UChar32 prevC;
  1.5633 +                U16_PREV(inputBuf, 0, fp->fInputIdx, prevC); // !!!: should this 0 be one of f*Limit?
  1.5634 +                
  1.5635 +                if (prevC == 0x0a && 
  1.5636 +                    fp->fInputIdx > backSearchIndex &&
  1.5637 +                    inputBuf[fp->fInputIdx-1] == 0x0d) {
  1.5638 +                    int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
  1.5639 +                    if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
  1.5640 +                        // .*, stepping back over CRLF pair.
  1.5641 +                        U16_BACK_1(inputBuf, 0, fp->fInputIdx);
  1.5642 +                    }
  1.5643 +                }
  1.5644 +                
  1.5645 +                
  1.5646 +                fp = StateSave(fp, fp->fPatIdx-1, status);
  1.5647 +            }
  1.5648 +            break;
  1.5649 +            
  1.5650 +            
  1.5651 +            
  1.5652 +        default:
  1.5653 +            // Trouble.  The compiled pattern contains an entry with an
  1.5654 +            //           unrecognized type tag.
  1.5655 +            U_ASSERT(FALSE);
  1.5656 +        }
  1.5657 +        
  1.5658 +        if (U_FAILURE(status)) {
  1.5659 +            isMatch = FALSE;
  1.5660 +            break;
  1.5661 +        }
  1.5662 +    }
  1.5663 +    
  1.5664 +breakFromLoop:
  1.5665 +    fMatch = isMatch;
  1.5666 +    if (isMatch) {
  1.5667 +        fLastMatchEnd = fMatchEnd;
  1.5668 +        fMatchStart   = startIdx;
  1.5669 +        fMatchEnd     = fp->fInputIdx;
  1.5670 +        if (fTraceDebug) {
  1.5671 +            REGEX_RUN_DEBUG_PRINTF(("Match.  start=%ld   end=%ld\n\n", fMatchStart, fMatchEnd));
  1.5672 +        }
  1.5673 +    }
  1.5674 +    else
  1.5675 +    {
  1.5676 +        if (fTraceDebug) {
  1.5677 +            REGEX_RUN_DEBUG_PRINTF(("No match\n\n"));
  1.5678 +        }
  1.5679 +    }
  1.5680 +    
  1.5681 +    fFrame = fp;                // The active stack frame when the engine stopped.
  1.5682 +    //   Contains the capture group results that we need to
  1.5683 +    //    access later.
  1.5684 +
  1.5685 +    return;
  1.5686 +}
  1.5687 +
  1.5688 +
  1.5689 +UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher)
  1.5690 +
  1.5691 +U_NAMESPACE_END
  1.5692 +
  1.5693 +#endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS

mercurial