intl/icu/source/i18n/regexcmp.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/i18n/regexcmp.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,4340 @@
     1.4 +//
     1.5 +//  file:  regexcmp.cpp
     1.6 +//
     1.7 +//  Copyright (C) 2002-2013 International Business Machines Corporation and others.
     1.8 +//  All Rights Reserved.
     1.9 +//
    1.10 +//  This file contains the ICU regular expression compiler, which is responsible
    1.11 +//  for processing a regular expression pattern into the compiled form that
    1.12 +//  is used by the match finding engine.
    1.13 +//
    1.14 +
    1.15 +#include "unicode/utypes.h"
    1.16 +
    1.17 +#if !UCONFIG_NO_REGULAR_EXPRESSIONS
    1.18 +
    1.19 +#include "unicode/ustring.h"
    1.20 +#include "unicode/unistr.h"
    1.21 +#include "unicode/uniset.h"
    1.22 +#include "unicode/uchar.h"
    1.23 +#include "unicode/uchriter.h"
    1.24 +#include "unicode/parsepos.h"
    1.25 +#include "unicode/parseerr.h"
    1.26 +#include "unicode/regex.h"
    1.27 +#include "unicode/utf.h"
    1.28 +#include "unicode/utf16.h"
    1.29 +#include "patternprops.h"
    1.30 +#include "putilimp.h"
    1.31 +#include "cmemory.h"
    1.32 +#include "cstring.h"
    1.33 +#include "uvectr32.h"
    1.34 +#include "uvectr64.h"
    1.35 +#include "uassert.h"
    1.36 +#include "ucln_in.h"
    1.37 +#include "uinvchar.h"
    1.38 +
    1.39 +#include "regeximp.h"
    1.40 +#include "regexcst.h"   // Contains state table for the regex pattern parser.
    1.41 +                        //   generated by a Perl script.
    1.42 +#include "regexcmp.h"
    1.43 +#include "regexst.h"
    1.44 +#include "regextxt.h"
    1.45 +
    1.46 +
    1.47 +
    1.48 +U_NAMESPACE_BEGIN
    1.49 +
    1.50 +
    1.51 +//------------------------------------------------------------------------------
    1.52 +//
    1.53 +//  Constructor.
    1.54 +//
    1.55 +//------------------------------------------------------------------------------
    1.56 +RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) :
    1.57 +   fParenStack(status), fSetStack(status), fSetOpStack(status)
    1.58 +{
    1.59 +    // Lazy init of all shared global sets (needed for init()'s empty text)
    1.60 +    RegexStaticSets::initGlobals(&status);
    1.61 +
    1.62 +    fStatus           = &status;
    1.63 +
    1.64 +    fRXPat            = rxp;
    1.65 +    fScanIndex        = 0;
    1.66 +    fLastChar         = -1;
    1.67 +    fPeekChar         = -1;
    1.68 +    fLineNum          = 1;
    1.69 +    fCharNum          = 0;
    1.70 +    fQuoteMode        = FALSE;
    1.71 +    fInBackslashQuote = FALSE;
    1.72 +    fModeFlags        = fRXPat->fFlags | 0x80000000;
    1.73 +    fEOLComments      = TRUE;
    1.74 +
    1.75 +    fMatchOpenParen   = -1;
    1.76 +    fMatchCloseParen  = -1;
    1.77 +
    1.78 +    if (U_SUCCESS(status) && U_FAILURE(rxp->fDeferredStatus)) {
    1.79 +        status = rxp->fDeferredStatus;
    1.80 +    }
    1.81 +}
    1.82 +
    1.83 +static const UChar      chAmp       = 0x26;      // '&'
    1.84 +static const UChar      chDash      = 0x2d;      // '-'
    1.85 +
    1.86 +
    1.87 +//------------------------------------------------------------------------------
    1.88 +//
    1.89 +//  Destructor
    1.90 +//
    1.91 +//------------------------------------------------------------------------------
    1.92 +RegexCompile::~RegexCompile() {
    1.93 +}
    1.94 +
    1.95 +static inline void addCategory(UnicodeSet *set, int32_t value, UErrorCode& ec) {
    1.96 +    set->addAll(UnicodeSet().applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, value, ec));
    1.97 +}
    1.98 +
    1.99 +//------------------------------------------------------------------------------
   1.100 +//
   1.101 +//  Compile regex pattern.   The state machine for rexexp pattern parsing is here.
   1.102 +//                           The state tables are hand-written in the file regexcst.txt,
   1.103 +//                           and converted to the form used here by a perl
   1.104 +//                           script regexcst.pl
   1.105 +//
   1.106 +//------------------------------------------------------------------------------
   1.107 +void    RegexCompile::compile(
   1.108 +                         const UnicodeString &pat,   // Source pat to be compiled.
   1.109 +                         UParseError &pp,            // Error position info
   1.110 +                         UErrorCode &e)              // Error Code
   1.111 +{
   1.112 +    fRXPat->fPatternString = new UnicodeString(pat);
   1.113 +    UText patternText = UTEXT_INITIALIZER;
   1.114 +    utext_openConstUnicodeString(&patternText, fRXPat->fPatternString, &e);
   1.115 +    
   1.116 +    if (U_SUCCESS(e)) {
   1.117 +        compile(&patternText, pp, e);
   1.118 +        utext_close(&patternText);
   1.119 +    }
   1.120 +}
   1.121 +
   1.122 +//
   1.123 +//   compile, UText mode
   1.124 +//     All the work is actually done here.
   1.125 +//
   1.126 +void    RegexCompile::compile(
   1.127 +                         UText *pat,                 // Source pat to be compiled.
   1.128 +                         UParseError &pp,            // Error position info
   1.129 +                         UErrorCode &e)              // Error Code
   1.130 +{
   1.131 +    fStatus             = &e;
   1.132 +    fParseErr           = &pp;
   1.133 +    fStackPtr           = 0;
   1.134 +    fStack[fStackPtr]   = 0;
   1.135 +
   1.136 +    if (U_FAILURE(*fStatus)) {
   1.137 +        return;
   1.138 +    }
   1.139 +
   1.140 +    // There should be no pattern stuff in the RegexPattern object.  They can not be reused.
   1.141 +    U_ASSERT(fRXPat->fPattern == NULL || utext_nativeLength(fRXPat->fPattern) == 0);
   1.142 +
   1.143 +    // Prepare the RegexPattern object to receive the compiled pattern.
   1.144 +    fRXPat->fPattern        = utext_clone(fRXPat->fPattern, pat, FALSE, TRUE, fStatus);
   1.145 +    fRXPat->fStaticSets     = RegexStaticSets::gStaticSets->fPropSets;
   1.146 +    fRXPat->fStaticSets8    = RegexStaticSets::gStaticSets->fPropSets8;
   1.147 +
   1.148 +
   1.149 +    // Initialize the pattern scanning state machine
   1.150 +    fPatternLength = utext_nativeLength(pat);
   1.151 +    uint16_t                state = 1;
   1.152 +    const RegexTableEl      *tableEl;
   1.153 +
   1.154 +    // UREGEX_LITERAL force entire pattern to be treated as a literal string.
   1.155 +    if (fModeFlags & UREGEX_LITERAL) {
   1.156 +        fQuoteMode = TRUE;
   1.157 +    }
   1.158 +
   1.159 +    nextChar(fC);                        // Fetch the first char from the pattern string.
   1.160 +
   1.161 +    //
   1.162 +    // Main loop for the regex pattern parsing state machine.
   1.163 +    //   Runs once per state transition.
   1.164 +    //   Each time through optionally performs, depending on the state table,
   1.165 +    //      - an advance to the the next pattern char
   1.166 +    //      - an action to be performed.
   1.167 +    //      - pushing or popping a state to/from the local state return stack.
   1.168 +    //   file regexcst.txt is the source for the state table.  The logic behind
   1.169 +    //     recongizing the pattern syntax is there, not here.
   1.170 +    //
   1.171 +    for (;;) {
   1.172 +        //  Bail out if anything has gone wrong.
   1.173 +        //  Regex pattern parsing stops on the first error encountered.
   1.174 +        if (U_FAILURE(*fStatus)) {
   1.175 +            break;
   1.176 +        }
   1.177 +
   1.178 +        U_ASSERT(state != 0);
   1.179 +
   1.180 +        // Find the state table element that matches the input char from the pattern, or the
   1.181 +        //    class of the input character.  Start with the first table row for this
   1.182 +        //    state, then linearly scan forward until we find a row that matches the
   1.183 +        //    character.  The last row for each state always matches all characters, so
   1.184 +        //    the search will stop there, if not before.
   1.185 +        //
   1.186 +        tableEl = &gRuleParseStateTable[state];
   1.187 +        REGEX_SCAN_DEBUG_PRINTF(("char, line, col = (\'%c\', %d, %d)    state=%s ",
   1.188 +            fC.fChar, fLineNum, fCharNum, RegexStateNames[state]));
   1.189 +
   1.190 +        for (;;) {    // loop through table rows belonging to this state, looking for one
   1.191 +                      //   that matches the current input char.
   1.192 +            REGEX_SCAN_DEBUG_PRINTF(("."));
   1.193 +            if (tableEl->fCharClass < 127 && fC.fQuoted == FALSE &&   tableEl->fCharClass == fC.fChar) {
   1.194 +                // Table row specified an individual character, not a set, and
   1.195 +                //   the input character is not quoted, and
   1.196 +                //   the input character matched it.
   1.197 +                break;
   1.198 +            }
   1.199 +            if (tableEl->fCharClass == 255) {
   1.200 +                // Table row specified default, match anything character class.
   1.201 +                break;
   1.202 +            }
   1.203 +            if (tableEl->fCharClass == 254 && fC.fQuoted)  {
   1.204 +                // Table row specified "quoted" and the char was quoted.
   1.205 +                break;
   1.206 +            }
   1.207 +            if (tableEl->fCharClass == 253 && fC.fChar == (UChar32)-1)  {
   1.208 +                // Table row specified eof and we hit eof on the input.
   1.209 +                break;
   1.210 +            }
   1.211 +
   1.212 +            if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 &&   // Table specs a char class &&
   1.213 +                fC.fQuoted == FALSE &&                                       //   char is not escaped &&
   1.214 +                fC.fChar != (UChar32)-1) {                                   //   char is not EOF
   1.215 +                U_ASSERT(tableEl->fCharClass <= 137);
   1.216 +                if (RegexStaticSets::gStaticSets->fRuleSets[tableEl->fCharClass-128].contains(fC.fChar)) {
   1.217 +                    // Table row specified a character class, or set of characters,
   1.218 +                    //   and the current char matches it.
   1.219 +                    break;
   1.220 +                }
   1.221 +            }
   1.222 +
   1.223 +            // No match on this row, advance to the next  row for this state,
   1.224 +            tableEl++;
   1.225 +        }
   1.226 +        REGEX_SCAN_DEBUG_PRINTF(("\n"));
   1.227 +
   1.228 +        //
   1.229 +        // We've found the row of the state table that matches the current input
   1.230 +        //   character from the rules string.
   1.231 +        // Perform any action specified  by this row in the state table.
   1.232 +        if (doParseActions(tableEl->fAction) == FALSE) {
   1.233 +            // Break out of the state machine loop if the
   1.234 +            //   the action signalled some kind of error, or
   1.235 +            //   the action was to exit, occurs on normal end-of-rules-input.
   1.236 +            break;
   1.237 +        }
   1.238 +
   1.239 +        if (tableEl->fPushState != 0) {
   1.240 +            fStackPtr++;
   1.241 +            if (fStackPtr >= kStackSize) {
   1.242 +                error(U_REGEX_INTERNAL_ERROR);
   1.243 +                REGEX_SCAN_DEBUG_PRINTF(("RegexCompile::parse() - state stack overflow.\n"));
   1.244 +                fStackPtr--;
   1.245 +            }
   1.246 +            fStack[fStackPtr] = tableEl->fPushState;
   1.247 +        }
   1.248 +
   1.249 +        //
   1.250 +        //  NextChar.  This is where characters are actually fetched from the pattern.
   1.251 +        //             Happens under control of the 'n' tag in the state table.
   1.252 +        //
   1.253 +        if (tableEl->fNextChar) {
   1.254 +            nextChar(fC);
   1.255 +        }
   1.256 +
   1.257 +        // Get the next state from the table entry, or from the
   1.258 +        //   state stack if the next state was specified as "pop".
   1.259 +        if (tableEl->fNextState != 255) {
   1.260 +            state = tableEl->fNextState;
   1.261 +        } else {
   1.262 +            state = fStack[fStackPtr];
   1.263 +            fStackPtr--;
   1.264 +            if (fStackPtr < 0) {
   1.265 +                // state stack underflow
   1.266 +                // This will occur if the user pattern has mis-matched parentheses,
   1.267 +                //   with extra close parens.
   1.268 +                //
   1.269 +                fStackPtr++;
   1.270 +                error(U_REGEX_MISMATCHED_PAREN);
   1.271 +            }
   1.272 +        }
   1.273 +
   1.274 +    }
   1.275 +
   1.276 +    if (U_FAILURE(*fStatus)) {
   1.277 +        // Bail out if the pattern had errors.
   1.278 +        //   Set stack cleanup:  a successful compile would have left it empty,
   1.279 +        //   but errors can leave temporary sets hanging around.
   1.280 +        while (!fSetStack.empty()) {
   1.281 +            delete (UnicodeSet *)fSetStack.pop();
   1.282 +        }
   1.283 +        return;
   1.284 +    }
   1.285 +
   1.286 +    //
   1.287 +    // The pattern has now been read and processed, and the compiled code generated.
   1.288 +    //
   1.289 +
   1.290 +    //
   1.291 +    // Compute the number of digits requried for the largest capture group number.
   1.292 +    //
   1.293 +    fRXPat->fMaxCaptureDigits = 1;
   1.294 +    int32_t  n = 10;
   1.295 +    int32_t  groupCount = fRXPat->fGroupMap->size();
   1.296 +    while (n <= groupCount) {
   1.297 +        fRXPat->fMaxCaptureDigits++;
   1.298 +        n *= 10;
   1.299 +    }
   1.300 +
   1.301 +    //
   1.302 +    // The pattern's fFrameSize so far has accumulated the requirements for
   1.303 +    //   storage for capture parentheses, counters, etc. that are encountered
   1.304 +    //   in the pattern.  Add space for the two variables that are always
   1.305 +    //   present in the saved state:  the input string position (int64_t) and
   1.306 +    //   the position in the compiled pattern.
   1.307 +    //
   1.308 +    fRXPat->fFrameSize+=RESTACKFRAME_HDRCOUNT;
   1.309 +
   1.310 +    //
   1.311 +    // Optimization pass 1: NOPs, back-references, and case-folding
   1.312 +    //
   1.313 +    stripNOPs();
   1.314 +
   1.315 +    //
   1.316 +    // Get bounds for the minimum and maximum length of a string that this
   1.317 +    //   pattern can match.  Used to avoid looking for matches in strings that
   1.318 +    //   are too short.
   1.319 +    //
   1.320 +    fRXPat->fMinMatchLen = minMatchLength(3, fRXPat->fCompiledPat->size()-1);
   1.321 +
   1.322 +    //
   1.323 +    // Optimization pass 2: match start type
   1.324 +    //
   1.325 +    matchStartType();
   1.326 +
   1.327 +    //
   1.328 +    // Set up fast latin-1 range sets
   1.329 +    //
   1.330 +    int32_t numSets = fRXPat->fSets->size();
   1.331 +    fRXPat->fSets8 = new Regex8BitSet[numSets];
   1.332 +    // Null pointer check.
   1.333 +    if (fRXPat->fSets8 == NULL) {
   1.334 +        e = *fStatus = U_MEMORY_ALLOCATION_ERROR;
   1.335 +        return;
   1.336 +    }
   1.337 +    int32_t i;
   1.338 +    for (i=0; i<numSets; i++) {
   1.339 +        UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(i);
   1.340 +        fRXPat->fSets8[i].init(s);
   1.341 +    }
   1.342 +
   1.343 +}
   1.344 +
   1.345 +
   1.346 +
   1.347 +
   1.348 +
   1.349 +//------------------------------------------------------------------------------
   1.350 +//
   1.351 +//  doParseAction        Do some action during regex pattern parsing.
   1.352 +//                       Called by the parse state machine.
   1.353 +//
   1.354 +//                       Generation of the match engine PCode happens here, or
   1.355 +//                       in functions called from the parse actions defined here.
   1.356 +//
   1.357 +//
   1.358 +//------------------------------------------------------------------------------
   1.359 +UBool RegexCompile::doParseActions(int32_t action)
   1.360 +{
   1.361 +    UBool   returnVal = TRUE;
   1.362 +
   1.363 +    switch ((Regex_PatternParseAction)action) {
   1.364 +
   1.365 +    case doPatStart:
   1.366 +        // Start of pattern compiles to:
   1.367 +        //0   SAVE   2        Fall back to position of FAIL
   1.368 +        //1   jmp    3
   1.369 +        //2   FAIL            Stop if we ever reach here.
   1.370 +        //3   NOP             Dummy, so start of pattern looks the same as
   1.371 +        //                    the start of an ( grouping.
   1.372 +        //4   NOP             Resreved, will be replaced by a save if there are
   1.373 +        //                    OR | operators at the top level
   1.374 +        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_STATE_SAVE, 2), *fStatus);
   1.375 +        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_JMP,  3), *fStatus);
   1.376 +        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_FAIL, 0), *fStatus);
   1.377 +
   1.378 +        // Standard open nonCapture paren action emits the two NOPs and
   1.379 +        //   sets up the paren stack frame.
   1.380 +        doParseActions(doOpenNonCaptureParen);
   1.381 +        break;
   1.382 +
   1.383 +    case doPatFinish:
   1.384 +        // We've scanned to the end of the pattern
   1.385 +        //  The end of pattern compiles to:
   1.386 +        //        URX_END
   1.387 +        //    which will stop the runtime match engine.
   1.388 +        //  Encountering end of pattern also behaves like a close paren,
   1.389 +        //   and forces fixups of the State Save at the beginning of the compiled pattern
   1.390 +        //   and of any OR operations at the top level.
   1.391 +        //
   1.392 +        handleCloseParen();
   1.393 +        if (fParenStack.size() > 0) {
   1.394 +            // Missing close paren in pattern.
   1.395 +            error(U_REGEX_MISMATCHED_PAREN);
   1.396 +        }
   1.397 +
   1.398 +        // add the END operation to the compiled pattern.
   1.399 +        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_END, 0), *fStatus);
   1.400 +
   1.401 +        // Terminate the pattern compilation state machine.
   1.402 +        returnVal = FALSE;
   1.403 +        break;
   1.404 +
   1.405 +
   1.406 +
   1.407 +    case doOrOperator:
   1.408 +        // Scanning a '|', as in (A|B)
   1.409 +        {
   1.410 +            // Generate code for any pending literals preceding the '|'
   1.411 +            fixLiterals(FALSE);
   1.412 +
   1.413 +            // Insert a SAVE operation at the start of the pattern section preceding
   1.414 +            //   this OR at this level.  This SAVE will branch the match forward
   1.415 +            //   to the right hand side of the OR in the event that the left hand
   1.416 +            //   side fails to match and backtracks.  Locate the position for the
   1.417 +            //   save from the location on the top of the parentheses stack.
   1.418 +            int32_t savePosition = fParenStack.popi();
   1.419 +            int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(savePosition);
   1.420 +            U_ASSERT(URX_TYPE(op) == URX_NOP);  // original contents of reserved location
   1.421 +            op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+1);
   1.422 +            fRXPat->fCompiledPat->setElementAt(op, savePosition);
   1.423 +
   1.424 +            // Append an JMP operation into the compiled pattern.  The operand for
   1.425 +            //  the JMP will eventually be the location following the ')' for the
   1.426 +            //  group.  This will be patched in later, when the ')' is encountered.
   1.427 +            op = URX_BUILD(URX_JMP, 0);
   1.428 +            fRXPat->fCompiledPat->addElement(op, *fStatus);
   1.429 +
   1.430 +            // Push the position of the newly added JMP op onto the parentheses stack.
   1.431 +            // This registers if for fixup when this block's close paren is encountered.
   1.432 +            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
   1.433 +
   1.434 +            // Append a NOP to the compiled pattern.  This is the slot reserved
   1.435 +            //   for a SAVE in the event that there is yet another '|' following
   1.436 +            //   this one.
   1.437 +            fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
   1.438 +            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
   1.439 +        }
   1.440 +        break;
   1.441 +
   1.442 +
   1.443 +    case doOpenCaptureParen:
   1.444 +        // Open Paren.
   1.445 +        //   Compile to a
   1.446 +        //      - NOP, which later may be replaced by a save-state if the
   1.447 +        //         parenthesized group gets a * quantifier, followed by
   1.448 +        //      - START_CAPTURE  n    where n is stack frame offset to the capture group variables.
   1.449 +        //      - NOP, which may later be replaced by a save-state if there
   1.450 +        //             is an '|' alternation within the parens.
   1.451 +        //
   1.452 +        //    Each capture group gets three slots in the save stack frame:
   1.453 +        //         0: Capture Group start position (in input string being matched.)
   1.454 +        //         1: Capture Group end position.
   1.455 +        //         2: Start of Match-in-progress.
   1.456 +        //    The first two locations are for a completed capture group, and are
   1.457 +        //     referred to by back references and the like.
   1.458 +        //    The third location stores the capture start position when an START_CAPTURE is
   1.459 +        //      encountered.  This will be promoted to a completed capture when (and if) the corresponding
   1.460 +        //      END_CAPTURE is encountered.
   1.461 +        {
   1.462 +            fixLiterals();
   1.463 +            fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
   1.464 +            int32_t  varsLoc    = fRXPat->fFrameSize;    // Reserve three slots in match stack frame.
   1.465 +            fRXPat->fFrameSize += 3;
   1.466 +            int32_t  cop        = URX_BUILD(URX_START_CAPTURE, varsLoc);
   1.467 +            fRXPat->fCompiledPat->addElement(cop, *fStatus);
   1.468 +            fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
   1.469 +
   1.470 +            // On the Parentheses stack, start a new frame and add the postions
   1.471 +            //   of the two NOPs.  Depending on what follows in the pattern, the
   1.472 +            //   NOPs may be changed to SAVE_STATE or JMP ops, with a target
   1.473 +            //   address of the end of the parenthesized group.
   1.474 +            fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
   1.475 +            fParenStack.push(capturing, *fStatus);                        // Frame type.
   1.476 +            fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus);   // The first  NOP location
   1.477 +            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP loc
   1.478 +
   1.479 +            // Save the mapping from group number to stack frame variable position.
   1.480 +            fRXPat->fGroupMap->addElement(varsLoc, *fStatus);
   1.481 +        }
   1.482 +         break;
   1.483 +
   1.484 +    case doOpenNonCaptureParen:
   1.485 +        // Open non-caputuring (grouping only) Paren.
   1.486 +        //   Compile to a
   1.487 +        //      - NOP, which later may be replaced by a save-state if the
   1.488 +        //         parenthesized group gets a * quantifier, followed by
   1.489 +        //      - NOP, which may later be replaced by a save-state if there
   1.490 +        //             is an '|' alternation within the parens.
   1.491 +        {
   1.492 +            fixLiterals();
   1.493 +            fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
   1.494 +            fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
   1.495 +
   1.496 +            // On the Parentheses stack, start a new frame and add the postions
   1.497 +            //   of the two NOPs.
   1.498 +            fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
   1.499 +            fParenStack.push(plain,      *fStatus);                       // Begin a new frame.
   1.500 +            fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first  NOP location
   1.501 +            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP loc
   1.502 +        }
   1.503 +         break;
   1.504 +
   1.505 +
   1.506 +    case doOpenAtomicParen:
   1.507 +        // Open Atomic Paren.  (?>
   1.508 +        //   Compile to a
   1.509 +        //      - NOP, which later may be replaced if the parenthesized group
   1.510 +        //         has a quantifier, followed by
   1.511 +        //      - STO_SP  save state stack position, so it can be restored at the ")"
   1.512 +        //      - NOP, which may later be replaced by a save-state if there
   1.513 +        //             is an '|' alternation within the parens.
   1.514 +        {
   1.515 +            fixLiterals();
   1.516 +            fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
   1.517 +            int32_t  varLoc    = fRXPat->fDataSize;    // Reserve a data location for saving the
   1.518 +            fRXPat->fDataSize += 1;                    //  state stack ptr.
   1.519 +            int32_t  stoOp     = URX_BUILD(URX_STO_SP, varLoc);
   1.520 +            fRXPat->fCompiledPat->addElement(stoOp, *fStatus);
   1.521 +            fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
   1.522 +
   1.523 +            // On the Parentheses stack, start a new frame and add the postions
   1.524 +            //   of the two NOPs.  Depending on what follows in the pattern, the
   1.525 +            //   NOPs may be changed to SAVE_STATE or JMP ops, with a target
   1.526 +            //   address of the end of the parenthesized group.
   1.527 +            fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
   1.528 +            fParenStack.push(atomic, *fStatus);                           // Frame type.
   1.529 +            fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus);   // The first NOP
   1.530 +            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP
   1.531 +        }
   1.532 +        break;
   1.533 +
   1.534 +
   1.535 +    case doOpenLookAhead:
   1.536 +        // Positive Look-ahead   (?=  stuff  )
   1.537 +        //
   1.538 +        //   Note:   Addition of transparent input regions, with the need to
   1.539 +        //           restore the original regions when failing out of a lookahead
   1.540 +        //           block, complicated this sequence.  Some conbined opcodes
   1.541 +        //           might make sense - or might not, lookahead aren't that common.
   1.542 +        //
   1.543 +        //      Caution:  min match length optimization knows about this
   1.544 +        //               sequence; don't change without making updates there too.
   1.545 +        //
   1.546 +        // Compiles to
   1.547 +        //    1    START_LA     dataLoc     Saves SP, Input Pos
   1.548 +        //    2.   STATE_SAVE   4            on failure of lookahead, goto 4
   1.549 +        //    3    JMP          6           continue ...
   1.550 +        //
   1.551 +        //    4.   LA_END                   Look Ahead failed.  Restore regions.
   1.552 +        //    5.   BACKTRACK                and back track again.
   1.553 +        //
   1.554 +        //    6.   NOP              reserved for use by quantifiers on the block.
   1.555 +        //                          Look-ahead can't have quantifiers, but paren stack
   1.556 +        //                             compile time conventions require the slot anyhow.
   1.557 +        //    7.   NOP              may be replaced if there is are '|' ops in the block.
   1.558 +        //    8.     code for parenthesized stuff.
   1.559 +        //    9.   LA_END
   1.560 +        //
   1.561 +        //  Two data slots are reserved, for saving the stack ptr and the input position.
   1.562 +        {
   1.563 +            fixLiterals();
   1.564 +            int32_t dataLoc = fRXPat->fDataSize;
   1.565 +            fRXPat->fDataSize += 2;
   1.566 +            int32_t op = URX_BUILD(URX_LA_START, dataLoc);
   1.567 +            fRXPat->fCompiledPat->addElement(op, *fStatus);
   1.568 +
   1.569 +            op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+ 2);
   1.570 +            fRXPat->fCompiledPat->addElement(op, *fStatus);
   1.571 +
   1.572 +            op = URX_BUILD(URX_JMP, fRXPat->fCompiledPat->size()+ 3);
   1.573 +            fRXPat->fCompiledPat->addElement(op, *fStatus);
   1.574 +            
   1.575 +            op = URX_BUILD(URX_LA_END, dataLoc);
   1.576 +            fRXPat->fCompiledPat->addElement(op, *fStatus);
   1.577 +
   1.578 +            op = URX_BUILD(URX_BACKTRACK, 0);
   1.579 +            fRXPat->fCompiledPat->addElement(op, *fStatus);
   1.580 +            
   1.581 +            op = URX_BUILD(URX_NOP, 0);
   1.582 +            fRXPat->fCompiledPat->addElement(op, *fStatus);
   1.583 +            fRXPat->fCompiledPat->addElement(op, *fStatus);
   1.584 +
   1.585 +            // On the Parentheses stack, start a new frame and add the postions
   1.586 +            //   of the NOPs.
   1.587 +            fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
   1.588 +            fParenStack.push(lookAhead, *fStatus);                        // Frame type.
   1.589 +            fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first  NOP location
   1.590 +            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP location
   1.591 +        }
   1.592 +        break;
   1.593 +
   1.594 +    case doOpenLookAheadNeg:
   1.595 +        // Negated Lookahead.   (?! stuff )
   1.596 +        // Compiles to
   1.597 +        //    1.    START_LA    dataloc
   1.598 +        //    2.    SAVE_STATE  7         // Fail within look-ahead block restores to this state,
   1.599 +        //                                //   which continues with the match.
   1.600 +        //    3.    NOP                   // Std. Open Paren sequence, for possible '|'
   1.601 +        //    4.       code for parenthesized stuff.
   1.602 +        //    5.    END_LA                // Cut back stack, remove saved state from step 2.
   1.603 +        //    6.    BACKTRACK             // code in block succeeded, so neg. lookahead fails.
   1.604 +        //    7.    END_LA                // Restore match region, in case look-ahead was using
   1.605 +        //                                        an alternate (transparent) region.
   1.606 +        {
   1.607 +            fixLiterals();
   1.608 +            int32_t dataLoc = fRXPat->fDataSize;
   1.609 +            fRXPat->fDataSize += 2;
   1.610 +            int32_t op = URX_BUILD(URX_LA_START, dataLoc);
   1.611 +            fRXPat->fCompiledPat->addElement(op, *fStatus);
   1.612 +
   1.613 +            op = URX_BUILD(URX_STATE_SAVE, 0);    // dest address will be patched later.
   1.614 +            fRXPat->fCompiledPat->addElement(op, *fStatus);
   1.615 +
   1.616 +            op = URX_BUILD(URX_NOP, 0);
   1.617 +            fRXPat->fCompiledPat->addElement(op, *fStatus);
   1.618 +
   1.619 +            // On the Parentheses stack, start a new frame and add the postions
   1.620 +            //   of the StateSave and NOP.
   1.621 +            fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
   1.622 +            fParenStack.push(negLookAhead, *fStatus);                    // Frame type
   1.623 +            fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The STATE_SAVE location
   1.624 +            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP location
   1.625 +
   1.626 +            // Instructions #5 - #7 will be added when the ')' is encountered.
   1.627 +        }
   1.628 +        break;
   1.629 +
   1.630 +    case doOpenLookBehind:
   1.631 +        {
   1.632 +            //   Compile a (?<= look-behind open paren.
   1.633 +            //
   1.634 +            //          Compiles to
   1.635 +            //              0       URX_LB_START     dataLoc
   1.636 +            //              1       URX_LB_CONT      dataLoc
   1.637 +            //              2                        MinMatchLen
   1.638 +            //              3                        MaxMatchLen
   1.639 +            //              4       URX_NOP          Standard '(' boilerplate.
   1.640 +            //              5       URX_NOP          Reserved slot for use with '|' ops within (block).
   1.641 +            //              6         <code for LookBehind expression>
   1.642 +            //              7       URX_LB_END       dataLoc    # Check match len, restore input  len
   1.643 +            //              8       URX_LA_END       dataLoc    # Restore stack, input pos
   1.644 +            //
   1.645 +            //          Allocate a block of matcher data, to contain (when running a match)
   1.646 +            //              0:    Stack ptr on entry
   1.647 +            //              1:    Input Index on entry
   1.648 +            //              2:    Start index of match current match attempt.
   1.649 +            //              3:    Original Input String len.
   1.650 +
   1.651 +            // Generate match code for any pending literals.
   1.652 +            fixLiterals();
   1.653 +
   1.654 +            // Allocate data space
   1.655 +            int32_t dataLoc = fRXPat->fDataSize;
   1.656 +            fRXPat->fDataSize += 4;
   1.657 +
   1.658 +            // Emit URX_LB_START
   1.659 +            int32_t op = URX_BUILD(URX_LB_START, dataLoc);
   1.660 +            fRXPat->fCompiledPat->addElement(op, *fStatus);
   1.661 +
   1.662 +            // Emit URX_LB_CONT
   1.663 +            op = URX_BUILD(URX_LB_CONT, dataLoc);
   1.664 +            fRXPat->fCompiledPat->addElement(op, *fStatus);
   1.665 +            fRXPat->fCompiledPat->addElement(0,  *fStatus);    // MinMatchLength.  To be filled later.
   1.666 +            fRXPat->fCompiledPat->addElement(0,  *fStatus);    // MaxMatchLength.  To be filled later.
   1.667 +
   1.668 +            // Emit the NOP
   1.669 +            op = URX_BUILD(URX_NOP, 0);
   1.670 +            fRXPat->fCompiledPat->addElement(op, *fStatus);
   1.671 +            fRXPat->fCompiledPat->addElement(op, *fStatus);
   1.672 +
   1.673 +            // On the Parentheses stack, start a new frame and add the postions
   1.674 +            //   of the URX_LB_CONT and the NOP.
   1.675 +            fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
   1.676 +            fParenStack.push(lookBehind, *fStatus);                       // Frame type
   1.677 +            fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP location
   1.678 +            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The 2nd   NOP location
   1.679 +
   1.680 +            // The final two instructions will be added when the ')' is encountered.
   1.681 +        }
   1.682 +
   1.683 +        break;
   1.684 +
   1.685 +    case doOpenLookBehindNeg:
   1.686 +        {
   1.687 +            //   Compile a (?<! negated look-behind open paren.
   1.688 +            //
   1.689 +            //          Compiles to
   1.690 +            //              0       URX_LB_START     dataLoc    # Save entry stack, input len
   1.691 +            //              1       URX_LBN_CONT     dataLoc    # Iterate possible match positions
   1.692 +            //              2                        MinMatchLen
   1.693 +            //              3                        MaxMatchLen
   1.694 +            //              4                        continueLoc (9)
   1.695 +            //              5       URX_NOP          Standard '(' boilerplate.
   1.696 +            //              6       URX_NOP          Reserved slot for use with '|' ops within (block).
   1.697 +            //              7         <code for LookBehind expression>
   1.698 +            //              8       URX_LBN_END      dataLoc    # Check match len, cause a FAIL
   1.699 +            //              9       ...
   1.700 +            //
   1.701 +            //          Allocate a block of matcher data, to contain (when running a match)
   1.702 +            //              0:    Stack ptr on entry
   1.703 +            //              1:    Input Index on entry
   1.704 +            //              2:    Start index of match current match attempt.
   1.705 +            //              3:    Original Input String len.
   1.706 +
   1.707 +            // Generate match code for any pending literals.
   1.708 +            fixLiterals();
   1.709 +
   1.710 +            // Allocate data space
   1.711 +            int32_t dataLoc = fRXPat->fDataSize;
   1.712 +            fRXPat->fDataSize += 4;
   1.713 +
   1.714 +            // Emit URX_LB_START
   1.715 +            int32_t op = URX_BUILD(URX_LB_START, dataLoc);
   1.716 +            fRXPat->fCompiledPat->addElement(op, *fStatus);
   1.717 +
   1.718 +            // Emit URX_LBN_CONT
   1.719 +            op = URX_BUILD(URX_LBN_CONT, dataLoc);
   1.720 +            fRXPat->fCompiledPat->addElement(op, *fStatus);
   1.721 +            fRXPat->fCompiledPat->addElement(0,  *fStatus);    // MinMatchLength.  To be filled later.
   1.722 +            fRXPat->fCompiledPat->addElement(0,  *fStatus);    // MaxMatchLength.  To be filled later.
   1.723 +            fRXPat->fCompiledPat->addElement(0,  *fStatus);    // Continue Loc.    To be filled later.
   1.724 +
   1.725 +            // Emit the NOP
   1.726 +            op = URX_BUILD(URX_NOP, 0);
   1.727 +            fRXPat->fCompiledPat->addElement(op, *fStatus);
   1.728 +            fRXPat->fCompiledPat->addElement(op, *fStatus);
   1.729 +
   1.730 +            // On the Parentheses stack, start a new frame and add the postions
   1.731 +            //   of the URX_LB_CONT and the NOP.
   1.732 +            fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
   1.733 +            fParenStack.push(lookBehindN, *fStatus);                      // Frame type
   1.734 +            fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP location
   1.735 +            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The 2nd   NOP location
   1.736 +
   1.737 +            // The final two instructions will be added when the ')' is encountered.
   1.738 +        }
   1.739 +        break;
   1.740 +
   1.741 +    case doConditionalExpr:
   1.742 +        // Conditionals such as (?(1)a:b)
   1.743 +    case doPerlInline:
   1.744 +        // Perl inline-condtionals.  (?{perl code}a|b) We're not perl, no way to do them.
   1.745 +        error(U_REGEX_UNIMPLEMENTED);
   1.746 +        break;
   1.747 +
   1.748 +
   1.749 +    case doCloseParen:
   1.750 +        handleCloseParen();
   1.751 +        if (fParenStack.size() <= 0) {
   1.752 +            //  Extra close paren, or missing open paren.
   1.753 +            error(U_REGEX_MISMATCHED_PAREN);
   1.754 +        }
   1.755 +        break;
   1.756 +
   1.757 +    case doNOP:
   1.758 +        break;
   1.759 +
   1.760 +
   1.761 +    case doBadOpenParenType:
   1.762 +    case doRuleError:
   1.763 +        error(U_REGEX_RULE_SYNTAX);
   1.764 +        break;
   1.765 +
   1.766 +
   1.767 +    case doMismatchedParenErr:
   1.768 +        error(U_REGEX_MISMATCHED_PAREN);
   1.769 +        break;
   1.770 +
   1.771 +    case doPlus:
   1.772 +        //  Normal '+'  compiles to
   1.773 +        //     1.   stuff to be repeated  (already built)
   1.774 +        //     2.   jmp-sav 1
   1.775 +        //     3.   ...
   1.776 +        //
   1.777 +        //  Or, if the item to be repeated can match a zero length string,
   1.778 +        //     1.   STO_INP_LOC  data-loc
   1.779 +        //     2.      body of stuff to be repeated
   1.780 +        //     3.   JMP_SAV_X    2
   1.781 +        //     4.   ...
   1.782 +
   1.783 +        //
   1.784 +        //  Or, if the item to be repeated is simple
   1.785 +        //     1.   Item to be repeated.
   1.786 +        //     2.   LOOP_SR_I    set number  (assuming repeated item is a set ref)
   1.787 +        //     3.   LOOP_C       stack location
   1.788 +        {
   1.789 +            int32_t  topLoc = blockTopLoc(FALSE);        // location of item #1
   1.790 +            int32_t  frameLoc;
   1.791 +
   1.792 +            // Check for simple constructs, which may get special optimized code.
   1.793 +            if (topLoc == fRXPat->fCompiledPat->size() - 1) {
   1.794 +                int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc);
   1.795 +
   1.796 +                if (URX_TYPE(repeatedOp) == URX_SETREF) {
   1.797 +                    // Emit optimized code for [char set]+
   1.798 +                    int32_t loopOpI = URX_BUILD(URX_LOOP_SR_I, URX_VAL(repeatedOp));
   1.799 +                    fRXPat->fCompiledPat->addElement(loopOpI, *fStatus);
   1.800 +                    frameLoc = fRXPat->fFrameSize;
   1.801 +                    fRXPat->fFrameSize++;
   1.802 +                    int32_t loopOpC = URX_BUILD(URX_LOOP_C, frameLoc);
   1.803 +                    fRXPat->fCompiledPat->addElement(loopOpC, *fStatus);
   1.804 +                    break;
   1.805 +                }
   1.806 +
   1.807 +                if (URX_TYPE(repeatedOp) == URX_DOTANY ||
   1.808 +                    URX_TYPE(repeatedOp) == URX_DOTANY_ALL ||
   1.809 +                    URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
   1.810 +                    // Emit Optimized code for .+ operations.
   1.811 +                    int32_t loopOpI = URX_BUILD(URX_LOOP_DOT_I, 0);
   1.812 +                    if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
   1.813 +                        // URX_LOOP_DOT_I operand is a flag indicating ". matches any" mode.
   1.814 +                        loopOpI |= 1;
   1.815 +                    }
   1.816 +                    if (fModeFlags & UREGEX_UNIX_LINES) {
   1.817 +                        loopOpI |= 2;
   1.818 +                    }
   1.819 +                    fRXPat->fCompiledPat->addElement(loopOpI, *fStatus);
   1.820 +                    frameLoc = fRXPat->fFrameSize;
   1.821 +                    fRXPat->fFrameSize++;
   1.822 +                    int32_t loopOpC = URX_BUILD(URX_LOOP_C, frameLoc);
   1.823 +                    fRXPat->fCompiledPat->addElement(loopOpC, *fStatus);
   1.824 +                    break;
   1.825 +                }
   1.826 +
   1.827 +            }
   1.828 +
   1.829 +            // General case.
   1.830 +
   1.831 +            // Check for minimum match length of zero, which requires
   1.832 +            //    extra loop-breaking code.
   1.833 +            if (minMatchLength(topLoc, fRXPat->fCompiledPat->size()-1) == 0) {
   1.834 +                // Zero length match is possible.
   1.835 +                // Emit the code sequence that can handle it.
   1.836 +                insertOp(topLoc);
   1.837 +                frameLoc =  fRXPat->fFrameSize;
   1.838 +                fRXPat->fFrameSize++;
   1.839 +
   1.840 +                int32_t op = URX_BUILD(URX_STO_INP_LOC, frameLoc);
   1.841 +                fRXPat->fCompiledPat->setElementAt(op, topLoc);
   1.842 +
   1.843 +                op = URX_BUILD(URX_JMP_SAV_X, topLoc+1);
   1.844 +                fRXPat->fCompiledPat->addElement(op, *fStatus);
   1.845 +            } else {
   1.846 +                // Simpler code when the repeated body must match something non-empty
   1.847 +                int32_t  jmpOp  = URX_BUILD(URX_JMP_SAV, topLoc);
   1.848 +                fRXPat->fCompiledPat->addElement(jmpOp, *fStatus);
   1.849 +            }
   1.850 +        }
   1.851 +        break;
   1.852 +
   1.853 +    case doNGPlus:
   1.854 +        //  Non-greedy '+?'  compiles to
   1.855 +        //     1.   stuff to be repeated  (already built)
   1.856 +        //     2.   state-save  1
   1.857 +        //     3.   ...
   1.858 +        {
   1.859 +            int32_t topLoc      = blockTopLoc(FALSE);
   1.860 +            int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, topLoc);
   1.861 +            fRXPat->fCompiledPat->addElement(saveStateOp, *fStatus);
   1.862 +        }
   1.863 +        break;
   1.864 +
   1.865 +
   1.866 +    case doOpt:
   1.867 +        // Normal (greedy) ? quantifier.
   1.868 +        //  Compiles to
   1.869 +        //     1. state save 3
   1.870 +        //     2.    body of optional block
   1.871 +        //     3. ...
   1.872 +        // Insert the state save into the compiled pattern, and we're done.
   1.873 +        {
   1.874 +            int32_t   saveStateLoc = blockTopLoc(TRUE);
   1.875 +            int32_t   saveStateOp  = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size());
   1.876 +            fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
   1.877 +        }
   1.878 +        break;
   1.879 +
   1.880 +    case doNGOpt:
   1.881 +        // Non-greedy ?? quantifier
   1.882 +        //   compiles to
   1.883 +        //    1.  jmp   4
   1.884 +        //    2.     body of optional block
   1.885 +        //    3   jmp   5
   1.886 +        //    4.  state save 2
   1.887 +        //    5    ...
   1.888 +        //  This code is less than ideal, with two jmps instead of one, because we can only
   1.889 +        //  insert one instruction at the top of the block being iterated.
   1.890 +        {
   1.891 +            int32_t  jmp1_loc = blockTopLoc(TRUE);
   1.892 +            int32_t  jmp2_loc = fRXPat->fCompiledPat->size();
   1.893 +
   1.894 +            int32_t  jmp1_op  = URX_BUILD(URX_JMP, jmp2_loc+1);
   1.895 +            fRXPat->fCompiledPat->setElementAt(jmp1_op, jmp1_loc);
   1.896 +
   1.897 +            int32_t  jmp2_op  = URX_BUILD(URX_JMP, jmp2_loc+2);
   1.898 +            fRXPat->fCompiledPat->addElement(jmp2_op, *fStatus);
   1.899 +
   1.900 +            int32_t  save_op  = URX_BUILD(URX_STATE_SAVE, jmp1_loc+1);
   1.901 +            fRXPat->fCompiledPat->addElement(save_op, *fStatus);
   1.902 +        }
   1.903 +        break;
   1.904 +
   1.905 +
   1.906 +    case doStar:
   1.907 +        // Normal (greedy) * quantifier.
   1.908 +        // Compiles to
   1.909 +        //       1.   STATE_SAVE   4
   1.910 +        //       2.      body of stuff being iterated over
   1.911 +        //       3.   JMP_SAV      2
   1.912 +        //       4.   ...
   1.913 +        //
   1.914 +        // Or, if the body is a simple [Set],
   1.915 +        //       1.   LOOP_SR_I    set number
   1.916 +        //       2.   LOOP_C       stack location
   1.917 +        //       ...
   1.918 +        //
   1.919 +        // Or if this is a .*
   1.920 +        //       1.   LOOP_DOT_I    (. matches all mode flag)
   1.921 +        //       2.   LOOP_C        stack location
   1.922 +        //
   1.923 +        // Or, if the body can match a zero-length string, to inhibit infinite loops,
   1.924 +        //       1.   STATE_SAVE   5
   1.925 +        //       2.   STO_INP_LOC  data-loc
   1.926 +        //       3.      body of stuff
   1.927 +        //       4.   JMP_SAV_X    2
   1.928 +        //       5.   ...
   1.929 +        {
   1.930 +            // location of item #1, the STATE_SAVE
   1.931 +            int32_t   topLoc = blockTopLoc(FALSE);
   1.932 +            int32_t   dataLoc = -1;
   1.933 +
   1.934 +            // Check for simple *, where the construct being repeated
   1.935 +            //   compiled to single opcode, and might be optimizable.
   1.936 +            if (topLoc == fRXPat->fCompiledPat->size() - 1) {
   1.937 +                int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc);
   1.938 +
   1.939 +                if (URX_TYPE(repeatedOp) == URX_SETREF) {
   1.940 +                    // Emit optimized code for a [char set]*
   1.941 +                    int32_t loopOpI = URX_BUILD(URX_LOOP_SR_I, URX_VAL(repeatedOp));
   1.942 +                    fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
   1.943 +                    dataLoc = fRXPat->fFrameSize;
   1.944 +                    fRXPat->fFrameSize++;
   1.945 +                    int32_t loopOpC = URX_BUILD(URX_LOOP_C, dataLoc);
   1.946 +                    fRXPat->fCompiledPat->addElement(loopOpC, *fStatus);
   1.947 +                    break;
   1.948 +                }
   1.949 +
   1.950 +                if (URX_TYPE(repeatedOp) == URX_DOTANY ||
   1.951 +                    URX_TYPE(repeatedOp) == URX_DOTANY_ALL ||
   1.952 +                    URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
   1.953 +                    // Emit Optimized code for .* operations.
   1.954 +                    int32_t loopOpI = URX_BUILD(URX_LOOP_DOT_I, 0);
   1.955 +                    if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
   1.956 +                        // URX_LOOP_DOT_I operand is a flag indicating . matches any mode.
   1.957 +                        loopOpI |= 1;
   1.958 +                    }
   1.959 +                    if ((fModeFlags & UREGEX_UNIX_LINES) != 0) {
   1.960 +                        loopOpI |= 2;
   1.961 +                    }
   1.962 +                    fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
   1.963 +                    dataLoc = fRXPat->fFrameSize;
   1.964 +                    fRXPat->fFrameSize++;
   1.965 +                    int32_t loopOpC = URX_BUILD(URX_LOOP_C, dataLoc);
   1.966 +                    fRXPat->fCompiledPat->addElement(loopOpC, *fStatus);
   1.967 +                    break;
   1.968 +                }
   1.969 +            }
   1.970 +
   1.971 +            // Emit general case code for this *
   1.972 +            // The optimizations did not apply.
   1.973 +
   1.974 +            int32_t   saveStateLoc = blockTopLoc(TRUE);
   1.975 +            int32_t   jmpOp        = URX_BUILD(URX_JMP_SAV, saveStateLoc+1);
   1.976 +
   1.977 +            // Check for minimum match length of zero, which requires
   1.978 +            //    extra loop-breaking code.
   1.979 +            if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-1) == 0) {
   1.980 +                insertOp(saveStateLoc);
   1.981 +                dataLoc =  fRXPat->fFrameSize;
   1.982 +                fRXPat->fFrameSize++;
   1.983 +
   1.984 +                int32_t op = URX_BUILD(URX_STO_INP_LOC, dataLoc);
   1.985 +                fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1);
   1.986 +                jmpOp      = URX_BUILD(URX_JMP_SAV_X, saveStateLoc+2);
   1.987 +            }
   1.988 +
   1.989 +            // Locate the position in the compiled pattern where the match will continue
   1.990 +            //   after completing the *.   (4 or 5 in the comment above)
   1.991 +            int32_t continueLoc = fRXPat->fCompiledPat->size()+1;
   1.992 +
   1.993 +            // Put together the save state op store it into the compiled code.
   1.994 +            int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, continueLoc);
   1.995 +            fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
   1.996 +
   1.997 +            // Append the URX_JMP_SAV or URX_JMPX operation to the compiled pattern.
   1.998 +            fRXPat->fCompiledPat->addElement(jmpOp, *fStatus);
   1.999 +        }
  1.1000 +        break;
  1.1001 +
  1.1002 +    case doNGStar:
  1.1003 +        // Non-greedy *? quantifier
  1.1004 +        // compiles to
  1.1005 +        //     1.   JMP    3
  1.1006 +        //     2.      body of stuff being iterated over
  1.1007 +        //     3.   STATE_SAVE  2
  1.1008 +        //     4    ...
  1.1009 +        {
  1.1010 +            int32_t     jmpLoc  = blockTopLoc(TRUE);                   // loc  1.
  1.1011 +            int32_t     saveLoc = fRXPat->fCompiledPat->size();        // loc  3.
  1.1012 +            int32_t     jmpOp   = URX_BUILD(URX_JMP, saveLoc);
  1.1013 +            int32_t     stateSaveOp = URX_BUILD(URX_STATE_SAVE, jmpLoc+1);
  1.1014 +            fRXPat->fCompiledPat->setElementAt(jmpOp, jmpLoc);
  1.1015 +            fRXPat->fCompiledPat->addElement(stateSaveOp, *fStatus);
  1.1016 +        }
  1.1017 +        break;
  1.1018 +
  1.1019 +
  1.1020 +    case doIntervalInit:
  1.1021 +        // The '{' opening an interval quantifier was just scanned.
  1.1022 +        // Init the counter varaiables that will accumulate the values as the digits
  1.1023 +        //    are scanned.
  1.1024 +        fIntervalLow = 0;
  1.1025 +        fIntervalUpper = -1;
  1.1026 +        break;
  1.1027 +
  1.1028 +    case doIntevalLowerDigit:
  1.1029 +        // Scanned a digit from the lower value of an {lower,upper} interval
  1.1030 +        {
  1.1031 +            int32_t digitValue = u_charDigitValue(fC.fChar);
  1.1032 +            U_ASSERT(digitValue >= 0);
  1.1033 +            fIntervalLow = fIntervalLow*10 + digitValue;
  1.1034 +            if (fIntervalLow < 0) {
  1.1035 +                error(U_REGEX_NUMBER_TOO_BIG);
  1.1036 +            }
  1.1037 +        }
  1.1038 +        break;
  1.1039 +
  1.1040 +    case doIntervalUpperDigit:
  1.1041 +        // Scanned a digit from the upper value of an {lower,upper} interval
  1.1042 +        {
  1.1043 +            if (fIntervalUpper < 0) {
  1.1044 +                fIntervalUpper = 0;
  1.1045 +            }
  1.1046 +            int32_t digitValue = u_charDigitValue(fC.fChar);
  1.1047 +            U_ASSERT(digitValue >= 0);
  1.1048 +            fIntervalUpper = fIntervalUpper*10 + digitValue;
  1.1049 +            if (fIntervalUpper < 0) {
  1.1050 +                error(U_REGEX_NUMBER_TOO_BIG);
  1.1051 +            }
  1.1052 +        }
  1.1053 +        break;
  1.1054 +
  1.1055 +    case doIntervalSame:
  1.1056 +        // Scanned a single value interval like {27}.  Upper = Lower.
  1.1057 +        fIntervalUpper = fIntervalLow;
  1.1058 +        break;
  1.1059 +
  1.1060 +    case doInterval:
  1.1061 +        // Finished scanning a normal {lower,upper} interval.  Generate the code for it.
  1.1062 +        if (compileInlineInterval() == FALSE) {
  1.1063 +            compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
  1.1064 +        }
  1.1065 +        break;
  1.1066 +
  1.1067 +    case doPossessiveInterval:
  1.1068 +        // Finished scanning a Possessive {lower,upper}+ interval.  Generate the code for it.
  1.1069 +        {
  1.1070 +            // Remember the loc for the top of the block being looped over.
  1.1071 +            //   (Can not reserve a slot in the compiled pattern at this time, because
  1.1072 +            //    compileInterval needs to reserve also, and blockTopLoc can only reserve
  1.1073 +            //    once per block.)
  1.1074 +            int32_t topLoc = blockTopLoc(FALSE);
  1.1075 +
  1.1076 +            // Produce normal looping code.
  1.1077 +            compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
  1.1078 +
  1.1079 +            // Surround the just-emitted normal looping code with a STO_SP ... LD_SP
  1.1080 +            //  just as if the loop was inclosed in atomic parentheses.
  1.1081 +
  1.1082 +            // First the STO_SP before the start of the loop
  1.1083 +            insertOp(topLoc);
  1.1084 +            int32_t  varLoc    = fRXPat->fDataSize;    // Reserve a data location for saving the
  1.1085 +            fRXPat->fDataSize += 1;                    //  state stack ptr.
  1.1086 +            int32_t  op        = URX_BUILD(URX_STO_SP, varLoc);
  1.1087 +            fRXPat->fCompiledPat->setElementAt(op, topLoc);
  1.1088 +
  1.1089 +            int32_t loopOp = (int32_t)fRXPat->fCompiledPat->popi();
  1.1090 +            U_ASSERT(URX_TYPE(loopOp) == URX_CTR_LOOP && URX_VAL(loopOp) == topLoc);
  1.1091 +            loopOp++;     // point LoopOp after the just-inserted STO_SP
  1.1092 +            fRXPat->fCompiledPat->push(loopOp, *fStatus);
  1.1093 +
  1.1094 +            // Then the LD_SP after the end of the loop
  1.1095 +            op = URX_BUILD(URX_LD_SP, varLoc);
  1.1096 +            fRXPat->fCompiledPat->addElement(op, *fStatus);
  1.1097 +        }
  1.1098 +
  1.1099 +        break;
  1.1100 +
  1.1101 +    case doNGInterval:
  1.1102 +        // Finished scanning a non-greedy {lower,upper}? interval.  Generate the code for it.
  1.1103 +        compileInterval(URX_CTR_INIT_NG, URX_CTR_LOOP_NG);
  1.1104 +        break;
  1.1105 +
  1.1106 +    case doIntervalError:
  1.1107 +        error(U_REGEX_BAD_INTERVAL);
  1.1108 +        break;
  1.1109 +
  1.1110 +    case doLiteralChar:
  1.1111 +        // We've just scanned a "normal" character from the pattern,
  1.1112 +        literalChar(fC.fChar);
  1.1113 +        break;
  1.1114 +
  1.1115 +
  1.1116 +    case doEscapedLiteralChar:
  1.1117 +        // We've just scanned an backslashed escaped character with  no
  1.1118 +        //   special meaning.  It represents itself.
  1.1119 +        if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 &&
  1.1120 +            ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) ||     // in [A-Z]
  1.1121 +            (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) {   // in [a-z]
  1.1122 +               error(U_REGEX_BAD_ESCAPE_SEQUENCE);
  1.1123 +             }
  1.1124 +        literalChar(fC.fChar);
  1.1125 +        break;
  1.1126 +
  1.1127 +
  1.1128 +    case doDotAny:
  1.1129 +        // scanned a ".",  match any single character.
  1.1130 +        {
  1.1131 +            fixLiterals(FALSE);
  1.1132 +            int32_t   op;
  1.1133 +            if (fModeFlags & UREGEX_DOTALL) {
  1.1134 +                op = URX_BUILD(URX_DOTANY_ALL, 0);
  1.1135 +            } else if (fModeFlags & UREGEX_UNIX_LINES) {
  1.1136 +                op = URX_BUILD(URX_DOTANY_UNIX, 0);
  1.1137 +            } else {
  1.1138 +                op = URX_BUILD(URX_DOTANY, 0);
  1.1139 +            }
  1.1140 +            fRXPat->fCompiledPat->addElement(op, *fStatus);
  1.1141 +        }
  1.1142 +        break;
  1.1143 +
  1.1144 +    case doCaret:
  1.1145 +        {
  1.1146 +            fixLiterals(FALSE);
  1.1147 +            int32_t op = 0;
  1.1148 +            if (       (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
  1.1149 +                op = URX_CARET;
  1.1150 +            } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
  1.1151 +                op = URX_CARET_M;
  1.1152 +            } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
  1.1153 +                op = URX_CARET;   // Only testing true start of input. 
  1.1154 +            } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
  1.1155 +                op = URX_CARET_M_UNIX;
  1.1156 +            }
  1.1157 +            fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus);
  1.1158 +        }
  1.1159 +        break;
  1.1160 +
  1.1161 +    case doDollar:
  1.1162 +        {
  1.1163 +            fixLiterals(FALSE);
  1.1164 +            int32_t op = 0;
  1.1165 +            if (       (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
  1.1166 +                op = URX_DOLLAR;
  1.1167 +            } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
  1.1168 +                op = URX_DOLLAR_M;
  1.1169 +            } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
  1.1170 +                op = URX_DOLLAR_D;
  1.1171 +            } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
  1.1172 +                op = URX_DOLLAR_MD;
  1.1173 +            }
  1.1174 +            fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus);
  1.1175 +        }
  1.1176 +        break;
  1.1177 +
  1.1178 +    case doBackslashA:
  1.1179 +        fixLiterals(FALSE);
  1.1180 +        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_CARET, 0), *fStatus);
  1.1181 +        break;
  1.1182 +
  1.1183 +    case doBackslashB:
  1.1184 +        {
  1.1185 +            #if  UCONFIG_NO_BREAK_ITERATION==1
  1.1186 +            if (fModeFlags & UREGEX_UWORD) {
  1.1187 +                error(U_UNSUPPORTED_ERROR);
  1.1188 +            }
  1.1189 +            #endif
  1.1190 +            fixLiterals(FALSE);
  1.1191 +            int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
  1.1192 +            fRXPat->fCompiledPat->addElement(URX_BUILD(op, 1), *fStatus);
  1.1193 +        }
  1.1194 +        break;
  1.1195 +
  1.1196 +    case doBackslashb:
  1.1197 +        {
  1.1198 +            #if  UCONFIG_NO_BREAK_ITERATION==1
  1.1199 +            if (fModeFlags & UREGEX_UWORD) {
  1.1200 +                error(U_UNSUPPORTED_ERROR);
  1.1201 +            }
  1.1202 +            #endif
  1.1203 +            fixLiterals(FALSE);
  1.1204 +            int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
  1.1205 +            fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus);
  1.1206 +        }
  1.1207 +        break;
  1.1208 +
  1.1209 +    case doBackslashD:
  1.1210 +        fixLiterals(FALSE);
  1.1211 +        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_D, 1), *fStatus);
  1.1212 +        break;
  1.1213 +
  1.1214 +    case doBackslashd:
  1.1215 +        fixLiterals(FALSE);
  1.1216 +        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_D, 0), *fStatus);
  1.1217 +        break;
  1.1218 +
  1.1219 +    case doBackslashG:
  1.1220 +        fixLiterals(FALSE);
  1.1221 +        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_G, 0), *fStatus);
  1.1222 +        break;
  1.1223 +
  1.1224 +    case doBackslashS:
  1.1225 +        fixLiterals(FALSE);
  1.1226 +        fRXPat->fCompiledPat->addElement(
  1.1227 +            URX_BUILD(URX_STAT_SETREF_N, URX_ISSPACE_SET), *fStatus);
  1.1228 +        break;
  1.1229 +
  1.1230 +    case doBackslashs:
  1.1231 +        fixLiterals(FALSE);
  1.1232 +        fRXPat->fCompiledPat->addElement(
  1.1233 +            URX_BUILD(URX_STATIC_SETREF, URX_ISSPACE_SET), *fStatus);
  1.1234 +        break;
  1.1235 +
  1.1236 +    case doBackslashW:
  1.1237 +        fixLiterals(FALSE);
  1.1238 +        fRXPat->fCompiledPat->addElement(
  1.1239 +            URX_BUILD(URX_STAT_SETREF_N, URX_ISWORD_SET), *fStatus);
  1.1240 +        break;
  1.1241 +
  1.1242 +    case doBackslashw:
  1.1243 +        fixLiterals(FALSE);
  1.1244 +        fRXPat->fCompiledPat->addElement(
  1.1245 +            URX_BUILD(URX_STATIC_SETREF, URX_ISWORD_SET), *fStatus);
  1.1246 +        break;
  1.1247 +
  1.1248 +    case doBackslashX:
  1.1249 +        fixLiterals(FALSE);
  1.1250 +        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_X, 0), *fStatus);
  1.1251 +        break;
  1.1252 +
  1.1253 +
  1.1254 +    case doBackslashZ:
  1.1255 +        fixLiterals(FALSE);
  1.1256 +        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_DOLLAR, 0), *fStatus);
  1.1257 +        break;
  1.1258 +
  1.1259 +    case doBackslashz:
  1.1260 +        fixLiterals(FALSE);
  1.1261 +        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_Z, 0), *fStatus);
  1.1262 +        break;
  1.1263 +
  1.1264 +    case doEscapeError:
  1.1265 +        error(U_REGEX_BAD_ESCAPE_SEQUENCE);
  1.1266 +        break;
  1.1267 +
  1.1268 +    case doExit:
  1.1269 +        fixLiterals(FALSE);
  1.1270 +        returnVal = FALSE;
  1.1271 +        break;
  1.1272 +
  1.1273 +    case doProperty:
  1.1274 +        {
  1.1275 +            fixLiterals(FALSE);
  1.1276 +            UnicodeSet *theSet = scanProp();
  1.1277 +            compileSet(theSet);
  1.1278 +        }
  1.1279 +        break;
  1.1280 +
  1.1281 +    case doNamedChar:
  1.1282 +        {
  1.1283 +            UChar32 c = scanNamedChar();
  1.1284 +            literalChar(c);
  1.1285 +        }
  1.1286 +        break;
  1.1287 +        
  1.1288 +
  1.1289 +    case doBackRef:
  1.1290 +        // BackReference.  Somewhat unusual in that the front-end can not completely parse
  1.1291 +        //                 the regular expression, because the number of digits to be consumed
  1.1292 +        //                 depends on the number of capture groups that have been defined.  So
  1.1293 +        //                 we have to do it here instead.
  1.1294 +        {
  1.1295 +            int32_t  numCaptureGroups = fRXPat->fGroupMap->size();
  1.1296 +            int32_t  groupNum = 0;
  1.1297 +            UChar32  c        = fC.fChar;
  1.1298 +
  1.1299 +            for (;;) {
  1.1300 +                // Loop once per digit, for max allowed number of digits in a back reference.
  1.1301 +                int32_t digit = u_charDigitValue(c);
  1.1302 +                groupNum = groupNum * 10 + digit;
  1.1303 +                if (groupNum >= numCaptureGroups) {
  1.1304 +                    break;
  1.1305 +                }
  1.1306 +                c = peekCharLL();
  1.1307 +                if (RegexStaticSets::gStaticSets->fRuleDigitsAlias->contains(c) == FALSE) {
  1.1308 +                    break;
  1.1309 +                }
  1.1310 +                nextCharLL();
  1.1311 +            }
  1.1312 +
  1.1313 +            // Scan of the back reference in the source regexp is complete.  Now generate
  1.1314 +            //  the compiled code for it.
  1.1315 +            // Because capture groups can be forward-referenced by back-references,
  1.1316 +            //  we fill the operand with the capture group number.  At the end
  1.1317 +            //  of compilation, it will be changed to the variable's location.
  1.1318 +            U_ASSERT(groupNum > 0);  // Shouldn't happen.  '\0' begins an octal escape sequence,
  1.1319 +                                     //    and shouldn't enter this code path at all.
  1.1320 +            fixLiterals(FALSE);
  1.1321 +            int32_t  op;
  1.1322 +            if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
  1.1323 +                op = URX_BUILD(URX_BACKREF_I, groupNum);
  1.1324 +            } else {
  1.1325 +                op = URX_BUILD(URX_BACKREF, groupNum);
  1.1326 +            }
  1.1327 +            fRXPat->fCompiledPat->addElement(op, *fStatus);
  1.1328 +        }
  1.1329 +        break;
  1.1330 +
  1.1331 +
  1.1332 +    case doPossessivePlus:
  1.1333 +        // Possessive ++ quantifier.
  1.1334 +        // Compiles to
  1.1335 +        //       1.   STO_SP
  1.1336 +        //       2.      body of stuff being iterated over
  1.1337 +        //       3.   STATE_SAVE 5
  1.1338 +        //       4.   JMP        2
  1.1339 +        //       5.   LD_SP
  1.1340 +        //       6.   ...
  1.1341 +        //
  1.1342 +        //  Note:  TODO:  This is pretty inefficient.  A mass of saved state is built up
  1.1343 +        //                then unconditionally discarded.  Perhaps introduce a new opcode.  Ticket 6056
  1.1344 +        //
  1.1345 +        {
  1.1346 +            // Emit the STO_SP
  1.1347 +            int32_t   topLoc = blockTopLoc(TRUE);
  1.1348 +            int32_t   stoLoc = fRXPat->fDataSize;
  1.1349 +            fRXPat->fDataSize++;       // Reserve the data location for storing save stack ptr.
  1.1350 +            int32_t   op     = URX_BUILD(URX_STO_SP, stoLoc);
  1.1351 +            fRXPat->fCompiledPat->setElementAt(op, topLoc);
  1.1352 +
  1.1353 +            // Emit the STATE_SAVE
  1.1354 +            op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+2);
  1.1355 +            fRXPat->fCompiledPat->addElement(op, *fStatus);
  1.1356 +
  1.1357 +            // Emit the JMP
  1.1358 +            op = URX_BUILD(URX_JMP, topLoc+1);
  1.1359 +            fRXPat->fCompiledPat->addElement(op, *fStatus);
  1.1360 +
  1.1361 +            // Emit the LD_SP
  1.1362 +            op = URX_BUILD(URX_LD_SP, stoLoc);
  1.1363 +            fRXPat->fCompiledPat->addElement(op, *fStatus);
  1.1364 +        }
  1.1365 +        break;
  1.1366 +
  1.1367 +    case doPossessiveStar:
  1.1368 +        // Possessive *+ quantifier.
  1.1369 +        // Compiles to
  1.1370 +        //       1.   STO_SP       loc
  1.1371 +        //       2.   STATE_SAVE   5
  1.1372 +        //       3.      body of stuff being iterated over
  1.1373 +        //       4.   JMP          2
  1.1374 +        //       5.   LD_SP        loc
  1.1375 +        //       6    ...
  1.1376 +        // TODO:  do something to cut back the state stack each time through the loop.
  1.1377 +        {
  1.1378 +            // Reserve two slots at the top of the block.
  1.1379 +            int32_t   topLoc = blockTopLoc(TRUE);
  1.1380 +            insertOp(topLoc);
  1.1381 +
  1.1382 +            // emit   STO_SP     loc
  1.1383 +            int32_t   stoLoc = fRXPat->fDataSize;
  1.1384 +            fRXPat->fDataSize++;       // Reserve the data location for storing save stack ptr.
  1.1385 +            int32_t   op     = URX_BUILD(URX_STO_SP, stoLoc);
  1.1386 +            fRXPat->fCompiledPat->setElementAt(op, topLoc);
  1.1387 +
  1.1388 +            // Emit the SAVE_STATE   5
  1.1389 +            int32_t L7 = fRXPat->fCompiledPat->size()+1;
  1.1390 +            op = URX_BUILD(URX_STATE_SAVE, L7);
  1.1391 +            fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
  1.1392 +
  1.1393 +            // Append the JMP operation.
  1.1394 +            op = URX_BUILD(URX_JMP, topLoc+1);
  1.1395 +            fRXPat->fCompiledPat->addElement(op, *fStatus);
  1.1396 +
  1.1397 +            // Emit the LD_SP       loc
  1.1398 +            op = URX_BUILD(URX_LD_SP, stoLoc);
  1.1399 +            fRXPat->fCompiledPat->addElement(op, *fStatus);
  1.1400 +        }
  1.1401 +        break;
  1.1402 +
  1.1403 +    case doPossessiveOpt:
  1.1404 +        // Possessive  ?+ quantifier.
  1.1405 +        //  Compiles to
  1.1406 +        //     1. STO_SP      loc
  1.1407 +        //     2. SAVE_STATE  5
  1.1408 +        //     3.    body of optional block
  1.1409 +        //     4. LD_SP       loc
  1.1410 +        //     5. ...
  1.1411 +        //
  1.1412 +        {
  1.1413 +            // Reserve two slots at the top of the block.
  1.1414 +            int32_t   topLoc = blockTopLoc(TRUE);
  1.1415 +            insertOp(topLoc);
  1.1416 +
  1.1417 +            // Emit the STO_SP
  1.1418 +            int32_t   stoLoc = fRXPat->fDataSize;
  1.1419 +            fRXPat->fDataSize++;       // Reserve the data location for storing save stack ptr.
  1.1420 +            int32_t   op     = URX_BUILD(URX_STO_SP, stoLoc);
  1.1421 +            fRXPat->fCompiledPat->setElementAt(op, topLoc);
  1.1422 +
  1.1423 +            // Emit the SAVE_STATE
  1.1424 +            int32_t   continueLoc = fRXPat->fCompiledPat->size()+1;
  1.1425 +            op = URX_BUILD(URX_STATE_SAVE, continueLoc);
  1.1426 +            fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
  1.1427 +
  1.1428 +            // Emit the LD_SP
  1.1429 +            op = URX_BUILD(URX_LD_SP, stoLoc);
  1.1430 +            fRXPat->fCompiledPat->addElement(op, *fStatus);
  1.1431 +        }
  1.1432 +        break;
  1.1433 +
  1.1434 +
  1.1435 +    case doBeginMatchMode:
  1.1436 +        fNewModeFlags = fModeFlags;
  1.1437 +        fSetModeFlag  = TRUE;
  1.1438 +        break;
  1.1439 +
  1.1440 +    case doMatchMode:   //  (?i)    and similar
  1.1441 +        {
  1.1442 +            int32_t  bit = 0;
  1.1443 +            switch (fC.fChar) {
  1.1444 +            case 0x69: /* 'i' */   bit = UREGEX_CASE_INSENSITIVE; break;
  1.1445 +            case 0x64: /* 'd' */   bit = UREGEX_UNIX_LINES;       break;
  1.1446 +            case 0x6d: /* 'm' */   bit = UREGEX_MULTILINE;        break;
  1.1447 +            case 0x73: /* 's' */   bit = UREGEX_DOTALL;           break;
  1.1448 +            case 0x75: /* 'u' */   bit = 0; /* Unicode casing */  break;
  1.1449 +            case 0x77: /* 'w' */   bit = UREGEX_UWORD;            break;
  1.1450 +            case 0x78: /* 'x' */   bit = UREGEX_COMMENTS;         break;
  1.1451 +            case 0x2d: /* '-' */   fSetModeFlag = FALSE;          break;
  1.1452 +            default:
  1.1453 +                U_ASSERT(FALSE);   // Should never happen.  Other chars are filtered out
  1.1454 +                                   // by the scanner.
  1.1455 +            }
  1.1456 +            if (fSetModeFlag) {
  1.1457 +                fNewModeFlags |= bit;
  1.1458 +            } else {
  1.1459 +                fNewModeFlags &= ~bit;
  1.1460 +            }
  1.1461 +        }
  1.1462 +        break;
  1.1463 +
  1.1464 +    case doSetMatchMode:
  1.1465 +        // Emit code to match any pending literals, using the not-yet changed match mode.
  1.1466 +        fixLiterals();
  1.1467 +
  1.1468 +        // We've got a (?i) or similar.  The match mode is being changed, but
  1.1469 +        //   the change is not scoped to a parenthesized block.
  1.1470 +        U_ASSERT(fNewModeFlags < 0);
  1.1471 +        fModeFlags = fNewModeFlags;
  1.1472 +
  1.1473 +        break;
  1.1474 +
  1.1475 +
  1.1476 +    case doMatchModeParen:
  1.1477 +        // We've got a (?i: or similar.  Begin a parenthesized block, save old
  1.1478 +        //   mode flags so they can be restored at the close of the block.
  1.1479 +        //
  1.1480 +        //   Compile to a
  1.1481 +        //      - NOP, which later may be replaced by a save-state if the
  1.1482 +        //         parenthesized group gets a * quantifier, followed by
  1.1483 +        //      - NOP, which may later be replaced by a save-state if there
  1.1484 +        //             is an '|' alternation within the parens.
  1.1485 +        {
  1.1486 +            fixLiterals(FALSE);
  1.1487 +            fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
  1.1488 +            fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
  1.1489 +
  1.1490 +            // On the Parentheses stack, start a new frame and add the postions
  1.1491 +            //   of the two NOPs (a normal non-capturing () frame, except for the
  1.1492 +            //   saving of the orignal mode flags.)
  1.1493 +            fParenStack.push(fModeFlags, *fStatus);
  1.1494 +            fParenStack.push(flags, *fStatus);                            // Frame Marker
  1.1495 +            fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP
  1.1496 +            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP
  1.1497 +
  1.1498 +            // Set the current mode flags to the new values.
  1.1499 +            U_ASSERT(fNewModeFlags < 0);
  1.1500 +            fModeFlags = fNewModeFlags;
  1.1501 +        }
  1.1502 +        break;
  1.1503 +
  1.1504 +    case doBadModeFlag:
  1.1505 +        error(U_REGEX_INVALID_FLAG);
  1.1506 +        break;
  1.1507 +
  1.1508 +    case doSuppressComments:
  1.1509 +        // We have just scanned a '(?'.  We now need to prevent the character scanner from
  1.1510 +        // treating a '#' as a to-the-end-of-line comment.
  1.1511 +        //   (This Perl compatibility just gets uglier and uglier to do...)
  1.1512 +        fEOLComments = FALSE;
  1.1513 +        break;
  1.1514 +
  1.1515 +
  1.1516 +    case doSetAddAmp:
  1.1517 +        {
  1.1518 +          UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
  1.1519 +          set->add(chAmp);
  1.1520 +        }
  1.1521 +        break;
  1.1522 +
  1.1523 +    case doSetAddDash:
  1.1524 +        {
  1.1525 +          UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
  1.1526 +          set->add(chDash);
  1.1527 +        }
  1.1528 +        break;
  1.1529 +
  1.1530 +     case doSetBackslash_s:
  1.1531 +        {
  1.1532 +         UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
  1.1533 +         set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]);
  1.1534 +         break;
  1.1535 +        }
  1.1536 +
  1.1537 +     case doSetBackslash_S:
  1.1538 +        {
  1.1539 +            UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
  1.1540 +            UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]);
  1.1541 +            SSet.complement();
  1.1542 +            set->addAll(SSet);
  1.1543 +            break;
  1.1544 +        }
  1.1545 +
  1.1546 +    case doSetBackslash_d:
  1.1547 +        {
  1.1548 +            UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
  1.1549 +            // TODO - make a static set, ticket 6058.
  1.1550 +            addCategory(set, U_GC_ND_MASK, *fStatus);
  1.1551 +            break;
  1.1552 +        }
  1.1553 +
  1.1554 +    case doSetBackslash_D:
  1.1555 +        {
  1.1556 +            UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
  1.1557 +            UnicodeSet digits;
  1.1558 +            // TODO - make a static set, ticket 6058.
  1.1559 +            digits.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
  1.1560 +            digits.complement();
  1.1561 +            set->addAll(digits);
  1.1562 +            break;
  1.1563 +        }
  1.1564 +
  1.1565 +    case doSetBackslash_w:
  1.1566 +        {
  1.1567 +            UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
  1.1568 +            set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]);
  1.1569 +            break;
  1.1570 +        }
  1.1571 +
  1.1572 +    case doSetBackslash_W:
  1.1573 +        {
  1.1574 +            UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
  1.1575 +            UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]);
  1.1576 +            SSet.complement();
  1.1577 +            set->addAll(SSet);
  1.1578 +            break;
  1.1579 +        }
  1.1580 +
  1.1581 +    case doSetBegin:
  1.1582 +        fixLiterals(FALSE);
  1.1583 +        fSetStack.push(new UnicodeSet(), *fStatus);
  1.1584 +        fSetOpStack.push(setStart, *fStatus);
  1.1585 +        if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
  1.1586 +            fSetOpStack.push(setCaseClose, *fStatus);
  1.1587 +        }
  1.1588 +        break;
  1.1589 +
  1.1590 +    case doSetBeginDifference1:
  1.1591 +        //  We have scanned something like [[abc]-[
  1.1592 +        //  Set up a new UnicodeSet for the set beginning with the just-scanned '['
  1.1593 +        //  Push a Difference operator, which will cause the new set to be subtracted from what
  1.1594 +        //    went before once it is created.
  1.1595 +        setPushOp(setDifference1);
  1.1596 +        fSetOpStack.push(setStart, *fStatus);
  1.1597 +        if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
  1.1598 +            fSetOpStack.push(setCaseClose, *fStatus);
  1.1599 +        }
  1.1600 +        break;
  1.1601 +
  1.1602 +    case doSetBeginIntersection1:
  1.1603 +        //  We have scanned something like  [[abc]&[
  1.1604 +        //   Need both the '&' operator and the open '[' operator.
  1.1605 +        setPushOp(setIntersection1);
  1.1606 +        fSetOpStack.push(setStart, *fStatus);
  1.1607 +        if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
  1.1608 +            fSetOpStack.push(setCaseClose, *fStatus);
  1.1609 +        }
  1.1610 +        break;
  1.1611 +
  1.1612 +    case doSetBeginUnion:
  1.1613 +        //  We have scanned something like  [[abc][
  1.1614 +        //     Need to handle the union operation explicitly [[abc] | [
  1.1615 +        setPushOp(setUnion);
  1.1616 +        fSetOpStack.push(setStart, *fStatus);
  1.1617 +        if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
  1.1618 +            fSetOpStack.push(setCaseClose, *fStatus);
  1.1619 +        }
  1.1620 +        break;
  1.1621 +
  1.1622 +    case doSetDifference2:
  1.1623 +        // We have scanned something like [abc--
  1.1624 +        //   Consider this to unambiguously be a set difference operator.
  1.1625 +        setPushOp(setDifference2);
  1.1626 +        break;
  1.1627 +
  1.1628 +    case doSetEnd:
  1.1629 +        // Have encountered the ']' that closes a set.
  1.1630 +        //    Force the evaluation of any pending operations within this set,
  1.1631 +        //    leave the completed set on the top of the set stack.
  1.1632 +        setEval(setEnd);
  1.1633 +        U_ASSERT(fSetOpStack.peeki()==setStart);
  1.1634 +        fSetOpStack.popi();
  1.1635 +        break;
  1.1636 +
  1.1637 +    case doSetFinish:
  1.1638 +        {
  1.1639 +        // Finished a complete set expression, including all nested sets.
  1.1640 +        //   The close bracket has already triggered clearing out pending set operators,
  1.1641 +        //    the operator stack should be empty and the operand stack should have just
  1.1642 +        //    one entry, the result set.
  1.1643 +        U_ASSERT(fSetOpStack.empty());
  1.1644 +        UnicodeSet *theSet = (UnicodeSet *)fSetStack.pop();
  1.1645 +        U_ASSERT(fSetStack.empty());
  1.1646 +        compileSet(theSet);
  1.1647 +        break;
  1.1648 +        }
  1.1649 +        
  1.1650 +    case doSetIntersection2:
  1.1651 +        // Have scanned something like [abc&&
  1.1652 +        setPushOp(setIntersection2);
  1.1653 +        break;
  1.1654 +
  1.1655 +    case doSetLiteral:
  1.1656 +        // Union the just-scanned literal character into the set being built.
  1.1657 +        //    This operation is the highest precedence set operation, so we can always do
  1.1658 +        //    it immediately, without waiting to see what follows.  It is necessary to perform
  1.1659 +        //    any pending '-' or '&' operation first, because these have the same precedence
  1.1660 +        //    as union-ing in a literal' 
  1.1661 +        {
  1.1662 +            setEval(setUnion);
  1.1663 +            UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
  1.1664 +            s->add(fC.fChar);
  1.1665 +            fLastSetLiteral = fC.fChar;
  1.1666 +            break;
  1.1667 +        }
  1.1668 +
  1.1669 +    case doSetLiteralEscaped:
  1.1670 +        // A back-slash escaped literal character was encountered.
  1.1671 +        // Processing is the same as with setLiteral, above, with the addition of
  1.1672 +        //  the optional check for errors on escaped ASCII letters.
  1.1673 +        {
  1.1674 +            if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 &&
  1.1675 +                ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) ||     // in [A-Z]
  1.1676 +                 (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) {   // in [a-z]
  1.1677 +                error(U_REGEX_BAD_ESCAPE_SEQUENCE);
  1.1678 +            }
  1.1679 +            setEval(setUnion);
  1.1680 +            UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
  1.1681 +            s->add(fC.fChar);
  1.1682 +            fLastSetLiteral = fC.fChar;
  1.1683 +            break;
  1.1684 +        }
  1.1685 +
  1.1686 +        case doSetNamedChar:
  1.1687 +        // Scanning a \N{UNICODE CHARACTER NAME}
  1.1688 +        //  Aside from the source of the character, the processing is identical to doSetLiteral,
  1.1689 +        //    above.
  1.1690 +        {
  1.1691 +            UChar32  c = scanNamedChar();
  1.1692 +            setEval(setUnion);
  1.1693 +            UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
  1.1694 +            s->add(c);
  1.1695 +            fLastSetLiteral = c;
  1.1696 +            break;
  1.1697 +        }
  1.1698 +
  1.1699 +    case doSetNamedRange:
  1.1700 +        // We have scanned literal-\N{CHAR NAME}.  Add the range to the set.
  1.1701 +        // The left character is already in the set, and is saved in fLastSetLiteral.
  1.1702 +        // The right side needs to be picked up, the scan is at the 'N'.
  1.1703 +        // Lower Limit > Upper limit being an error matches both Java
  1.1704 +        //        and ICU UnicodeSet behavior.
  1.1705 +        {
  1.1706 +            UChar32  c = scanNamedChar();
  1.1707 +            if (U_SUCCESS(*fStatus) && fLastSetLiteral > c) {
  1.1708 +                error(U_REGEX_INVALID_RANGE);
  1.1709 +            }
  1.1710 +            UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
  1.1711 +            s->add(fLastSetLiteral, c);
  1.1712 +            fLastSetLiteral = c;
  1.1713 +            break;
  1.1714 +        }
  1.1715 +
  1.1716 +
  1.1717 +    case  doSetNegate:
  1.1718 +        // Scanned a '^' at the start of a set.
  1.1719 +        // Push the negation operator onto the set op stack.
  1.1720 +        // A twist for case-insensitive matching:
  1.1721 +        //   the case closure operation must happen _before_ negation.
  1.1722 +        //   But the case closure operation will already be on the stack if it's required.
  1.1723 +        //   This requires checking for case closure, and swapping the stack order
  1.1724 +        //    if it is present.
  1.1725 +        {
  1.1726 +            int32_t  tosOp = fSetOpStack.peeki();
  1.1727 +            if (tosOp == setCaseClose) {
  1.1728 +                fSetOpStack.popi();
  1.1729 +                fSetOpStack.push(setNegation, *fStatus);
  1.1730 +                fSetOpStack.push(setCaseClose, *fStatus);
  1.1731 +            } else {
  1.1732 +                fSetOpStack.push(setNegation, *fStatus);
  1.1733 +            }
  1.1734 +        }
  1.1735 +        break;
  1.1736 +
  1.1737 +    case doSetNoCloseError:
  1.1738 +        error(U_REGEX_MISSING_CLOSE_BRACKET);
  1.1739 +        break;
  1.1740 +
  1.1741 +    case doSetOpError:
  1.1742 +        error(U_REGEX_RULE_SYNTAX);   //  -- or && at the end of a set.  Illegal.
  1.1743 +        break;
  1.1744 +
  1.1745 +    case doSetPosixProp:
  1.1746 +        {
  1.1747 +            UnicodeSet *s = scanPosixProp();
  1.1748 +            if (s != NULL) {
  1.1749 +                UnicodeSet *tos = (UnicodeSet *)fSetStack.peek();
  1.1750 +                tos->addAll(*s);
  1.1751 +                delete s;
  1.1752 +            }  // else error.  scanProp() reported the error status already.
  1.1753 +        }
  1.1754 +        break;
  1.1755 +        
  1.1756 +    case doSetProp:
  1.1757 +        //  Scanned a \p \P within [brackets].
  1.1758 +        {
  1.1759 +            UnicodeSet *s = scanProp();
  1.1760 +            if (s != NULL) {
  1.1761 +                UnicodeSet *tos = (UnicodeSet *)fSetStack.peek();
  1.1762 +                tos->addAll(*s);
  1.1763 +                delete s;
  1.1764 +            }  // else error.  scanProp() reported the error status already.
  1.1765 +        }
  1.1766 +        break;
  1.1767 +
  1.1768 +
  1.1769 +    case doSetRange:
  1.1770 +        // We have scanned literal-literal.  Add the range to the set.
  1.1771 +        // The left character is already in the set, and is saved in fLastSetLiteral.
  1.1772 +        // The right side is the current character.
  1.1773 +        // Lower Limit > Upper limit being an error matches both Java
  1.1774 +        //        and ICU UnicodeSet behavior.
  1.1775 +        {
  1.1776 +        if (fLastSetLiteral > fC.fChar) {
  1.1777 +            error(U_REGEX_INVALID_RANGE);  
  1.1778 +        }
  1.1779 +        UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
  1.1780 +        s->add(fLastSetLiteral, fC.fChar);
  1.1781 +        break;
  1.1782 +        }
  1.1783 +
  1.1784 +    default:
  1.1785 +        U_ASSERT(FALSE);
  1.1786 +        error(U_REGEX_INTERNAL_ERROR);
  1.1787 +        break;
  1.1788 +    }
  1.1789 +
  1.1790 +    if (U_FAILURE(*fStatus)) {
  1.1791 +        returnVal = FALSE;
  1.1792 +    }
  1.1793 +
  1.1794 +    return returnVal;
  1.1795 +}
  1.1796 +
  1.1797 +
  1.1798 +
  1.1799 +//------------------------------------------------------------------------------
  1.1800 +//
  1.1801 +//   literalChar           We've encountered a literal character from the pattern,
  1.1802 +//                             or an escape sequence that reduces to a character.
  1.1803 +//                         Add it to the string containing all literal chars/strings from
  1.1804 +//                             the pattern.
  1.1805 +//
  1.1806 +//------------------------------------------------------------------------------
  1.1807 +void RegexCompile::literalChar(UChar32 c)  {
  1.1808 +    fLiteralChars.append(c);
  1.1809 +}
  1.1810 +
  1.1811 +
  1.1812 +//------------------------------------------------------------------------------
  1.1813 +//
  1.1814 +//    fixLiterals           When compiling something that can follow a literal
  1.1815 +//                          string in a pattern, emit the code to match the
  1.1816 +//                          accumulated literal string.
  1.1817 +//
  1.1818 +//                          Optionally, split the last char of the string off into
  1.1819 +//                          a single "ONE_CHAR" operation, so that quantifiers can
  1.1820 +//                          apply to that char alone.  Example:   abc*
  1.1821 +//                          The * must apply to the 'c' only.
  1.1822 +//
  1.1823 +//------------------------------------------------------------------------------
  1.1824 +void    RegexCompile::fixLiterals(UBool split) {
  1.1825 +    int32_t  op = 0;                       // An op from/for the compiled pattern.
  1.1826 +
  1.1827 +    // If no literal characters have been scanned but not yet had code generated
  1.1828 +    //   for them, nothing needs to be done.
  1.1829 +    if (fLiteralChars.length() == 0) {
  1.1830 +        return;
  1.1831 +    }
  1.1832 +
  1.1833 +    int32_t indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1);
  1.1834 +    UChar32 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
  1.1835 +
  1.1836 +    // Split:  We need to  ensure that the last item in the compiled pattern 
  1.1837 +    //     refers only to the last literal scanned in the pattern, so that
  1.1838 +    //     quantifiers (*, +, etc.) affect only it, and not a longer string.
  1.1839 +    //     Split before case folding for case insensitive matches.
  1.1840 +
  1.1841 +    if (split) {
  1.1842 +        fLiteralChars.truncate(indexOfLastCodePoint);
  1.1843 +        fixLiterals(FALSE);   // Recursive call, emit code to match the first part of the string.
  1.1844 +                              //  Note that the truncated literal string may be empty, in which case
  1.1845 +                              //  nothing will be emitted.
  1.1846 +
  1.1847 +        literalChar(lastCodePoint);  // Re-add the last code point as if it were a new literal.
  1.1848 +        fixLiterals(FALSE);          // Second recursive call, code for the final code point.
  1.1849 +        return;
  1.1850 +    }
  1.1851 +
  1.1852 +    // If we are doing case-insensitive matching, case fold the string.  This may expand
  1.1853 +    //   the string, e.g. the German sharp-s turns into "ss"
  1.1854 +    if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
  1.1855 +        fLiteralChars.foldCase();
  1.1856 +        indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1);
  1.1857 +        lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
  1.1858 +    }
  1.1859 +
  1.1860 +    if (indexOfLastCodePoint == 0) {
  1.1861 +        // Single character, emit a URX_ONECHAR op to match it.
  1.1862 +        if ((fModeFlags & UREGEX_CASE_INSENSITIVE) && 
  1.1863 +                 u_hasBinaryProperty(lastCodePoint, UCHAR_CASE_SENSITIVE)) {
  1.1864 +            op = URX_BUILD(URX_ONECHAR_I, lastCodePoint);
  1.1865 +        } else {
  1.1866 +            op = URX_BUILD(URX_ONECHAR, lastCodePoint);
  1.1867 +        }
  1.1868 +        fRXPat->fCompiledPat->addElement(op, *fStatus);
  1.1869 +    } else {
  1.1870 +        // Two or more chars, emit a URX_STRING to match them.
  1.1871 +        if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
  1.1872 +            op = URX_BUILD(URX_STRING_I, fRXPat->fLiteralText.length());
  1.1873 +        } else {
  1.1874 +            // TODO here:  add optimization to split case sensitive strings of length two
  1.1875 +            //             into two single char ops, for efficiency.
  1.1876 +            op = URX_BUILD(URX_STRING, fRXPat->fLiteralText.length());
  1.1877 +        }
  1.1878 +        fRXPat->fCompiledPat->addElement(op, *fStatus);
  1.1879 +        op = URX_BUILD(URX_STRING_LEN, fLiteralChars.length());
  1.1880 +        fRXPat->fCompiledPat->addElement(op, *fStatus);
  1.1881 +        
  1.1882 +        // Add this string into the accumulated strings of the compiled pattern.
  1.1883 +        fRXPat->fLiteralText.append(fLiteralChars);
  1.1884 +    }
  1.1885 +
  1.1886 +    fLiteralChars.remove();
  1.1887 +}
  1.1888 +
  1.1889 +
  1.1890 +
  1.1891 +
  1.1892 +
  1.1893 +
  1.1894 +//------------------------------------------------------------------------------
  1.1895 +//
  1.1896 +//   insertOp()             Insert a slot for a new opcode into the already
  1.1897 +//                          compiled pattern code.
  1.1898 +//
  1.1899 +//                          Fill the slot with a NOP.  Our caller will replace it
  1.1900 +//                          with what they really wanted.
  1.1901 +//
  1.1902 +//------------------------------------------------------------------------------
  1.1903 +void   RegexCompile::insertOp(int32_t where) {
  1.1904 +    UVector64 *code = fRXPat->fCompiledPat;
  1.1905 +    U_ASSERT(where>0 && where < code->size());
  1.1906 +
  1.1907 +    int32_t  nop = URX_BUILD(URX_NOP, 0);
  1.1908 +    code->insertElementAt(nop, where, *fStatus);
  1.1909 +
  1.1910 +    // Walk through the pattern, looking for any ops with targets that
  1.1911 +    //  were moved down by the insert.  Fix them.
  1.1912 +    int32_t loc;
  1.1913 +    for (loc=0; loc<code->size(); loc++) {
  1.1914 +        int32_t op = (int32_t)code->elementAti(loc);
  1.1915 +        int32_t opType = URX_TYPE(op);
  1.1916 +        int32_t opValue = URX_VAL(op);
  1.1917 +        if ((opType == URX_JMP         ||
  1.1918 +            opType == URX_JMPX         ||
  1.1919 +            opType == URX_STATE_SAVE   ||
  1.1920 +            opType == URX_CTR_LOOP     ||
  1.1921 +            opType == URX_CTR_LOOP_NG  ||
  1.1922 +            opType == URX_JMP_SAV      ||
  1.1923 +            opType == URX_JMP_SAV_X    ||
  1.1924 +            opType == URX_RELOC_OPRND)    && opValue > where) {
  1.1925 +            // Target location for this opcode is after the insertion point and
  1.1926 +            //   needs to be incremented to adjust for the insertion.
  1.1927 +            opValue++;
  1.1928 +            op = URX_BUILD(opType, opValue);
  1.1929 +            code->setElementAt(op, loc);
  1.1930 +        }
  1.1931 +    }
  1.1932 +
  1.1933 +    // Now fix up the parentheses stack.  All positive values in it are locations in
  1.1934 +    //  the compiled pattern.   (Negative values are frame boundaries, and don't need fixing.)
  1.1935 +    for (loc=0; loc<fParenStack.size(); loc++) {
  1.1936 +        int32_t x = fParenStack.elementAti(loc);
  1.1937 +        U_ASSERT(x < code->size());
  1.1938 +        if (x>where) {
  1.1939 +            x++;
  1.1940 +            fParenStack.setElementAt(x, loc);
  1.1941 +        }
  1.1942 +    }
  1.1943 +
  1.1944 +    if (fMatchCloseParen > where) {
  1.1945 +        fMatchCloseParen++;
  1.1946 +    }
  1.1947 +    if (fMatchOpenParen > where) {
  1.1948 +        fMatchOpenParen++;
  1.1949 +    }
  1.1950 +}
  1.1951 +
  1.1952 +
  1.1953 +
  1.1954 +//------------------------------------------------------------------------------
  1.1955 +//
  1.1956 +//   blockTopLoc()          Find or create a location in the compiled pattern
  1.1957 +//                          at the start of the operation or block that has
  1.1958 +//                          just been compiled.  Needed when a quantifier (* or
  1.1959 +//                          whatever) appears, and we need to add an operation
  1.1960 +//                          at the start of the thing being quantified.
  1.1961 +//
  1.1962 +//                          (Parenthesized Blocks) have a slot with a NOP that
  1.1963 +//                          is reserved for this purpose.  .* or similar don't
  1.1964 +//                          and a slot needs to be added.
  1.1965 +//
  1.1966 +//       parameter reserveLoc   :  TRUE -  ensure that there is space to add an opcode
  1.1967 +//                                         at the returned location.
  1.1968 +//                                 FALSE - just return the address,
  1.1969 +//                                         do not reserve a location there.
  1.1970 +//
  1.1971 +//------------------------------------------------------------------------------
  1.1972 +int32_t   RegexCompile::blockTopLoc(UBool reserveLoc) {
  1.1973 +    int32_t   theLoc;
  1.1974 +    fixLiterals(TRUE);  // Emit code for any pending literals.
  1.1975 +                        //   If last item was a string, emit separate op for the its last char.
  1.1976 +    if (fRXPat->fCompiledPat->size() == fMatchCloseParen)
  1.1977 +    {
  1.1978 +        // The item just processed is a parenthesized block.
  1.1979 +        theLoc = fMatchOpenParen;   // A slot is already reserved for us.
  1.1980 +        U_ASSERT(theLoc > 0);
  1.1981 +        U_ASSERT(URX_TYPE(((uint32_t)fRXPat->fCompiledPat->elementAti(theLoc))) == URX_NOP);
  1.1982 +    }
  1.1983 +    else {
  1.1984 +        // Item just compiled is a single thing, a ".", or a single char, a string or a set reference.
  1.1985 +        // No slot for STATE_SAVE was pre-reserved in the compiled code.
  1.1986 +        // We need to make space now.
  1.1987 +        theLoc = fRXPat->fCompiledPat->size()-1;
  1.1988 +        int32_t opAtTheLoc = (int32_t)fRXPat->fCompiledPat->elementAti(theLoc);
  1.1989 +        if (URX_TYPE(opAtTheLoc) == URX_STRING_LEN) {
  1.1990 +            // Strings take two opcode, we want the position of the first one.
  1.1991 +            // We can have a string at this point if a single character case-folded to two.
  1.1992 +            theLoc--;
  1.1993 +        }
  1.1994 +        if (reserveLoc) {
  1.1995 +            int32_t  nop = URX_BUILD(URX_NOP, 0);
  1.1996 +            fRXPat->fCompiledPat->insertElementAt(nop, theLoc, *fStatus);
  1.1997 +        }
  1.1998 +    }
  1.1999 +    return theLoc;
  1.2000 +}
  1.2001 +
  1.2002 +
  1.2003 +
  1.2004 +//------------------------------------------------------------------------------
  1.2005 +//
  1.2006 +//    handleCloseParen      When compiling a close paren, we need to go back
  1.2007 +//                          and fix up any JMP or SAVE operations within the
  1.2008 +//                          parenthesized block that need to target the end
  1.2009 +//                          of the block.  The locations of these are kept on
  1.2010 +//                          the paretheses stack.
  1.2011 +//
  1.2012 +//                          This function is called both when encountering a
  1.2013 +//                          real ) and at the end of the pattern.
  1.2014 +//
  1.2015 +//------------------------------------------------------------------------------
  1.2016 +void  RegexCompile::handleCloseParen() {
  1.2017 +    int32_t   patIdx;
  1.2018 +    int32_t   patOp;
  1.2019 +    if (fParenStack.size() <= 0) {
  1.2020 +        error(U_REGEX_MISMATCHED_PAREN);
  1.2021 +        return;
  1.2022 +    }
  1.2023 +
  1.2024 +    // Emit code for any pending literals.
  1.2025 +    fixLiterals(FALSE);
  1.2026 +
  1.2027 +    // Fixup any operations within the just-closed parenthesized group
  1.2028 +    //    that need to reference the end of the (block).
  1.2029 +    //    (The first one popped from the stack is an unused slot for
  1.2030 +    //     alternation (OR) state save, but applying the fixup to it does no harm.)
  1.2031 +    for (;;) {
  1.2032 +        patIdx = fParenStack.popi();
  1.2033 +        if (patIdx < 0) {
  1.2034 +            // value < 0 flags the start of the frame on the paren stack.
  1.2035 +            break;
  1.2036 +        }
  1.2037 +        U_ASSERT(patIdx>0 && patIdx <= fRXPat->fCompiledPat->size());
  1.2038 +        patOp = (int32_t)fRXPat->fCompiledPat->elementAti(patIdx);
  1.2039 +        U_ASSERT(URX_VAL(patOp) == 0);          // Branch target for JMP should not be set.
  1.2040 +        patOp |= fRXPat->fCompiledPat->size();  // Set it now.
  1.2041 +        fRXPat->fCompiledPat->setElementAt(patOp, patIdx);
  1.2042 +        fMatchOpenParen     = patIdx;
  1.2043 +    }
  1.2044 +
  1.2045 +    //  At the close of any parenthesized block, restore the match mode flags  to
  1.2046 +    //  the value they had at the open paren.  Saved value is
  1.2047 +    //  at the top of the paren stack.
  1.2048 +    fModeFlags = fParenStack.popi();
  1.2049 +    U_ASSERT(fModeFlags < 0);
  1.2050 +
  1.2051 +    // DO any additional fixups, depending on the specific kind of
  1.2052 +    // parentesized grouping this is
  1.2053 +
  1.2054 +    switch (patIdx) {
  1.2055 +    case plain:
  1.2056 +    case flags:
  1.2057 +        // No additional fixups required.
  1.2058 +        //   (Grouping-only parentheses)
  1.2059 +        break;
  1.2060 +    case capturing:
  1.2061 +        // Capturing Parentheses.
  1.2062 +        //   Insert a End Capture op into the pattern.
  1.2063 +        //   The frame offset of the variables for this cg is obtained from the
  1.2064 +        //       start capture op and put it into the end-capture op.
  1.2065 +        {
  1.2066 +            int32_t   captureOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
  1.2067 +            U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE);
  1.2068 +
  1.2069 +            int32_t   frameVarLocation = URX_VAL(captureOp);
  1.2070 +            int32_t   endCaptureOp = URX_BUILD(URX_END_CAPTURE, frameVarLocation);
  1.2071 +            fRXPat->fCompiledPat->addElement(endCaptureOp, *fStatus);
  1.2072 +        }
  1.2073 +        break;
  1.2074 +    case atomic:
  1.2075 +        // Atomic Parenthesis.
  1.2076 +        //   Insert a LD_SP operation to restore the state stack to the position
  1.2077 +        //   it was when the atomic parens were entered.
  1.2078 +        {
  1.2079 +            int32_t   stoOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
  1.2080 +            U_ASSERT(URX_TYPE(stoOp) == URX_STO_SP);
  1.2081 +            int32_t   stoLoc = URX_VAL(stoOp);
  1.2082 +            int32_t   ldOp   = URX_BUILD(URX_LD_SP, stoLoc);
  1.2083 +            fRXPat->fCompiledPat->addElement(ldOp, *fStatus);
  1.2084 +        }
  1.2085 +        break;
  1.2086 +
  1.2087 +    case lookAhead:
  1.2088 +        {
  1.2089 +            int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5);
  1.2090 +            U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
  1.2091 +            int32_t dataLoc  = URX_VAL(startOp);
  1.2092 +            int32_t op       = URX_BUILD(URX_LA_END, dataLoc);
  1.2093 +            fRXPat->fCompiledPat->addElement(op, *fStatus);
  1.2094 +        }
  1.2095 +        break;
  1.2096 +
  1.2097 +    case negLookAhead:
  1.2098 +        {
  1.2099 +            // See comment at doOpenLookAheadNeg
  1.2100 +            int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-1);
  1.2101 +            U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
  1.2102 +            int32_t dataLoc  = URX_VAL(startOp);
  1.2103 +            int32_t op       = URX_BUILD(URX_LA_END, dataLoc);
  1.2104 +            fRXPat->fCompiledPat->addElement(op, *fStatus);
  1.2105 +            op               = URX_BUILD(URX_BACKTRACK, 0);
  1.2106 +            fRXPat->fCompiledPat->addElement(op, *fStatus);
  1.2107 +            op               = URX_BUILD(URX_LA_END, dataLoc);
  1.2108 +            fRXPat->fCompiledPat->addElement(op, *fStatus);
  1.2109 +
  1.2110 +            // Patch the URX_SAVE near the top of the block.
  1.2111 +            // The destination of the SAVE is the final LA_END that was just added.
  1.2112 +            int32_t saveOp   = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen);
  1.2113 +            U_ASSERT(URX_TYPE(saveOp) == URX_STATE_SAVE);
  1.2114 +            int32_t dest     = fRXPat->fCompiledPat->size()-1;
  1.2115 +            saveOp           = URX_BUILD(URX_STATE_SAVE, dest);
  1.2116 +            fRXPat->fCompiledPat->setElementAt(saveOp, fMatchOpenParen);
  1.2117 +        }
  1.2118 +        break;
  1.2119 +
  1.2120 +    case lookBehind:
  1.2121 +        {
  1.2122 +            // See comment at doOpenLookBehind.
  1.2123 +
  1.2124 +            // Append the URX_LB_END and URX_LA_END to the compiled pattern.
  1.2125 +            int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-4);
  1.2126 +            U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
  1.2127 +            int32_t dataLoc  = URX_VAL(startOp);
  1.2128 +            int32_t op       = URX_BUILD(URX_LB_END, dataLoc);
  1.2129 +            fRXPat->fCompiledPat->addElement(op, *fStatus);
  1.2130 +                    op       = URX_BUILD(URX_LA_END, dataLoc);
  1.2131 +            fRXPat->fCompiledPat->addElement(op, *fStatus);
  1.2132 +
  1.2133 +            // Determine the min and max bounds for the length of the
  1.2134 +            //  string that the pattern can match.
  1.2135 +            //  An unbounded upper limit is an error.
  1.2136 +            int32_t patEnd   = fRXPat->fCompiledPat->size() - 1;
  1.2137 +            int32_t minML    = minMatchLength(fMatchOpenParen, patEnd);
  1.2138 +            int32_t maxML    = maxMatchLength(fMatchOpenParen, patEnd);
  1.2139 +            if (maxML == INT32_MAX) {
  1.2140 +                error(U_REGEX_LOOK_BEHIND_LIMIT);
  1.2141 +                break;
  1.2142 +            }
  1.2143 +            U_ASSERT(minML <= maxML);
  1.2144 +
  1.2145 +            // Insert the min and max match len bounds into the URX_LB_CONT op that
  1.2146 +            //  appears at the top of the look-behind block, at location fMatchOpenParen+1
  1.2147 +            fRXPat->fCompiledPat->setElementAt(minML,  fMatchOpenParen-2);
  1.2148 +            fRXPat->fCompiledPat->setElementAt(maxML,  fMatchOpenParen-1);
  1.2149 +
  1.2150 +        }
  1.2151 +        break;
  1.2152 +
  1.2153 +
  1.2154 +
  1.2155 +    case lookBehindN:
  1.2156 +        {
  1.2157 +            // See comment at doOpenLookBehindNeg.
  1.2158 +
  1.2159 +            // Append the URX_LBN_END to the compiled pattern.
  1.2160 +            int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5);
  1.2161 +            U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
  1.2162 +            int32_t dataLoc  = URX_VAL(startOp);
  1.2163 +            int32_t op       = URX_BUILD(URX_LBN_END, dataLoc);
  1.2164 +            fRXPat->fCompiledPat->addElement(op, *fStatus);
  1.2165 +
  1.2166 +            // Determine the min and max bounds for the length of the
  1.2167 +            //  string that the pattern can match.
  1.2168 +            //  An unbounded upper limit is an error.
  1.2169 +            int32_t patEnd   = fRXPat->fCompiledPat->size() - 1;
  1.2170 +            int32_t minML    = minMatchLength(fMatchOpenParen, patEnd);
  1.2171 +            int32_t maxML    = maxMatchLength(fMatchOpenParen, patEnd);
  1.2172 +            if (maxML == INT32_MAX) {
  1.2173 +                error(U_REGEX_LOOK_BEHIND_LIMIT);
  1.2174 +                break;
  1.2175 +            }
  1.2176 +            U_ASSERT(minML <= maxML);
  1.2177 +
  1.2178 +            // Insert the min and max match len bounds into the URX_LB_CONT op that
  1.2179 +            //  appears at the top of the look-behind block, at location fMatchOpenParen+1
  1.2180 +            fRXPat->fCompiledPat->setElementAt(minML,  fMatchOpenParen-3);
  1.2181 +            fRXPat->fCompiledPat->setElementAt(maxML,  fMatchOpenParen-2);
  1.2182 +
  1.2183 +            // Insert the pattern location to continue at after a successful match
  1.2184 +            //  as the last operand of the URX_LBN_CONT
  1.2185 +            op = URX_BUILD(URX_RELOC_OPRND, fRXPat->fCompiledPat->size());
  1.2186 +            fRXPat->fCompiledPat->setElementAt(op,  fMatchOpenParen-1);
  1.2187 +        }
  1.2188 +        break;
  1.2189 +
  1.2190 +
  1.2191 +
  1.2192 +    default:
  1.2193 +        U_ASSERT(FALSE);
  1.2194 +    }
  1.2195 +
  1.2196 +    // remember the next location in the compiled pattern.
  1.2197 +    // The compilation of Quantifiers will look at this to see whether its looping
  1.2198 +    //   over a parenthesized block or a single item
  1.2199 +    fMatchCloseParen = fRXPat->fCompiledPat->size();
  1.2200 +}
  1.2201 +
  1.2202 +
  1.2203 +
  1.2204 +//------------------------------------------------------------------------------
  1.2205 +//
  1.2206 +//   compileSet       Compile the pattern operations for a reference to a
  1.2207 +//                    UnicodeSet.
  1.2208 +//
  1.2209 +//------------------------------------------------------------------------------
  1.2210 +void        RegexCompile::compileSet(UnicodeSet *theSet)
  1.2211 +{
  1.2212 +    if (theSet == NULL) {
  1.2213 +        return;
  1.2214 +    }
  1.2215 +    //  Remove any strings from the set.
  1.2216 +    //  There shoudn't be any, but just in case.
  1.2217 +    //     (Case Closure can add them; if we had a simple case closure avaialble that
  1.2218 +    //      ignored strings, that would be better.)
  1.2219 +    theSet->removeAllStrings();
  1.2220 +    int32_t  setSize = theSet->size();
  1.2221 +
  1.2222 +    switch (setSize) {
  1.2223 +    case 0:
  1.2224 +        {
  1.2225 +            // Set of no elements.   Always fails to match.
  1.2226 +            fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKTRACK, 0), *fStatus);
  1.2227 +            delete theSet;
  1.2228 +        }
  1.2229 +        break;
  1.2230 +
  1.2231 +    case 1:
  1.2232 +        {
  1.2233 +            // The set contains only a single code point.  Put it into
  1.2234 +            //   the compiled pattern as a single char operation rather
  1.2235 +            //   than a set, and discard the set itself.
  1.2236 +            literalChar(theSet->charAt(0));
  1.2237 +            delete theSet;
  1.2238 +        }
  1.2239 +        break;
  1.2240 +
  1.2241 +    default:
  1.2242 +        {
  1.2243 +            //  The set contains two or more chars.  (the normal case)
  1.2244 +            //  Put it into the compiled pattern as a set.
  1.2245 +            int32_t setNumber = fRXPat->fSets->size();
  1.2246 +            fRXPat->fSets->addElement(theSet, *fStatus);
  1.2247 +            int32_t setOp = URX_BUILD(URX_SETREF, setNumber);
  1.2248 +            fRXPat->fCompiledPat->addElement(setOp, *fStatus);
  1.2249 +        }
  1.2250 +    }
  1.2251 +}
  1.2252 +
  1.2253 +
  1.2254 +//------------------------------------------------------------------------------
  1.2255 +//
  1.2256 +//   compileInterval    Generate the code for a {min, max} style interval quantifier.
  1.2257 +//                      Except for the specific opcodes used, the code is the same
  1.2258 +//                      for all three types (greedy, non-greedy, possessive) of
  1.2259 +//                      intervals.  The opcodes are supplied as parameters.
  1.2260 +//                      (There are two sets of opcodes - greedy & possessive use the
  1.2261 +//                      same ones, while non-greedy has it's own.)
  1.2262 +//
  1.2263 +//                      The code for interval loops has this form:
  1.2264 +//                         0  CTR_INIT   counter loc (in stack frame)
  1.2265 +//                         1             5  patt address of CTR_LOOP at bottom of block
  1.2266 +//                         2             min count
  1.2267 +//                         3             max count   (-1 for unbounded)
  1.2268 +//                         4  ...        block to be iterated over
  1.2269 +//                         5  CTR_LOOP
  1.2270 +//
  1.2271 +//                       In
  1.2272 +//------------------------------------------------------------------------------
  1.2273 +void        RegexCompile::compileInterval(int32_t InitOp,  int32_t LoopOp)
  1.2274 +{
  1.2275 +    // The CTR_INIT op at the top of the block with the {n,m} quantifier takes
  1.2276 +    //   four slots in the compiled code.  Reserve them.
  1.2277 +    int32_t   topOfBlock = blockTopLoc(TRUE);
  1.2278 +    insertOp(topOfBlock);
  1.2279 +    insertOp(topOfBlock);
  1.2280 +    insertOp(topOfBlock);
  1.2281 +
  1.2282 +    // The operands for the CTR_INIT opcode include the index in the matcher data
  1.2283 +    //   of the counter.  Allocate it now. There are two data items
  1.2284 +    //        counterLoc   -->  Loop counter
  1.2285 +    //               +1    -->  Input index (for breaking non-progressing loops)
  1.2286 +    //                          (Only present if unbounded upper limit on loop)
  1.2287 +    int32_t   counterLoc = fRXPat->fFrameSize;
  1.2288 +    fRXPat->fFrameSize++;
  1.2289 +    if (fIntervalUpper < 0) {
  1.2290 +        fRXPat->fFrameSize++;
  1.2291 +    }
  1.2292 +
  1.2293 +    int32_t   op = URX_BUILD(InitOp, counterLoc);
  1.2294 +    fRXPat->fCompiledPat->setElementAt(op, topOfBlock);
  1.2295 +
  1.2296 +    // The second operand of CTR_INIT is the location following the end of the loop.
  1.2297 +    //   Must put in as a URX_RELOC_OPRND so that the value will be adjusted if the
  1.2298 +    //   compilation of something later on causes the code to grow and the target
  1.2299 +    //   position to move.
  1.2300 +    int32_t loopEnd = fRXPat->fCompiledPat->size();
  1.2301 +    op = URX_BUILD(URX_RELOC_OPRND, loopEnd);
  1.2302 +    fRXPat->fCompiledPat->setElementAt(op, topOfBlock+1);
  1.2303 +
  1.2304 +    // Followed by the min and max counts.
  1.2305 +    fRXPat->fCompiledPat->setElementAt(fIntervalLow, topOfBlock+2);
  1.2306 +    fRXPat->fCompiledPat->setElementAt(fIntervalUpper, topOfBlock+3);
  1.2307 +
  1.2308 +    // Apend the CTR_LOOP op.  The operand is the location of the CTR_INIT op.
  1.2309 +    //   Goes at end of the block being looped over, so just append to the code so far.
  1.2310 +    op = URX_BUILD(LoopOp, topOfBlock);
  1.2311 +    fRXPat->fCompiledPat->addElement(op, *fStatus);
  1.2312 +
  1.2313 +    if ((fIntervalLow & 0xff000000) != 0 ||
  1.2314 +        (fIntervalUpper > 0 && (fIntervalUpper & 0xff000000) != 0)) {
  1.2315 +            error(U_REGEX_NUMBER_TOO_BIG);
  1.2316 +        }
  1.2317 +
  1.2318 +    if (fIntervalLow > fIntervalUpper && fIntervalUpper != -1) {
  1.2319 +        error(U_REGEX_MAX_LT_MIN);
  1.2320 +    }
  1.2321 +}
  1.2322 +
  1.2323 +
  1.2324 +
  1.2325 +UBool RegexCompile::compileInlineInterval() {
  1.2326 +    if (fIntervalUpper > 10 || fIntervalUpper < fIntervalLow) {
  1.2327 +        // Too big to inline.  Fail, which will cause looping code to be generated.
  1.2328 +        //   (Upper < Lower picks up unbounded upper and errors, both.)
  1.2329 +        return FALSE;
  1.2330 +    }
  1.2331 +
  1.2332 +    int32_t   topOfBlock = blockTopLoc(FALSE);
  1.2333 +    if (fIntervalUpper == 0) {
  1.2334 +        // Pathological case.  Attempt no matches, as if the block doesn't exist.
  1.2335 +        fRXPat->fCompiledPat->setSize(topOfBlock);
  1.2336 +        return TRUE;
  1.2337 +    }
  1.2338 +
  1.2339 +    if (topOfBlock != fRXPat->fCompiledPat->size()-1 && fIntervalUpper != 1) {
  1.2340 +        // The thing being repeated is not a single op, but some
  1.2341 +        //   more complex block.  Do it as a loop, not inlines.
  1.2342 +        //   Note that things "repeated" a max of once are handled as inline, because
  1.2343 +        //     the one copy of the code already generated is just fine.
  1.2344 +        return FALSE;
  1.2345 +    }
  1.2346 +
  1.2347 +    // Pick up the opcode that is to be repeated
  1.2348 +    //
  1.2349 +    int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(topOfBlock);
  1.2350 +
  1.2351 +    // Compute the pattern location where the inline sequence
  1.2352 +    //   will end, and set up the state save op that will be needed.
  1.2353 +    //
  1.2354 +    int32_t endOfSequenceLoc = fRXPat->fCompiledPat->size()-1
  1.2355 +                                + fIntervalUpper + (fIntervalUpper-fIntervalLow);
  1.2356 +    int32_t saveOp = URX_BUILD(URX_STATE_SAVE, endOfSequenceLoc);
  1.2357 +    if (fIntervalLow == 0) {
  1.2358 +        insertOp(topOfBlock);
  1.2359 +        fRXPat->fCompiledPat->setElementAt(saveOp, topOfBlock);
  1.2360 +    }
  1.2361 +
  1.2362 +
  1.2363 +
  1.2364 +    //  Loop, emitting the op for the thing being repeated each time.
  1.2365 +    //    Loop starts at 1 because one instance of the op already exists in the pattern,
  1.2366 +    //    it was put there when it was originally encountered.
  1.2367 +    int32_t i;
  1.2368 +    for (i=1; i<fIntervalUpper; i++ ) {
  1.2369 +        if (i == fIntervalLow) {
  1.2370 +            fRXPat->fCompiledPat->addElement(saveOp, *fStatus);
  1.2371 +        }
  1.2372 +        if (i > fIntervalLow) {
  1.2373 +            fRXPat->fCompiledPat->addElement(saveOp, *fStatus);
  1.2374 +        }
  1.2375 +        fRXPat->fCompiledPat->addElement(op, *fStatus);
  1.2376 +    }
  1.2377 +    return TRUE;
  1.2378 +}
  1.2379 +
  1.2380 +
  1.2381 +
  1.2382 +//------------------------------------------------------------------------------
  1.2383 +//
  1.2384 +//   matchStartType    Determine how a match can start.
  1.2385 +//                     Used to optimize find() operations.
  1.2386 +//
  1.2387 +//                     Operation is very similar to minMatchLength().  Walk the compiled
  1.2388 +//                     pattern, keeping an on-going minimum-match-length.  For any
  1.2389 +//                     op where the min match coming in is zero, add that ops possible
  1.2390 +//                     starting matches to the possible starts for the overall pattern.
  1.2391 +//
  1.2392 +//------------------------------------------------------------------------------
  1.2393 +void   RegexCompile::matchStartType() {
  1.2394 +    if (U_FAILURE(*fStatus)) {
  1.2395 +        return;
  1.2396 +    }
  1.2397 +
  1.2398 +
  1.2399 +    int32_t    loc;                    // Location in the pattern of the current op being processed.
  1.2400 +    int32_t    op;                     // The op being processed
  1.2401 +    int32_t    opType;                 // The opcode type of the op
  1.2402 +    int32_t    currentLen = 0;         // Minimum length of a match to this point (loc) in the pattern
  1.2403 +    int32_t    numInitialStrings = 0;  // Number of strings encountered that could match at start.
  1.2404 +
  1.2405 +    UBool      atStart = TRUE;         // True if no part of the pattern yet encountered
  1.2406 +                                       //   could have advanced the position in a match.
  1.2407 +                                       //   (Maximum match length so far == 0)
  1.2408 +
  1.2409 +    // forwardedLength is a vector holding minimum-match-length values that
  1.2410 +    //   are propagated forward in the pattern by JMP or STATE_SAVE operations.
  1.2411 +    //   It must be one longer than the pattern being checked because some  ops
  1.2412 +    //   will jmp to a end-of-block+1 location from within a block, and we must
  1.2413 +    //   count those when checking the block.
  1.2414 +    int32_t end = fRXPat->fCompiledPat->size();
  1.2415 +    UVector32  forwardedLength(end+1, *fStatus);
  1.2416 +    forwardedLength.setSize(end+1);
  1.2417 +    for (loc=3; loc<end; loc++) {
  1.2418 +        forwardedLength.setElementAt(INT32_MAX, loc);
  1.2419 +    }
  1.2420 +
  1.2421 +    for (loc = 3; loc<end; loc++) {
  1.2422 +        op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
  1.2423 +        opType = URX_TYPE(op);
  1.2424 +
  1.2425 +        // The loop is advancing linearly through the pattern.
  1.2426 +        // If the op we are now at was the destination of a branch in the pattern,
  1.2427 +        // and that path has a shorter minimum length than the current accumulated value,
  1.2428 +        // replace the current accumulated value.
  1.2429 +        if (forwardedLength.elementAti(loc) < currentLen) {
  1.2430 +            currentLen = forwardedLength.elementAti(loc);
  1.2431 +            U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
  1.2432 +        }
  1.2433 +
  1.2434 +        switch (opType) {
  1.2435 +            // Ops that don't change the total length matched
  1.2436 +        case URX_RESERVED_OP:
  1.2437 +        case URX_END:
  1.2438 +        case URX_FAIL:
  1.2439 +        case URX_STRING_LEN:
  1.2440 +        case URX_NOP:
  1.2441 +        case URX_START_CAPTURE:
  1.2442 +        case URX_END_CAPTURE:
  1.2443 +        case URX_BACKSLASH_B:
  1.2444 +        case URX_BACKSLASH_BU:
  1.2445 +        case URX_BACKSLASH_G:
  1.2446 +        case URX_BACKSLASH_Z:
  1.2447 +        case URX_DOLLAR:
  1.2448 +        case URX_DOLLAR_M:
  1.2449 +        case URX_DOLLAR_D:
  1.2450 +        case URX_DOLLAR_MD:
  1.2451 +        case URX_RELOC_OPRND:
  1.2452 +        case URX_STO_INP_LOC:
  1.2453 +        case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match
  1.2454 +        case URX_BACKREF_I:
  1.2455 +                
  1.2456 +        case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match.
  1.2457 +        case URX_LD_SP:
  1.2458 +            break;
  1.2459 +
  1.2460 +        case URX_CARET:
  1.2461 +            if (atStart) {
  1.2462 +                fRXPat->fStartType = START_START;
  1.2463 +            }
  1.2464 +            break;
  1.2465 +
  1.2466 +        case URX_CARET_M:
  1.2467 +        case URX_CARET_M_UNIX:
  1.2468 +            if (atStart) {
  1.2469 +                fRXPat->fStartType = START_LINE;
  1.2470 +            }
  1.2471 +            break;
  1.2472 +
  1.2473 +        case URX_ONECHAR:
  1.2474 +            if (currentLen == 0) {
  1.2475 +                // This character could appear at the start of a match.
  1.2476 +                //   Add it to the set of possible starting characters.
  1.2477 +                fRXPat->fInitialChars->add(URX_VAL(op));
  1.2478 +                numInitialStrings += 2;
  1.2479 +            }
  1.2480 +            currentLen++;
  1.2481 +            atStart = FALSE;
  1.2482 +            break;
  1.2483 +
  1.2484 +
  1.2485 +        case URX_SETREF:
  1.2486 +            if (currentLen == 0) {
  1.2487 +                int32_t  sn = URX_VAL(op);
  1.2488 +                U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
  1.2489 +                const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn);
  1.2490 +                fRXPat->fInitialChars->addAll(*s);
  1.2491 +                numInitialStrings += 2;
  1.2492 +            }
  1.2493 +            currentLen++;
  1.2494 +            atStart = FALSE;
  1.2495 +            break;
  1.2496 +
  1.2497 +        case URX_LOOP_SR_I:
  1.2498 +            // [Set]*, like a SETREF, above, in what it can match,
  1.2499 +            //  but may not match at all, so currentLen is not incremented.
  1.2500 +            if (currentLen == 0) {
  1.2501 +                int32_t  sn = URX_VAL(op);
  1.2502 +                U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
  1.2503 +                const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn);
  1.2504 +                fRXPat->fInitialChars->addAll(*s);
  1.2505 +                numInitialStrings += 2;
  1.2506 +            }
  1.2507 +            atStart = FALSE;
  1.2508 +            break;
  1.2509 +
  1.2510 +        case URX_LOOP_DOT_I:
  1.2511 +            if (currentLen == 0) {
  1.2512 +                // .* at the start of a pattern.
  1.2513 +                //    Any character can begin the match.
  1.2514 +                fRXPat->fInitialChars->clear();
  1.2515 +                fRXPat->fInitialChars->complement();
  1.2516 +                numInitialStrings += 2;
  1.2517 +            }
  1.2518 +            atStart = FALSE;
  1.2519 +            break;
  1.2520 +
  1.2521 +
  1.2522 +        case URX_STATIC_SETREF:
  1.2523 +            if (currentLen == 0) {
  1.2524 +                int32_t  sn = URX_VAL(op);
  1.2525 +                U_ASSERT(sn>0 && sn<URX_LAST_SET);
  1.2526 +                const UnicodeSet *s = fRXPat->fStaticSets[sn];
  1.2527 +                fRXPat->fInitialChars->addAll(*s);
  1.2528 +                numInitialStrings += 2;
  1.2529 +            }
  1.2530 +            currentLen++;
  1.2531 +            atStart = FALSE;
  1.2532 +            break;
  1.2533 +
  1.2534 +
  1.2535 +
  1.2536 +        case URX_STAT_SETREF_N:
  1.2537 +            if (currentLen == 0) {
  1.2538 +                int32_t  sn = URX_VAL(op);
  1.2539 +                const UnicodeSet *s = fRXPat->fStaticSets[sn];
  1.2540 +                UnicodeSet sc(*s);
  1.2541 +                sc.complement();
  1.2542 +                fRXPat->fInitialChars->addAll(sc);
  1.2543 +                numInitialStrings += 2;
  1.2544 +            }
  1.2545 +            currentLen++;
  1.2546 +            atStart = FALSE;
  1.2547 +            break;
  1.2548 +
  1.2549 +
  1.2550 +
  1.2551 +        case URX_BACKSLASH_D:
  1.2552 +            // Digit Char
  1.2553 +             if (currentLen == 0) {
  1.2554 +                 UnicodeSet s;
  1.2555 +                 s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
  1.2556 +                 if (URX_VAL(op) != 0) {
  1.2557 +                     s.complement();
  1.2558 +                 }
  1.2559 +                 fRXPat->fInitialChars->addAll(s);
  1.2560 +                 numInitialStrings += 2;
  1.2561 +            }
  1.2562 +            currentLen++;
  1.2563 +            atStart = FALSE;
  1.2564 +            break;
  1.2565 +
  1.2566 +
  1.2567 +        case URX_ONECHAR_I:
  1.2568 +            // Case Insensitive Single Character.
  1.2569 +            if (currentLen == 0) {
  1.2570 +                UChar32  c = URX_VAL(op);
  1.2571 +                if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
  1.2572 +
  1.2573 +                    // Disable optimizations on first char of match.
  1.2574 +                    // TODO: Compute the set of chars that case fold to this char, or to
  1.2575 +                    //       a string that begins with this char.
  1.2576 +                    //       For simple case folding, this code worked:
  1.2577 +                    //   UnicodeSet s(c, c);
  1.2578 +                    //   s.closeOver(USET_CASE_INSENSITIVE);
  1.2579 +                    //   fRXPat->fInitialChars->addAll(s);
  1.2580 +
  1.2581 +                    fRXPat->fInitialChars->clear();
  1.2582 +                    fRXPat->fInitialChars->complement();
  1.2583 +                } else {
  1.2584 +                    // Char has no case variants.  Just add it as-is to the
  1.2585 +                    //   set of possible starting chars.
  1.2586 +                    fRXPat->fInitialChars->add(c);
  1.2587 +                }
  1.2588 +                numInitialStrings += 2;
  1.2589 +            }
  1.2590 +            currentLen++;
  1.2591 +            atStart = FALSE;
  1.2592 +            break;
  1.2593 +
  1.2594 +
  1.2595 +        case URX_BACKSLASH_X:   // Grahpeme Cluster.  Minimum is 1, max unbounded.
  1.2596 +        case URX_DOTANY_ALL:    // . matches one or two.
  1.2597 +        case URX_DOTANY:
  1.2598 +        case URX_DOTANY_UNIX:
  1.2599 +            if (currentLen == 0) {
  1.2600 +                // These constructs are all bad news when they appear at the start
  1.2601 +                //   of a match.  Any character can begin the match.
  1.2602 +                fRXPat->fInitialChars->clear();
  1.2603 +                fRXPat->fInitialChars->complement();
  1.2604 +                numInitialStrings += 2;
  1.2605 +            }
  1.2606 +            currentLen++;
  1.2607 +            atStart = FALSE;
  1.2608 +            break;
  1.2609 +
  1.2610 +
  1.2611 +        case URX_JMPX:
  1.2612 +            loc++;             // Except for extra operand on URX_JMPX, same as URX_JMP.
  1.2613 +        case URX_JMP:
  1.2614 +            {
  1.2615 +                int32_t  jmpDest = URX_VAL(op);
  1.2616 +                if (jmpDest < loc) {
  1.2617 +                    // Loop of some kind.  Can safely ignore, the worst that will happen
  1.2618 +                    //  is that we understate the true minimum length
  1.2619 +                    currentLen = forwardedLength.elementAti(loc+1);
  1.2620 +
  1.2621 +                } else {
  1.2622 +                    // Forward jump.  Propagate the current min length to the target loc of the jump.
  1.2623 +                    U_ASSERT(jmpDest <= end+1);
  1.2624 +                    if (forwardedLength.elementAti(jmpDest) > currentLen) {
  1.2625 +                        forwardedLength.setElementAt(currentLen, jmpDest);
  1.2626 +                    }
  1.2627 +                }
  1.2628 +            }
  1.2629 +            atStart = FALSE;
  1.2630 +            break;
  1.2631 +
  1.2632 +        case URX_JMP_SAV:
  1.2633 +        case URX_JMP_SAV_X:
  1.2634 +            // Combo of state save to the next loc, + jmp backwards.
  1.2635 +            //   Net effect on min. length computation is nothing.
  1.2636 +            atStart = FALSE;
  1.2637 +            break;
  1.2638 +
  1.2639 +        case URX_BACKTRACK:
  1.2640 +            // Fails are kind of like a branch, except that the min length was
  1.2641 +            //   propagated already, by the state save.
  1.2642 +            currentLen = forwardedLength.elementAti(loc+1);
  1.2643 +            atStart = FALSE;
  1.2644 +            break;
  1.2645 +
  1.2646 +
  1.2647 +        case URX_STATE_SAVE:
  1.2648 +            {
  1.2649 +                // State Save, for forward jumps, propagate the current minimum.
  1.2650 +                //             of the state save.
  1.2651 +                int32_t  jmpDest = URX_VAL(op);
  1.2652 +                if (jmpDest > loc) {
  1.2653 +                    if (currentLen < forwardedLength.elementAti(jmpDest)) {
  1.2654 +                        forwardedLength.setElementAt(currentLen, jmpDest);
  1.2655 +                    }
  1.2656 +                }
  1.2657 +            }
  1.2658 +            atStart = FALSE;
  1.2659 +            break;
  1.2660 +
  1.2661 +
  1.2662 +
  1.2663 +
  1.2664 +        case URX_STRING:
  1.2665 +            {
  1.2666 +                loc++;
  1.2667 +                int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
  1.2668 +                int32_t stringLen   = URX_VAL(stringLenOp);
  1.2669 +                U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
  1.2670 +                U_ASSERT(stringLenOp >= 2);
  1.2671 +                if (currentLen == 0) {
  1.2672 +                    // Add the starting character of this string to the set of possible starting
  1.2673 +                    //   characters for this pattern.
  1.2674 +                    int32_t stringStartIdx = URX_VAL(op);
  1.2675 +                    UChar32  c = fRXPat->fLiteralText.char32At(stringStartIdx);
  1.2676 +                    fRXPat->fInitialChars->add(c);
  1.2677 +
  1.2678 +                    // Remember this string.  After the entire pattern has been checked,
  1.2679 +                    //  if nothing else is identified that can start a match, we'll use it.
  1.2680 +                    numInitialStrings++;
  1.2681 +                    fRXPat->fInitialStringIdx = stringStartIdx;
  1.2682 +                    fRXPat->fInitialStringLen = stringLen;
  1.2683 +                }
  1.2684 +
  1.2685 +                currentLen += stringLen;
  1.2686 +                atStart = FALSE;
  1.2687 +            }
  1.2688 +            break;
  1.2689 +
  1.2690 +        case URX_STRING_I:
  1.2691 +            {
  1.2692 +                // Case-insensitive string.  Unlike exact-match strings, we won't
  1.2693 +                //   attempt a string search for possible match positions.  But we
  1.2694 +                //   do update the set of possible starting characters.
  1.2695 +                loc++;
  1.2696 +                int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
  1.2697 +                int32_t stringLen   = URX_VAL(stringLenOp);
  1.2698 +                U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
  1.2699 +                U_ASSERT(stringLenOp >= 2);
  1.2700 +                if (currentLen == 0) {
  1.2701 +                    // Add the starting character of this string to the set of possible starting
  1.2702 +                    //   characters for this pattern.
  1.2703 +                    int32_t stringStartIdx = URX_VAL(op);
  1.2704 +                    UChar32  c = fRXPat->fLiteralText.char32At(stringStartIdx);
  1.2705 +                    UnicodeSet s(c, c);
  1.2706 +
  1.2707 +                    // TODO:  compute correct set of starting chars for full case folding.
  1.2708 +                    //        For the moment, say any char can start.
  1.2709 +                    // s.closeOver(USET_CASE_INSENSITIVE);
  1.2710 +                    s.clear();
  1.2711 +                    s.complement();
  1.2712 +
  1.2713 +                    fRXPat->fInitialChars->addAll(s);
  1.2714 +                    numInitialStrings += 2;  // Matching on an initial string not possible.
  1.2715 +                }
  1.2716 +                currentLen += stringLen;
  1.2717 +                atStart = FALSE;
  1.2718 +            }
  1.2719 +            break;
  1.2720 +
  1.2721 +        case URX_CTR_INIT:
  1.2722 +        case URX_CTR_INIT_NG:
  1.2723 +            {
  1.2724 +                // Loop Init Ops.  These don't change the min length, but they are 4 word ops
  1.2725 +                //   so location must be updated accordingly.
  1.2726 +                // Loop Init Ops.
  1.2727 +                //   If the min loop count == 0
  1.2728 +                //      move loc forwards to the end of the loop, skipping over the body.
  1.2729 +                //   If the min count is > 0,
  1.2730 +                //      continue normal processing of the body of the loop.
  1.2731 +                int32_t loopEndLoc   = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1);
  1.2732 +                        loopEndLoc   = URX_VAL(loopEndLoc);
  1.2733 +                int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2);
  1.2734 +                if (minLoopCount == 0) {
  1.2735 +                    // Min Loop Count of 0, treat like a forward branch and
  1.2736 +                    //   move the current minimum length up to the target
  1.2737 +                    //   (end of loop) location.
  1.2738 +                    U_ASSERT(loopEndLoc <= end+1);
  1.2739 +                    if (forwardedLength.elementAti(loopEndLoc) > currentLen) {
  1.2740 +                        forwardedLength.setElementAt(currentLen, loopEndLoc);
  1.2741 +                    }
  1.2742 +                }
  1.2743 +                loc+=3;  // Skips over operands of CTR_INIT
  1.2744 +            }
  1.2745 +            atStart = FALSE;
  1.2746 +            break;
  1.2747 +
  1.2748 +
  1.2749 +        case URX_CTR_LOOP:
  1.2750 +        case URX_CTR_LOOP_NG:
  1.2751 +            // Loop ops.
  1.2752 +            //  The jump is conditional, backwards only.
  1.2753 +            atStart = FALSE;
  1.2754 +            break;
  1.2755 +
  1.2756 +        case URX_LOOP_C:
  1.2757 +            // More loop ops.  These state-save to themselves.
  1.2758 +            //   don't change the minimum match
  1.2759 +            atStart = FALSE;
  1.2760 +            break;
  1.2761 +
  1.2762 +
  1.2763 +        case URX_LA_START:
  1.2764 +        case URX_LB_START:
  1.2765 +            {
  1.2766 +                // Look-around.  Scan forward until the matching look-ahead end,
  1.2767 +                //   without processing the look-around block.  This is overly pessimistic.
  1.2768 +                
  1.2769 +                // Keep track of the nesting depth of look-around blocks.  Boilerplate code for
  1.2770 +                //   lookahead contains two LA_END instructions, so count goes up by two
  1.2771 +                //   for each LA_START.
  1.2772 +                int32_t  depth = (opType == URX_LA_START? 2: 1);
  1.2773 +                for (;;) {
  1.2774 +                    loc++;
  1.2775 +                    op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
  1.2776 +                    if (URX_TYPE(op) == URX_LA_START) {
  1.2777 +                        depth+=2;
  1.2778 +                    }
  1.2779 +                    if (URX_TYPE(op) == URX_LB_START) {
  1.2780 +                        depth++;
  1.2781 +                    }
  1.2782 +                    if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) {
  1.2783 +                        depth--;
  1.2784 +                        if (depth == 0) {
  1.2785 +                            break;
  1.2786 +                        }
  1.2787 +                    }
  1.2788 +                    if (URX_TYPE(op) == URX_STATE_SAVE) {
  1.2789 +                        // Need this because neg lookahead blocks will FAIL to outside
  1.2790 +                        //   of the block.
  1.2791 +                        int32_t  jmpDest = URX_VAL(op);
  1.2792 +                        if (jmpDest > loc) {
  1.2793 +                            if (currentLen < forwardedLength.elementAti(jmpDest)) {
  1.2794 +                                forwardedLength.setElementAt(currentLen, jmpDest);
  1.2795 +                            }
  1.2796 +                        }
  1.2797 +                    }
  1.2798 +                    U_ASSERT(loc <= end);
  1.2799 +                }
  1.2800 +            }
  1.2801 +            break;
  1.2802 +
  1.2803 +        case URX_LA_END:
  1.2804 +        case URX_LB_CONT:
  1.2805 +        case URX_LB_END:
  1.2806 +        case URX_LBN_CONT:
  1.2807 +        case URX_LBN_END:
  1.2808 +            U_ASSERT(FALSE);     // Shouldn't get here.  These ops should be
  1.2809 +                                 //  consumed by the scan in URX_LA_START and LB_START
  1.2810 +
  1.2811 +            break;
  1.2812 +
  1.2813 +        default:
  1.2814 +            U_ASSERT(FALSE);
  1.2815 +            }
  1.2816 +
  1.2817 +        }
  1.2818 +
  1.2819 +
  1.2820 +    // We have finished walking through the ops.  Check whether some forward jump
  1.2821 +    //   propagated a shorter length to location end+1.
  1.2822 +    if (forwardedLength.elementAti(end+1) < currentLen) {
  1.2823 +        currentLen = forwardedLength.elementAti(end+1);
  1.2824 +    }
  1.2825 +
  1.2826 +
  1.2827 +    fRXPat->fInitialChars8->init(fRXPat->fInitialChars);
  1.2828 +
  1.2829 +
  1.2830 +    // Sort out what we should check for when looking for candidate match start positions.
  1.2831 +    // In order of preference,
  1.2832 +    //     1.   Start of input text buffer.
  1.2833 +    //     2.   A literal string.
  1.2834 +    //     3.   Start of line in multi-line mode.
  1.2835 +    //     4.   A single literal character.
  1.2836 +    //     5.   A character from a set of characters.
  1.2837 +    //
  1.2838 +    if (fRXPat->fStartType == START_START) {
  1.2839 +        // Match only at the start of an input text string.
  1.2840 +        //    start type is already set.  We're done.
  1.2841 +    } else if (numInitialStrings == 1 && fRXPat->fMinMatchLen > 0) {
  1.2842 +        // Match beginning only with a literal string.
  1.2843 +        UChar32  c = fRXPat->fLiteralText.char32At(fRXPat->fInitialStringIdx);
  1.2844 +        U_ASSERT(fRXPat->fInitialChars->contains(c));
  1.2845 +        fRXPat->fStartType   = START_STRING;
  1.2846 +        fRXPat->fInitialChar = c;
  1.2847 +    } else if (fRXPat->fStartType == START_LINE) {
  1.2848 +        // Match at start of line in Multi-Line mode.
  1.2849 +        // Nothing to do here; everything is already set.
  1.2850 +    } else if (fRXPat->fMinMatchLen == 0) {
  1.2851 +        // Zero length match possible.  We could start anywhere.
  1.2852 +        fRXPat->fStartType = START_NO_INFO;
  1.2853 +    } else if (fRXPat->fInitialChars->size() == 1) {
  1.2854 +        // All matches begin with the same char.
  1.2855 +        fRXPat->fStartType   = START_CHAR;
  1.2856 +        fRXPat->fInitialChar = fRXPat->fInitialChars->charAt(0);
  1.2857 +        U_ASSERT(fRXPat->fInitialChar != (UChar32)-1);
  1.2858 +    } else if (fRXPat->fInitialChars->contains((UChar32)0, (UChar32)0x10ffff) == FALSE &&
  1.2859 +        fRXPat->fMinMatchLen > 0) {
  1.2860 +        // Matches start with a set of character smaller than the set of all chars.
  1.2861 +        fRXPat->fStartType = START_SET;
  1.2862 +    } else {
  1.2863 +        // Matches can start with anything
  1.2864 +        fRXPat->fStartType = START_NO_INFO;
  1.2865 +    }
  1.2866 +
  1.2867 +    return;
  1.2868 +}
  1.2869 +
  1.2870 +
  1.2871 +
  1.2872 +//------------------------------------------------------------------------------
  1.2873 +//
  1.2874 +//   minMatchLength    Calculate the length of the shortest string that could
  1.2875 +//                     match the specified pattern.
  1.2876 +//                     Length is in 16 bit code units, not code points.
  1.2877 +//
  1.2878 +//                     The calculated length may not be exact.  The returned
  1.2879 +//                     value may be shorter than the actual minimum; it must
  1.2880 +//                     never be longer.
  1.2881 +//
  1.2882 +//                     start and end are the range of p-code operations to be
  1.2883 +//                     examined.  The endpoints are included in the range.
  1.2884 +//
  1.2885 +//------------------------------------------------------------------------------
  1.2886 +int32_t   RegexCompile::minMatchLength(int32_t start, int32_t end) {
  1.2887 +    if (U_FAILURE(*fStatus)) {
  1.2888 +        return 0;
  1.2889 +    }
  1.2890 +
  1.2891 +    U_ASSERT(start <= end);
  1.2892 +    U_ASSERT(end < fRXPat->fCompiledPat->size());
  1.2893 +
  1.2894 +
  1.2895 +    int32_t    loc;
  1.2896 +    int32_t    op;
  1.2897 +    int32_t    opType;
  1.2898 +    int32_t    currentLen = 0;
  1.2899 +
  1.2900 +
  1.2901 +    // forwardedLength is a vector holding minimum-match-length values that
  1.2902 +    //   are propagated forward in the pattern by JMP or STATE_SAVE operations.
  1.2903 +    //   It must be one longer than the pattern being checked because some  ops
  1.2904 +    //   will jmp to a end-of-block+1 location from within a block, and we must
  1.2905 +    //   count those when checking the block.
  1.2906 +    UVector32  forwardedLength(end+2, *fStatus);
  1.2907 +    forwardedLength.setSize(end+2);
  1.2908 +    for (loc=start; loc<=end+1; loc++) {
  1.2909 +        forwardedLength.setElementAt(INT32_MAX, loc);
  1.2910 +    }
  1.2911 +
  1.2912 +    for (loc = start; loc<=end; loc++) {
  1.2913 +        op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
  1.2914 +        opType = URX_TYPE(op);
  1.2915 +
  1.2916 +        // The loop is advancing linearly through the pattern.
  1.2917 +        // If the op we are now at was the destination of a branch in the pattern,
  1.2918 +        // and that path has a shorter minimum length than the current accumulated value,
  1.2919 +        // replace the current accumulated value.
  1.2920 +        // U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);  // MinLength == INT32_MAX for some
  1.2921 +                                                               //   no-match-possible cases.
  1.2922 +        if (forwardedLength.elementAti(loc) < currentLen) {
  1.2923 +            currentLen = forwardedLength.elementAti(loc);
  1.2924 +            U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
  1.2925 +        }
  1.2926 +
  1.2927 +        switch (opType) {
  1.2928 +            // Ops that don't change the total length matched
  1.2929 +        case URX_RESERVED_OP:
  1.2930 +        case URX_END:
  1.2931 +        case URX_STRING_LEN:
  1.2932 +        case URX_NOP:
  1.2933 +        case URX_START_CAPTURE:
  1.2934 +        case URX_END_CAPTURE:
  1.2935 +        case URX_BACKSLASH_B:
  1.2936 +        case URX_BACKSLASH_BU:
  1.2937 +        case URX_BACKSLASH_G:
  1.2938 +        case URX_BACKSLASH_Z:
  1.2939 +        case URX_CARET:
  1.2940 +        case URX_DOLLAR:
  1.2941 +        case URX_DOLLAR_M:
  1.2942 +        case URX_DOLLAR_D:
  1.2943 +        case URX_DOLLAR_MD:
  1.2944 +        case URX_RELOC_OPRND:
  1.2945 +        case URX_STO_INP_LOC:
  1.2946 +        case URX_CARET_M:
  1.2947 +        case URX_CARET_M_UNIX:
  1.2948 +        case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match
  1.2949 +        case URX_BACKREF_I:
  1.2950 +
  1.2951 +        case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match.
  1.2952 +        case URX_LD_SP:
  1.2953 +
  1.2954 +        case URX_JMP_SAV:
  1.2955 +        case URX_JMP_SAV_X:
  1.2956 +            break;
  1.2957 +
  1.2958 +
  1.2959 +            // Ops that match a minimum of one character (one or two 16 bit code units.)
  1.2960 +            //
  1.2961 +        case URX_ONECHAR:
  1.2962 +        case URX_STATIC_SETREF:
  1.2963 +        case URX_STAT_SETREF_N:
  1.2964 +        case URX_SETREF:
  1.2965 +        case URX_BACKSLASH_D:
  1.2966 +        case URX_ONECHAR_I:
  1.2967 +        case URX_BACKSLASH_X:   // Grahpeme Cluster.  Minimum is 1, max unbounded.
  1.2968 +        case URX_DOTANY_ALL:    // . matches one or two.
  1.2969 +        case URX_DOTANY:
  1.2970 +        case URX_DOTANY_UNIX:
  1.2971 +            currentLen++;
  1.2972 +            break;
  1.2973 +
  1.2974 +
  1.2975 +        case URX_JMPX:
  1.2976 +            loc++;              // URX_JMPX has an extra operand, ignored here,
  1.2977 +                                //   otherwise processed identically to URX_JMP.
  1.2978 +        case URX_JMP:
  1.2979 +            {
  1.2980 +                int32_t  jmpDest = URX_VAL(op);
  1.2981 +                if (jmpDest < loc) {
  1.2982 +                    // Loop of some kind.  Can safely ignore, the worst that will happen
  1.2983 +                    //  is that we understate the true minimum length
  1.2984 +                    currentLen = forwardedLength.elementAti(loc+1);
  1.2985 +                } else {
  1.2986 +                    // Forward jump.  Propagate the current min length to the target loc of the jump.
  1.2987 +                    U_ASSERT(jmpDest <= end+1);
  1.2988 +                    if (forwardedLength.elementAti(jmpDest) > currentLen) {
  1.2989 +                        forwardedLength.setElementAt(currentLen, jmpDest);
  1.2990 +                    }
  1.2991 +                }
  1.2992 +            }
  1.2993 +            break;
  1.2994 +
  1.2995 +        case URX_BACKTRACK:
  1.2996 +            {
  1.2997 +                // Back-tracks are kind of like a branch, except that the min length was
  1.2998 +                //   propagated already, by the state save.
  1.2999 +                currentLen = forwardedLength.elementAti(loc+1);
  1.3000 +            }
  1.3001 +            break;
  1.3002 +
  1.3003 +
  1.3004 +        case URX_STATE_SAVE:
  1.3005 +            {
  1.3006 +                // State Save, for forward jumps, propagate the current minimum.
  1.3007 +                //             of the state save.
  1.3008 +                int32_t  jmpDest = URX_VAL(op);
  1.3009 +                if (jmpDest > loc) {
  1.3010 +                    if (currentLen < forwardedLength.elementAti(jmpDest)) {
  1.3011 +                        forwardedLength.setElementAt(currentLen, jmpDest);
  1.3012 +                    }
  1.3013 +                }
  1.3014 +            }
  1.3015 +            break;
  1.3016 +
  1.3017 +
  1.3018 +        case URX_STRING:
  1.3019 +            {
  1.3020 +                loc++;
  1.3021 +                int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
  1.3022 +                currentLen += URX_VAL(stringLenOp);
  1.3023 +            }
  1.3024 +            break;
  1.3025 +
  1.3026 +
  1.3027 +        case URX_STRING_I:
  1.3028 +            {
  1.3029 +                loc++;
  1.3030 +                // TODO: with full case folding, matching input text may be shorter than
  1.3031 +                //       the string we have here.  More smarts could put some bounds on it.
  1.3032 +                //       Assume a min length of one for now.  A min length of zero causes
  1.3033 +                //        optimization failures for a pattern like "string"+
  1.3034 +                // currentLen += URX_VAL(stringLenOp);
  1.3035 +                currentLen += 1;
  1.3036 +            }
  1.3037 +            break;
  1.3038 +
  1.3039 +        case URX_CTR_INIT:
  1.3040 +        case URX_CTR_INIT_NG:
  1.3041 +            {
  1.3042 +                // Loop Init Ops.
  1.3043 +                //   If the min loop count == 0
  1.3044 +                //      move loc forwards to the end of the loop, skipping over the body.
  1.3045 +                //   If the min count is > 0,
  1.3046 +                //      continue normal processing of the body of the loop.
  1.3047 +                int32_t loopEndLoc   = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1);
  1.3048 +                        loopEndLoc   = URX_VAL(loopEndLoc);
  1.3049 +                int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2);
  1.3050 +                if (minLoopCount == 0) {
  1.3051 +                    loc = loopEndLoc;
  1.3052 +                } else {
  1.3053 +                    loc+=3;  // Skips over operands of CTR_INIT
  1.3054 +                }
  1.3055 +            }
  1.3056 +            break;
  1.3057 +
  1.3058 +
  1.3059 +        case URX_CTR_LOOP:
  1.3060 +        case URX_CTR_LOOP_NG:
  1.3061 +            // Loop ops.
  1.3062 +            //  The jump is conditional, backwards only.
  1.3063 +            break;
  1.3064 +
  1.3065 +        case URX_LOOP_SR_I:
  1.3066 +        case URX_LOOP_DOT_I:
  1.3067 +        case URX_LOOP_C:
  1.3068 +            // More loop ops.  These state-save to themselves.
  1.3069 +            //   don't change the minimum match - could match nothing at all.
  1.3070 +            break;
  1.3071 +
  1.3072 +
  1.3073 +        case URX_LA_START:
  1.3074 +        case URX_LB_START:
  1.3075 +            {
  1.3076 +                // Look-around.  Scan forward until the matching look-ahead end,
  1.3077 +                //   without processing the look-around block.  This is overly pessimistic for look-ahead,
  1.3078 +                //   it assumes that the look-ahead match might be zero-length.
  1.3079 +                //   TODO:  Positive lookahead could recursively do the block, then continue
  1.3080 +                //          with the longer of the block or the value coming in.  Ticket 6060
  1.3081 +                int32_t  depth = (opType == URX_LA_START? 2: 1);;
  1.3082 +                for (;;) {
  1.3083 +                    loc++;
  1.3084 +                    op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
  1.3085 +                    if (URX_TYPE(op) == URX_LA_START) {
  1.3086 +                        // The boilerplate for look-ahead includes two LA_END insturctions,
  1.3087 +                        //    Depth will be decremented by each one when it is seen.
  1.3088 +                        depth += 2;
  1.3089 +                    }
  1.3090 +                    if (URX_TYPE(op) == URX_LB_START) {
  1.3091 +                        depth++;
  1.3092 +                    }
  1.3093 +                    if (URX_TYPE(op) == URX_LA_END) {
  1.3094 +                        depth--;
  1.3095 +                        if (depth == 0) {
  1.3096 +                            break;
  1.3097 +                        }
  1.3098 +                    }
  1.3099 +                    if (URX_TYPE(op)==URX_LBN_END) {
  1.3100 +                        depth--;
  1.3101 +                        if (depth == 0) {
  1.3102 +                            break;
  1.3103 +                        }
  1.3104 +                    }
  1.3105 +                    if (URX_TYPE(op) == URX_STATE_SAVE) {
  1.3106 +                        // Need this because neg lookahead blocks will FAIL to outside
  1.3107 +                        //   of the block.
  1.3108 +                        int32_t  jmpDest = URX_VAL(op);
  1.3109 +                        if (jmpDest > loc) {
  1.3110 +                            if (currentLen < forwardedLength.elementAti(jmpDest)) {
  1.3111 +                                forwardedLength.setElementAt(currentLen, jmpDest);
  1.3112 +                            }
  1.3113 +                        }
  1.3114 +                    }
  1.3115 +                    U_ASSERT(loc <= end);
  1.3116 +                }
  1.3117 +            }
  1.3118 +            break;
  1.3119 +
  1.3120 +        case URX_LA_END:
  1.3121 +        case URX_LB_CONT:
  1.3122 +        case URX_LB_END:
  1.3123 +        case URX_LBN_CONT:
  1.3124 +        case URX_LBN_END:
  1.3125 +            // Only come here if the matching URX_LA_START or URX_LB_START was not in the
  1.3126 +            //   range being sized, which happens when measuring size of look-behind blocks.
  1.3127 +            break;
  1.3128 +
  1.3129 +        default:
  1.3130 +            U_ASSERT(FALSE);
  1.3131 +            }
  1.3132 +
  1.3133 +        }
  1.3134 +
  1.3135 +    // We have finished walking through the ops.  Check whether some forward jump
  1.3136 +    //   propagated a shorter length to location end+1.
  1.3137 +    if (forwardedLength.elementAti(end+1) < currentLen) {
  1.3138 +        currentLen = forwardedLength.elementAti(end+1);
  1.3139 +        U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
  1.3140 +    }
  1.3141 +
  1.3142 +    return currentLen;
  1.3143 +}
  1.3144 +
  1.3145 +// Increment with overflow check.
  1.3146 +// val and delta will both be positive.
  1.3147 +
  1.3148 +static int32_t safeIncrement(int32_t val, int32_t delta) {
  1.3149 +    if (INT32_MAX - val > delta) {
  1.3150 +        return val + delta;
  1.3151 +    } else {
  1.3152 +        return INT32_MAX;
  1.3153 +    }
  1.3154 +}
  1.3155 +
  1.3156 +
  1.3157 +//------------------------------------------------------------------------------
  1.3158 +//
  1.3159 +//   maxMatchLength    Calculate the length of the longest string that could
  1.3160 +//                     match the specified pattern.
  1.3161 +//                     Length is in 16 bit code units, not code points.
  1.3162 +//
  1.3163 +//                     The calculated length may not be exact.  The returned
  1.3164 +//                     value may be longer than the actual maximum; it must
  1.3165 +//                     never be shorter.
  1.3166 +//
  1.3167 +//------------------------------------------------------------------------------
  1.3168 +int32_t   RegexCompile::maxMatchLength(int32_t start, int32_t end) {
  1.3169 +    if (U_FAILURE(*fStatus)) {
  1.3170 +        return 0;
  1.3171 +    }
  1.3172 +    U_ASSERT(start <= end);
  1.3173 +    U_ASSERT(end < fRXPat->fCompiledPat->size());
  1.3174 +
  1.3175 +
  1.3176 +    int32_t    loc;
  1.3177 +    int32_t    op;
  1.3178 +    int32_t    opType;
  1.3179 +    int32_t    currentLen = 0;
  1.3180 +    UVector32  forwardedLength(end+1, *fStatus);
  1.3181 +    forwardedLength.setSize(end+1);
  1.3182 +
  1.3183 +    for (loc=start; loc<=end; loc++) {
  1.3184 +        forwardedLength.setElementAt(0, loc);
  1.3185 +    }
  1.3186 +
  1.3187 +    for (loc = start; loc<=end; loc++) {
  1.3188 +        op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
  1.3189 +        opType = URX_TYPE(op);
  1.3190 +
  1.3191 +        // The loop is advancing linearly through the pattern.
  1.3192 +        // If the op we are now at was the destination of a branch in the pattern,
  1.3193 +        // and that path has a longer maximum length than the current accumulated value,
  1.3194 +        // replace the current accumulated value.
  1.3195 +        if (forwardedLength.elementAti(loc) > currentLen) {
  1.3196 +            currentLen = forwardedLength.elementAti(loc);
  1.3197 +        }
  1.3198 +
  1.3199 +        switch (opType) {
  1.3200 +            // Ops that don't change the total length matched
  1.3201 +        case URX_RESERVED_OP:
  1.3202 +        case URX_END:
  1.3203 +        case URX_STRING_LEN:
  1.3204 +        case URX_NOP:
  1.3205 +        case URX_START_CAPTURE:
  1.3206 +        case URX_END_CAPTURE:
  1.3207 +        case URX_BACKSLASH_B:
  1.3208 +        case URX_BACKSLASH_BU:
  1.3209 +        case URX_BACKSLASH_G:
  1.3210 +        case URX_BACKSLASH_Z:
  1.3211 +        case URX_CARET:
  1.3212 +        case URX_DOLLAR:
  1.3213 +        case URX_DOLLAR_M:
  1.3214 +        case URX_DOLLAR_D:
  1.3215 +        case URX_DOLLAR_MD:
  1.3216 +        case URX_RELOC_OPRND:
  1.3217 +        case URX_STO_INP_LOC:
  1.3218 +        case URX_CARET_M:
  1.3219 +        case URX_CARET_M_UNIX:
  1.3220 +
  1.3221 +        case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match.
  1.3222 +        case URX_LD_SP:
  1.3223 +
  1.3224 +        case URX_LB_END:
  1.3225 +        case URX_LB_CONT:
  1.3226 +        case URX_LBN_CONT:
  1.3227 +        case URX_LBN_END:
  1.3228 +            break;
  1.3229 +
  1.3230 +
  1.3231 +            // Ops that increase that cause an unbounded increase in the length
  1.3232 +            //   of a matched string, or that increase it a hard to characterize way.
  1.3233 +            //   Call the max length unbounded, and stop further checking.
  1.3234 +        case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match
  1.3235 +        case URX_BACKREF_I:
  1.3236 +        case URX_BACKSLASH_X:   // Grahpeme Cluster.  Minimum is 1, max unbounded.
  1.3237 +            currentLen = INT32_MAX;
  1.3238 +            break;
  1.3239 +
  1.3240 +
  1.3241 +            // Ops that match a max of one character (possibly two 16 bit code units.)
  1.3242 +            //
  1.3243 +        case URX_STATIC_SETREF:
  1.3244 +        case URX_STAT_SETREF_N:
  1.3245 +        case URX_SETREF:
  1.3246 +        case URX_BACKSLASH_D:
  1.3247 +        case URX_ONECHAR_I:
  1.3248 +        case URX_DOTANY_ALL:
  1.3249 +        case URX_DOTANY:
  1.3250 +        case URX_DOTANY_UNIX:
  1.3251 +            currentLen = safeIncrement(currentLen, 2);
  1.3252 +            break;
  1.3253 +
  1.3254 +            // Single literal character.  Increase current max length by one or two,
  1.3255 +            //       depending on whether the char is in the supplementary range.
  1.3256 +        case URX_ONECHAR:
  1.3257 +            currentLen = safeIncrement(currentLen, 1);
  1.3258 +            if (URX_VAL(op) > 0x10000) {
  1.3259 +                currentLen = safeIncrement(currentLen, 1);
  1.3260 +            }
  1.3261 +            break;
  1.3262 +
  1.3263 +            // Jumps.
  1.3264 +            //
  1.3265 +        case URX_JMP:
  1.3266 +        case URX_JMPX:
  1.3267 +        case URX_JMP_SAV:
  1.3268 +        case URX_JMP_SAV_X:
  1.3269 +            {
  1.3270 +                int32_t  jmpDest = URX_VAL(op);
  1.3271 +                if (jmpDest < loc) {
  1.3272 +                    // Loop of some kind.  Max match length is unbounded.
  1.3273 +                    currentLen = INT32_MAX;
  1.3274 +                } else {
  1.3275 +                    // Forward jump.  Propagate the current min length to the target loc of the jump.
  1.3276 +                    if (forwardedLength.elementAti(jmpDest) < currentLen) {
  1.3277 +                        forwardedLength.setElementAt(currentLen, jmpDest);
  1.3278 +                    }
  1.3279 +                    currentLen = 0;
  1.3280 +                }
  1.3281 +            }
  1.3282 +            break;
  1.3283 +
  1.3284 +        case URX_BACKTRACK:
  1.3285 +            // back-tracks are kind of like a branch, except that the max length was
  1.3286 +            //   propagated already, by the state save.
  1.3287 +            currentLen = forwardedLength.elementAti(loc+1);
  1.3288 +            break;
  1.3289 +
  1.3290 +
  1.3291 +        case URX_STATE_SAVE:
  1.3292 +            {
  1.3293 +                // State Save, for forward jumps, propagate the current minimum.
  1.3294 +                //               of the state save.
  1.3295 +                //             For backwards jumps, they create a loop, maximum
  1.3296 +                //               match length is unbounded.
  1.3297 +                int32_t  jmpDest = URX_VAL(op);
  1.3298 +                if (jmpDest > loc) {
  1.3299 +                    if (currentLen > forwardedLength.elementAti(jmpDest)) {
  1.3300 +                        forwardedLength.setElementAt(currentLen, jmpDest);
  1.3301 +                    }
  1.3302 +                } else {
  1.3303 +                    currentLen = INT32_MAX;
  1.3304 +                }
  1.3305 +            }
  1.3306 +            break;
  1.3307 +
  1.3308 +
  1.3309 +
  1.3310 +
  1.3311 +        case URX_STRING:
  1.3312 +            {
  1.3313 +                loc++;
  1.3314 +                int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
  1.3315 +                currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
  1.3316 +                break;
  1.3317 +            }
  1.3318 +
  1.3319 +        case URX_STRING_I:
  1.3320 +            // TODO:  This code assumes that any user string that matches will be no longer
  1.3321 +            //        than our compiled string, with case insensitive matching.
  1.3322 +            //        Our compiled string has been case-folded already.
  1.3323 +            //
  1.3324 +            //        Any matching user string will have no more code points than our
  1.3325 +            //        compiled (folded) string.  Folding may add code points, but
  1.3326 +            //        not remove them.
  1.3327 +            //
  1.3328 +            //        There is a potential problem if a supplemental code point 
  1.3329 +            //        case-folds to a BMP code point.  In this case our compiled string
  1.3330 +            //        could be shorter (in code units) than a matching user string.
  1.3331 +            //
  1.3332 +            //        At this time (Unicode 6.1) there are no such characters, and this case
  1.3333 +            //        is not being handled.  A test, intltest regex/Bug9283, will fail if
  1.3334 +            //        any problematic characters are added to Unicode.
  1.3335 +            //
  1.3336 +            //        If this happens, we can make a set of the BMP chars that the
  1.3337 +            //        troublesome supplementals fold to, scan our string, and bump the
  1.3338 +            //        currentLen one extra for each that is found.
  1.3339 +            //
  1.3340 +            {
  1.3341 +                loc++;
  1.3342 +                int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
  1.3343 +                currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
  1.3344 +            }
  1.3345 +            break;
  1.3346 +
  1.3347 +        case URX_CTR_INIT:
  1.3348 +        case URX_CTR_INIT_NG:
  1.3349 +            // For Loops, recursively call this function on the pattern for the loop body,
  1.3350 +            //   then multiply the result by the maximum loop count.
  1.3351 +            {
  1.3352 +                int32_t  loopEndLoc = URX_VAL(fRXPat->fCompiledPat->elementAti(loc+1));
  1.3353 +                if (loopEndLoc == loc+4) {
  1.3354 +                    // Loop has an empty body. No affect on max match length.
  1.3355 +                    // Continue processing with code after the loop end.
  1.3356 +                    loc = loopEndLoc;
  1.3357 +                    break;
  1.3358 +                }
  1.3359 +                
  1.3360 +                int32_t maxLoopCount = fRXPat->fCompiledPat->elementAti(loc+3);
  1.3361 +                if (maxLoopCount == -1) {
  1.3362 +                    // Unbounded Loop. No upper bound on match length.
  1.3363 +                    currentLen = INT32_MAX;
  1.3364 +                    break;
  1.3365 +                }
  1.3366 +
  1.3367 +                U_ASSERT(loopEndLoc >= loc+4);
  1.3368 +                int32_t  blockLen = maxMatchLength(loc+4, loopEndLoc-1);  // Recursive call.
  1.3369 +                if (blockLen == INT32_MAX) {
  1.3370 +                    currentLen = blockLen;
  1.3371 +                    break;
  1.3372 +                }
  1.3373 +                currentLen += blockLen * maxLoopCount;
  1.3374 +                loc = loopEndLoc;
  1.3375 +                break;
  1.3376 +            }
  1.3377 +
  1.3378 +        case URX_CTR_LOOP:
  1.3379 +        case URX_CTR_LOOP_NG:
  1.3380 +            // These opcodes will be skipped over by code for URX_CRT_INIT.
  1.3381 +            // We shouldn't encounter them here.
  1.3382 +            U_ASSERT(FALSE);
  1.3383 +            break;
  1.3384 +
  1.3385 +        case URX_LOOP_SR_I:
  1.3386 +        case URX_LOOP_DOT_I:
  1.3387 +        case URX_LOOP_C:
  1.3388 +            // For anything to do with loops, make the match length unbounded.
  1.3389 +            currentLen = INT32_MAX;
  1.3390 +            break;
  1.3391 +
  1.3392 +
  1.3393 +
  1.3394 +        case URX_LA_START:
  1.3395 +        case URX_LA_END:
  1.3396 +            // Look-ahead.  Just ignore, treat the look-ahead block as if
  1.3397 +            // it were normal pattern.  Gives a too-long match length,
  1.3398 +            //  but good enough for now.
  1.3399 +            break;
  1.3400 +
  1.3401 +            // End of look-ahead ops should always be consumed by the processing at
  1.3402 +            //  the URX_LA_START op.
  1.3403 +            // U_ASSERT(FALSE);
  1.3404 +            // break;
  1.3405 +
  1.3406 +        case URX_LB_START:
  1.3407 +            {
  1.3408 +                // Look-behind.  Scan forward until the matching look-around end,
  1.3409 +                //   without processing the look-behind block.
  1.3410 +                int32_t  depth = 0;
  1.3411 +                for (;;) {
  1.3412 +                    loc++;
  1.3413 +                    op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
  1.3414 +                    if (URX_TYPE(op) == URX_LA_START || URX_TYPE(op) == URX_LB_START) {
  1.3415 +                        depth++;
  1.3416 +                    }
  1.3417 +                    if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) {
  1.3418 +                        if (depth == 0) {
  1.3419 +                            break;
  1.3420 +                        }
  1.3421 +                        depth--;
  1.3422 +                    }
  1.3423 +                    U_ASSERT(loc < end);
  1.3424 +                }
  1.3425 +            }
  1.3426 +            break;
  1.3427 +
  1.3428 +        default:
  1.3429 +            U_ASSERT(FALSE);
  1.3430 +        }
  1.3431 +
  1.3432 +
  1.3433 +        if (currentLen == INT32_MAX) {
  1.3434 +            //  The maximum length is unbounded.
  1.3435 +            //  Stop further processing of the pattern.
  1.3436 +            break;
  1.3437 +        }
  1.3438 +
  1.3439 +    }
  1.3440 +    return currentLen;
  1.3441 +
  1.3442 +}
  1.3443 +
  1.3444 +
  1.3445 +//------------------------------------------------------------------------------
  1.3446 +//
  1.3447 +//   stripNOPs    Remove any NOP operations from the compiled pattern code.
  1.3448 +//                Extra NOPs are inserted for some constructs during the initial
  1.3449 +//                code generation to provide locations that may be patched later.
  1.3450 +//                Many end up unneeded, and are removed by this function.
  1.3451 +//
  1.3452 +//                In order to minimize the number of passes through the pattern,
  1.3453 +//                back-reference fixup is also performed here (adjusting
  1.3454 +//                back-reference operands to point to the correct frame offsets).
  1.3455 +//
  1.3456 +//------------------------------------------------------------------------------
  1.3457 +void RegexCompile::stripNOPs() {
  1.3458 +
  1.3459 +    if (U_FAILURE(*fStatus)) {
  1.3460 +        return;
  1.3461 +    }
  1.3462 +
  1.3463 +    int32_t    end = fRXPat->fCompiledPat->size();
  1.3464 +    UVector32  deltas(end, *fStatus);
  1.3465 +
  1.3466 +    // Make a first pass over the code, computing the amount that things
  1.3467 +    //   will be offset at each location in the original code.
  1.3468 +    int32_t   loc;
  1.3469 +    int32_t   d = 0;
  1.3470 +    for (loc=0; loc<end; loc++) {
  1.3471 +        deltas.addElement(d, *fStatus);
  1.3472 +        int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
  1.3473 +        if (URX_TYPE(op) == URX_NOP) {
  1.3474 +            d++;
  1.3475 +        }
  1.3476 +    }
  1.3477 +    
  1.3478 +    UnicodeString caseStringBuffer;
  1.3479 +
  1.3480 +    // Make a second pass over the code, removing the NOPs by moving following
  1.3481 +    //  code up, and patching operands that refer to code locations that
  1.3482 +    //  are being moved.  The array of offsets from the first step is used
  1.3483 +    //  to compute the new operand values.
  1.3484 +    int32_t src;
  1.3485 +    int32_t dst = 0;
  1.3486 +    for (src=0; src<end; src++) {
  1.3487 +        int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(src);
  1.3488 +        int32_t opType = URX_TYPE(op);
  1.3489 +        switch (opType) {
  1.3490 +        case URX_NOP:
  1.3491 +            break;
  1.3492 +
  1.3493 +        case URX_STATE_SAVE:
  1.3494 +        case URX_JMP:
  1.3495 +        case URX_CTR_LOOP:
  1.3496 +        case URX_CTR_LOOP_NG:
  1.3497 +        case URX_RELOC_OPRND:
  1.3498 +        case URX_JMPX:
  1.3499 +        case URX_JMP_SAV:
  1.3500 +        case URX_JMP_SAV_X:
  1.3501 +            // These are instructions with operands that refer to code locations.
  1.3502 +            {
  1.3503 +                int32_t  operandAddress = URX_VAL(op);
  1.3504 +                U_ASSERT(operandAddress>=0 && operandAddress<deltas.size());
  1.3505 +                int32_t fixedOperandAddress = operandAddress - deltas.elementAti(operandAddress);
  1.3506 +                op = URX_BUILD(opType, fixedOperandAddress);
  1.3507 +                fRXPat->fCompiledPat->setElementAt(op, dst);
  1.3508 +                dst++;
  1.3509 +                break;
  1.3510 +            }
  1.3511 +
  1.3512 +        case URX_BACKREF:
  1.3513 +        case URX_BACKREF_I:
  1.3514 +            {
  1.3515 +                int32_t where = URX_VAL(op);
  1.3516 +                if (where > fRXPat->fGroupMap->size()) {
  1.3517 +                    error(U_REGEX_INVALID_BACK_REF);
  1.3518 +                    break;
  1.3519 +                }
  1.3520 +                where = fRXPat->fGroupMap->elementAti(where-1);
  1.3521 +                op    = URX_BUILD(opType, where);
  1.3522 +                fRXPat->fCompiledPat->setElementAt(op, dst);
  1.3523 +                dst++;
  1.3524 +                
  1.3525 +                fRXPat->fNeedsAltInput = TRUE;
  1.3526 +                break;
  1.3527 +            }
  1.3528 +        case URX_RESERVED_OP:
  1.3529 +        case URX_RESERVED_OP_N:
  1.3530 +        case URX_BACKTRACK:
  1.3531 +        case URX_END:
  1.3532 +        case URX_ONECHAR:
  1.3533 +        case URX_STRING:
  1.3534 +        case URX_STRING_LEN:
  1.3535 +        case URX_START_CAPTURE:
  1.3536 +        case URX_END_CAPTURE:
  1.3537 +        case URX_STATIC_SETREF:
  1.3538 +        case URX_STAT_SETREF_N:
  1.3539 +        case URX_SETREF:
  1.3540 +        case URX_DOTANY:
  1.3541 +        case URX_FAIL:
  1.3542 +        case URX_BACKSLASH_B:
  1.3543 +        case URX_BACKSLASH_BU:
  1.3544 +        case URX_BACKSLASH_G:
  1.3545 +        case URX_BACKSLASH_X:
  1.3546 +        case URX_BACKSLASH_Z:
  1.3547 +        case URX_DOTANY_ALL:
  1.3548 +        case URX_BACKSLASH_D:
  1.3549 +        case URX_CARET:
  1.3550 +        case URX_DOLLAR:
  1.3551 +        case URX_CTR_INIT:
  1.3552 +        case URX_CTR_INIT_NG:
  1.3553 +        case URX_DOTANY_UNIX:
  1.3554 +        case URX_STO_SP:
  1.3555 +        case URX_LD_SP:
  1.3556 +        case URX_STO_INP_LOC:
  1.3557 +        case URX_LA_START:
  1.3558 +        case URX_LA_END:
  1.3559 +        case URX_ONECHAR_I:
  1.3560 +        case URX_STRING_I:
  1.3561 +        case URX_DOLLAR_M:
  1.3562 +        case URX_CARET_M:
  1.3563 +        case URX_CARET_M_UNIX:
  1.3564 +        case URX_LB_START:
  1.3565 +        case URX_LB_CONT:
  1.3566 +        case URX_LB_END:
  1.3567 +        case URX_LBN_CONT:
  1.3568 +        case URX_LBN_END:
  1.3569 +        case URX_LOOP_SR_I:
  1.3570 +        case URX_LOOP_DOT_I:
  1.3571 +        case URX_LOOP_C:
  1.3572 +        case URX_DOLLAR_D:
  1.3573 +        case URX_DOLLAR_MD:
  1.3574 +            // These instructions are unaltered by the relocation.
  1.3575 +            fRXPat->fCompiledPat->setElementAt(op, dst);
  1.3576 +            dst++;
  1.3577 +            break;
  1.3578 +
  1.3579 +        default:
  1.3580 +            // Some op is unaccounted for.
  1.3581 +            U_ASSERT(FALSE);
  1.3582 +            error(U_REGEX_INTERNAL_ERROR);
  1.3583 +        }
  1.3584 +    }
  1.3585 +
  1.3586 +    fRXPat->fCompiledPat->setSize(dst);
  1.3587 +}
  1.3588 +
  1.3589 +
  1.3590 +
  1.3591 +
  1.3592 +//------------------------------------------------------------------------------
  1.3593 +//
  1.3594 +//  Error         Report a rule parse error.
  1.3595 +//                Only report it if no previous error has been recorded.
  1.3596 +//
  1.3597 +//------------------------------------------------------------------------------
  1.3598 +void RegexCompile::error(UErrorCode e) {
  1.3599 +    if (U_SUCCESS(*fStatus)) {
  1.3600 +        *fStatus = e;
  1.3601 +        // Hmm. fParseErr (UParseError) line & offset fields are int32_t in public
  1.3602 +        // API (see common/unicode/parseerr.h), while fLineNum and fCharNum are
  1.3603 +        // int64_t. If the values of the latter are out of range for the former,
  1.3604 +        // set them to the appropriate "field not supported" values.
  1.3605 +        if (fLineNum > 0x7FFFFFFF) {
  1.3606 +            fParseErr->line   = 0;
  1.3607 +            fParseErr->offset = -1;
  1.3608 +        } else if (fCharNum > 0x7FFFFFFF) {
  1.3609 +            fParseErr->line   = (int32_t)fLineNum;
  1.3610 +            fParseErr->offset = -1;
  1.3611 +        } else {
  1.3612 +            fParseErr->line   = (int32_t)fLineNum;
  1.3613 +            fParseErr->offset = (int32_t)fCharNum;
  1.3614 +        }
  1.3615 +        
  1.3616 +        UErrorCode status = U_ZERO_ERROR; // throwaway status for extracting context
  1.3617 +
  1.3618 +        // Fill in the context.
  1.3619 +        //   Note: extractBetween() pins supplied indicies to the string bounds.
  1.3620 +        uprv_memset(fParseErr->preContext,  0, sizeof(fParseErr->preContext));
  1.3621 +        uprv_memset(fParseErr->postContext, 0, sizeof(fParseErr->postContext));
  1.3622 +        utext_extract(fRXPat->fPattern, fScanIndex-U_PARSE_CONTEXT_LEN+1, fScanIndex, fParseErr->preContext, U_PARSE_CONTEXT_LEN, &status);
  1.3623 +        utext_extract(fRXPat->fPattern, fScanIndex, fScanIndex+U_PARSE_CONTEXT_LEN-1, fParseErr->postContext, U_PARSE_CONTEXT_LEN, &status);
  1.3624 +    }
  1.3625 +}
  1.3626 +
  1.3627 +
  1.3628 +//
  1.3629 +//  Assorted Unicode character constants.
  1.3630 +//     Numeric because there is no portable way to enter them as literals.
  1.3631 +//     (Think EBCDIC).
  1.3632 +//
  1.3633 +static const UChar      chCR        = 0x0d;      // New lines, for terminating comments.
  1.3634 +static const UChar      chLF        = 0x0a;      // Line Feed
  1.3635 +static const UChar      chPound     = 0x23;      // '#', introduces a comment.
  1.3636 +static const UChar      chDigit0    = 0x30;      // '0'
  1.3637 +static const UChar      chDigit7    = 0x37;      // '9'
  1.3638 +static const UChar      chColon     = 0x3A;      // ':'
  1.3639 +static const UChar      chE         = 0x45;      // 'E'
  1.3640 +static const UChar      chQ         = 0x51;      // 'Q'
  1.3641 +//static const UChar      chN         = 0x4E;      // 'N'
  1.3642 +static const UChar      chP         = 0x50;      // 'P'
  1.3643 +static const UChar      chBackSlash = 0x5c;      // '\'  introduces a char escape
  1.3644 +//static const UChar      chLBracket  = 0x5b;      // '['
  1.3645 +static const UChar      chRBracket  = 0x5d;      // ']'
  1.3646 +static const UChar      chUp        = 0x5e;      // '^'
  1.3647 +static const UChar      chLowerP    = 0x70;
  1.3648 +static const UChar      chLBrace    = 0x7b;      // '{'
  1.3649 +static const UChar      chRBrace    = 0x7d;      // '}'
  1.3650 +static const UChar      chNEL       = 0x85;      //    NEL newline variant
  1.3651 +static const UChar      chLS        = 0x2028;    //    Unicode Line Separator
  1.3652 +
  1.3653 +
  1.3654 +//------------------------------------------------------------------------------
  1.3655 +//
  1.3656 +//  nextCharLL    Low Level Next Char from the regex pattern.
  1.3657 +//                Get a char from the string, keep track of input position
  1.3658 +//                     for error reporting.
  1.3659 +//
  1.3660 +//------------------------------------------------------------------------------
  1.3661 +UChar32  RegexCompile::nextCharLL() {
  1.3662 +    UChar32       ch;
  1.3663 +
  1.3664 +    if (fPeekChar != -1) {
  1.3665 +        ch = fPeekChar;
  1.3666 +        fPeekChar = -1;
  1.3667 +        return ch;
  1.3668 +    }
  1.3669 +    
  1.3670 +    // assume we're already in the right place
  1.3671 +    ch = UTEXT_NEXT32(fRXPat->fPattern);
  1.3672 +    if (ch == U_SENTINEL) {
  1.3673 +        return ch;
  1.3674 +    }
  1.3675 +
  1.3676 +    if (ch == chCR ||
  1.3677 +        ch == chNEL ||
  1.3678 +        ch == chLS   ||
  1.3679 +        (ch == chLF && fLastChar != chCR)) {
  1.3680 +        // Character is starting a new line.  Bump up the line number, and
  1.3681 +        //  reset the column to 0.
  1.3682 +        fLineNum++;
  1.3683 +        fCharNum=0;
  1.3684 +    }
  1.3685 +    else {
  1.3686 +        // Character is not starting a new line.  Except in the case of a
  1.3687 +        //   LF following a CR, increment the column position.
  1.3688 +        if (ch != chLF) {
  1.3689 +            fCharNum++;
  1.3690 +        }
  1.3691 +    }
  1.3692 +    fLastChar = ch;
  1.3693 +    return ch;
  1.3694 +}
  1.3695 +
  1.3696 +//------------------------------------------------------------------------------
  1.3697 +//
  1.3698 +//   peekCharLL    Low Level Character Scanning, sneak a peek at the next
  1.3699 +//                 character without actually getting it.
  1.3700 +//
  1.3701 +//------------------------------------------------------------------------------
  1.3702 +UChar32  RegexCompile::peekCharLL() {
  1.3703 +    if (fPeekChar == -1) {
  1.3704 +        fPeekChar = nextCharLL();
  1.3705 +    }
  1.3706 +    return fPeekChar;
  1.3707 +}
  1.3708 +
  1.3709 +
  1.3710 +//------------------------------------------------------------------------------
  1.3711 +//
  1.3712 +//   nextChar     for pattern scanning.  At this level, we handle stripping
  1.3713 +//                out comments and processing some backslash character escapes.
  1.3714 +//                The rest of the pattern grammar is handled at the next level up.
  1.3715 +//
  1.3716 +//------------------------------------------------------------------------------
  1.3717 +void RegexCompile::nextChar(RegexPatternChar &c) {
  1.3718 +
  1.3719 +    fScanIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
  1.3720 +    c.fChar    = nextCharLL();
  1.3721 +    c.fQuoted  = FALSE;
  1.3722 +
  1.3723 +    if (fQuoteMode) {
  1.3724 +        c.fQuoted = TRUE;
  1.3725 +        if ((c.fChar==chBackSlash && peekCharLL()==chE && ((fModeFlags & UREGEX_LITERAL) == 0)) || 
  1.3726 +            c.fChar == (UChar32)-1) {
  1.3727 +            fQuoteMode = FALSE;  //  Exit quote mode,
  1.3728 +            nextCharLL();        // discard the E
  1.3729 +            nextChar(c);         // recurse to get the real next char
  1.3730 +        }
  1.3731 +    }
  1.3732 +    else if (fInBackslashQuote) {
  1.3733 +        // The current character immediately follows a '\'
  1.3734 +        // Don't check for any further escapes, just return it as-is.
  1.3735 +        // Don't set c.fQuoted, because that would prevent the state machine from
  1.3736 +        //    dispatching on the character.
  1.3737 +        fInBackslashQuote = FALSE;
  1.3738 +    }
  1.3739 +    else
  1.3740 +    {
  1.3741 +        // We are not in a \Q quoted region \E of the source.
  1.3742 +        //
  1.3743 +        if (fModeFlags & UREGEX_COMMENTS) {
  1.3744 +            //
  1.3745 +            // We are in free-spacing and comments mode.
  1.3746 +            //  Scan through any white space and comments, until we
  1.3747 +            //  reach a significant character or the end of inut.
  1.3748 +            for (;;) {
  1.3749 +                if (c.fChar == (UChar32)-1) {
  1.3750 +                    break;     // End of Input
  1.3751 +                }
  1.3752 +                if  (c.fChar == chPound && fEOLComments == TRUE) {
  1.3753 +                    // Start of a comment.  Consume the rest of it, until EOF or a new line
  1.3754 +                    for (;;) {
  1.3755 +                        c.fChar = nextCharLL();
  1.3756 +                        if (c.fChar == (UChar32)-1 ||  // EOF
  1.3757 +                            c.fChar == chCR        ||
  1.3758 +                            c.fChar == chLF        ||
  1.3759 +                            c.fChar == chNEL       ||
  1.3760 +                            c.fChar == chLS)       {
  1.3761 +                            break;
  1.3762 +                        }
  1.3763 +                    }
  1.3764 +                }
  1.3765 +                // TODO:  check what Java & Perl do with non-ASCII white spaces.  Ticket 6061.
  1.3766 +                if (PatternProps::isWhiteSpace(c.fChar) == FALSE) {
  1.3767 +                    break;
  1.3768 +                }
  1.3769 +                c.fChar = nextCharLL();
  1.3770 +            }
  1.3771 +        }
  1.3772 +
  1.3773 +        //
  1.3774 +        //  check for backslash escaped characters.
  1.3775 +        //
  1.3776 +        if (c.fChar == chBackSlash) {
  1.3777 +            int64_t pos = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
  1.3778 +            if (RegexStaticSets::gStaticSets->fUnescapeCharSet.contains(peekCharLL())) {
  1.3779 +                //
  1.3780 +                // A '\' sequence that is handled by ICU's standard unescapeAt function.
  1.3781 +                //   Includes \uxxxx, \n, \r, many others.
  1.3782 +                //   Return the single equivalent character.
  1.3783 +                //
  1.3784 +                nextCharLL();                 // get & discard the peeked char.
  1.3785 +                c.fQuoted = TRUE;
  1.3786 +                
  1.3787 +                if (UTEXT_FULL_TEXT_IN_CHUNK(fRXPat->fPattern, fPatternLength)) {
  1.3788 +                    int32_t endIndex = (int32_t)pos;
  1.3789 +                    c.fChar = u_unescapeAt(uregex_ucstr_unescape_charAt, &endIndex, (int32_t)fPatternLength, (void *)fRXPat->fPattern->chunkContents);
  1.3790 +                    
  1.3791 +                    if (endIndex == pos) {
  1.3792 +                        error(U_REGEX_BAD_ESCAPE_SEQUENCE);
  1.3793 +                    }
  1.3794 +                    fCharNum += endIndex - pos;
  1.3795 +                    UTEXT_SETNATIVEINDEX(fRXPat->fPattern, endIndex);
  1.3796 +                } else {
  1.3797 +                    int32_t offset = 0;
  1.3798 +                    struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(fRXPat->fPattern);
  1.3799 +                    
  1.3800 +                    UTEXT_SETNATIVEINDEX(fRXPat->fPattern, pos);
  1.3801 +                    c.fChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
  1.3802 +
  1.3803 +                    if (offset == 0) {
  1.3804 +                        error(U_REGEX_BAD_ESCAPE_SEQUENCE);
  1.3805 +                    } else if (context.lastOffset == offset) {
  1.3806 +                        UTEXT_PREVIOUS32(fRXPat->fPattern);
  1.3807 +                    } else if (context.lastOffset != offset-1) {
  1.3808 +                        utext_moveIndex32(fRXPat->fPattern, offset - context.lastOffset - 1);
  1.3809 +                    }
  1.3810 +                    fCharNum += offset;
  1.3811 +                }
  1.3812 +            }
  1.3813 +            else if (peekCharLL() == chDigit0) {
  1.3814 +                //  Octal Escape, using Java Regexp Conventions
  1.3815 +                //    which are \0 followed by 1-3 octal digits.
  1.3816 +                //    Different from ICU Unescape handling of Octal, which does not
  1.3817 +                //    require the leading 0.
  1.3818 +                //  Java also has the convention of only consuming 2 octal digits if
  1.3819 +                //    the three digit number would be > 0xff
  1.3820 +                //
  1.3821 +                c.fChar = 0;
  1.3822 +                nextCharLL();    // Consume the initial 0.
  1.3823 +                int index;
  1.3824 +                for (index=0; index<3; index++) {
  1.3825 +                    int32_t ch = peekCharLL();
  1.3826 +                    if (ch<chDigit0 || ch>chDigit7) {
  1.3827 +                        if (index==0) {
  1.3828 +                           // \0 is not followed by any octal digits.
  1.3829 +                           error(U_REGEX_BAD_ESCAPE_SEQUENCE);
  1.3830 +                        }
  1.3831 +                        break;
  1.3832 +                    }
  1.3833 +                    c.fChar <<= 3;
  1.3834 +                    c.fChar += ch&7;
  1.3835 +                    if (c.fChar <= 255) {
  1.3836 +                        nextCharLL();
  1.3837 +                    } else {
  1.3838 +                        // The last digit made the number too big.  Forget we saw it.
  1.3839 +                        c.fChar >>= 3;
  1.3840 +                    }
  1.3841 +                }
  1.3842 +                c.fQuoted = TRUE; 
  1.3843 +            } 
  1.3844 +            else if (peekCharLL() == chQ) {
  1.3845 +                //  "\Q"  enter quote mode, which will continue until "\E"
  1.3846 +                fQuoteMode = TRUE;
  1.3847 +                nextCharLL();       // discard the 'Q'.
  1.3848 +                nextChar(c);        // recurse to get the real next char.
  1.3849 +            }
  1.3850 +            else
  1.3851 +            {
  1.3852 +                // We are in a '\' escape that will be handled by the state table scanner.
  1.3853 +                // Just return the backslash, but remember that the following char is to
  1.3854 +                //  be taken literally.
  1.3855 +                fInBackslashQuote = TRUE;
  1.3856 +            }
  1.3857 +        }
  1.3858 +    }
  1.3859 +
  1.3860 +    // re-enable # to end-of-line comments, in case they were disabled.
  1.3861 +    // They are disabled by the parser upon seeing '(?', but this lasts for
  1.3862 +    //  the fetching of the next character only.
  1.3863 +    fEOLComments = TRUE;
  1.3864 +
  1.3865 +    // putc(c.fChar, stdout);
  1.3866 +}
  1.3867 +
  1.3868 +
  1.3869 +
  1.3870 +//------------------------------------------------------------------------------
  1.3871 +//
  1.3872 +//  scanNamedChar
  1.3873 + //            Get a UChar32 from a \N{UNICODE CHARACTER NAME} in the pattern.
  1.3874 +//
  1.3875 +//             The scan position will be at the 'N'.  On return
  1.3876 +//             the scan position should be just after the '}'
  1.3877 +//
  1.3878 +//             Return the UChar32
  1.3879 +//
  1.3880 +//------------------------------------------------------------------------------
  1.3881 +UChar32  RegexCompile::scanNamedChar() {
  1.3882 +    if (U_FAILURE(*fStatus)) {
  1.3883 +        return 0;
  1.3884 +    }
  1.3885 +
  1.3886 +    nextChar(fC);
  1.3887 +    if (fC.fChar != chLBrace) {
  1.3888 +        error(U_REGEX_PROPERTY_SYNTAX);
  1.3889 +        return 0;
  1.3890 +    }
  1.3891 +    
  1.3892 +    UnicodeString  charName;
  1.3893 +    for (;;) {
  1.3894 +        nextChar(fC);
  1.3895 +        if (fC.fChar == chRBrace) {
  1.3896 +            break;
  1.3897 +        }
  1.3898 +        if (fC.fChar == -1) {
  1.3899 +            error(U_REGEX_PROPERTY_SYNTAX);
  1.3900 +            return 0;
  1.3901 +        }
  1.3902 +        charName.append(fC.fChar);
  1.3903 +    }
  1.3904 +    
  1.3905 +    char name[100];
  1.3906 +    if (!uprv_isInvariantUString(charName.getBuffer(), charName.length()) ||
  1.3907 +         (uint32_t)charName.length()>=sizeof(name)) {
  1.3908 +        // All Unicode character names have only invariant characters.
  1.3909 +        // The API to get a character, given a name, accepts only char *, forcing us to convert,
  1.3910 +        //   which requires this error check
  1.3911 +        error(U_REGEX_PROPERTY_SYNTAX);
  1.3912 +        return 0;
  1.3913 +    }
  1.3914 +    charName.extract(0, charName.length(), name, sizeof(name), US_INV);
  1.3915 +
  1.3916 +    UChar32  theChar = u_charFromName(U_UNICODE_CHAR_NAME, name, fStatus);
  1.3917 +    if (U_FAILURE(*fStatus)) {
  1.3918 +        error(U_REGEX_PROPERTY_SYNTAX);
  1.3919 +    }
  1.3920 +
  1.3921 +    nextChar(fC);      // Continue overall regex pattern processing with char after the '}'
  1.3922 +    return theChar;
  1.3923 +}
  1.3924 +
  1.3925 +//------------------------------------------------------------------------------
  1.3926 +//
  1.3927 +//  scanProp   Construct a UnicodeSet from the text at the current scan
  1.3928 +//             position, which will be of the form \p{whaterver}
  1.3929 +//
  1.3930 +//             The scan position will be at the 'p' or 'P'.  On return
  1.3931 +//             the scan position should be just after the '}'
  1.3932 +//
  1.3933 +//             Return a UnicodeSet, constructed from the \P pattern,
  1.3934 +//             or NULL if the pattern is invalid.
  1.3935 +//
  1.3936 +//------------------------------------------------------------------------------
  1.3937 +UnicodeSet *RegexCompile::scanProp() {
  1.3938 +    UnicodeSet    *uset = NULL;
  1.3939 +
  1.3940 +    if (U_FAILURE(*fStatus)) {
  1.3941 +        return NULL;
  1.3942 +    }
  1.3943 +    U_ASSERT(fC.fChar == chLowerP || fC.fChar == chP);
  1.3944 +    UBool negated = (fC.fChar == chP);
  1.3945 +
  1.3946 +    UnicodeString propertyName;
  1.3947 +    nextChar(fC);
  1.3948 +    if (fC.fChar != chLBrace) {
  1.3949 +        error(U_REGEX_PROPERTY_SYNTAX);
  1.3950 +        return NULL;
  1.3951 +    }
  1.3952 +    for (;;) {
  1.3953 +        nextChar(fC);
  1.3954 +        if (fC.fChar == chRBrace) {
  1.3955 +            break;
  1.3956 +        }
  1.3957 +        if (fC.fChar == -1) {
  1.3958 +            // Hit the end of the input string without finding the closing '}'
  1.3959 +            error(U_REGEX_PROPERTY_SYNTAX);
  1.3960 +            return NULL;
  1.3961 +        }
  1.3962 +        propertyName.append(fC.fChar);
  1.3963 +    }
  1.3964 +    uset = createSetForProperty(propertyName, negated);
  1.3965 +    nextChar(fC);    // Move input scan to position following the closing '}'
  1.3966 +    return uset;
  1.3967 +}
  1.3968 +
  1.3969 +//------------------------------------------------------------------------------
  1.3970 +//
  1.3971 +//  scanPosixProp   Construct a UnicodeSet from the text at the current scan
  1.3972 +//             position, which is expected be of the form [:property expression:]
  1.3973 +//
  1.3974 +//             The scan position will be at the opening ':'.  On return
  1.3975 +//             the scan position must be on the closing ']'
  1.3976 +//
  1.3977 +//             Return a UnicodeSet constructed from the pattern,
  1.3978 +//             or NULL if this is not a valid POSIX-style set expression.
  1.3979 +//             If not a property expression, restore the initial scan position
  1.3980 +//                (to the opening ':')
  1.3981 +//
  1.3982 +//               Note:  the opening '[:' is not sufficient to guarantee that
  1.3983 +//                      this is a [:property:] expression.
  1.3984 +//                      [:'+=,] is a perfectly good ordinary set expression that
  1.3985 +//                              happens to include ':' as one of its characters.
  1.3986 +//
  1.3987 +//------------------------------------------------------------------------------
  1.3988 +UnicodeSet *RegexCompile::scanPosixProp() {
  1.3989 +    UnicodeSet    *uset = NULL;
  1.3990 +
  1.3991 +    if (U_FAILURE(*fStatus)) {
  1.3992 +        return NULL;
  1.3993 +    }
  1.3994 +
  1.3995 +    U_ASSERT(fC.fChar == chColon);
  1.3996 +
  1.3997 +    // Save the scanner state.
  1.3998 +    // TODO:  move this into the scanner, with the state encapsulated in some way.  Ticket 6062
  1.3999 +    int64_t     savedScanIndex        = fScanIndex;
  1.4000 +    int64_t     savedNextIndex        = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
  1.4001 +    UBool       savedQuoteMode        = fQuoteMode;
  1.4002 +    UBool       savedInBackslashQuote = fInBackslashQuote;
  1.4003 +    UBool       savedEOLComments      = fEOLComments;
  1.4004 +    int64_t     savedLineNum          = fLineNum;
  1.4005 +    int64_t     savedCharNum          = fCharNum;
  1.4006 +    UChar32     savedLastChar         = fLastChar;
  1.4007 +    UChar32     savedPeekChar         = fPeekChar;
  1.4008 +    RegexPatternChar savedfC          = fC;
  1.4009 +
  1.4010 +    // Scan for a closing ].   A little tricky because there are some perverse
  1.4011 +    //   edge cases possible.  "[:abc\Qdef:] \E]"  is a valid non-property expression,
  1.4012 +    //   ending on the second closing ]. 
  1.4013 +
  1.4014 +    UnicodeString propName;
  1.4015 +    UBool         negated  = FALSE;
  1.4016 +
  1.4017 +    // Check for and consume the '^' in a negated POSIX property, e.g.  [:^Letter:]
  1.4018 +    nextChar(fC);
  1.4019 +    if (fC.fChar == chUp) {
  1.4020 +       negated = TRUE;
  1.4021 +       nextChar(fC);
  1.4022 +    }
  1.4023 +    
  1.4024 +    // Scan for the closing ":]", collecting the property name along the way.
  1.4025 +    UBool  sawPropSetTerminator = FALSE;
  1.4026 +    for (;;) {
  1.4027 +        propName.append(fC.fChar);
  1.4028 +        nextChar(fC);
  1.4029 +        if (fC.fQuoted || fC.fChar == -1) {
  1.4030 +            // Escaped characters or end of input - either says this isn't a [:Property:]
  1.4031 +            break;
  1.4032 +        }
  1.4033 +        if (fC.fChar == chColon) {
  1.4034 +            nextChar(fC);
  1.4035 +            if (fC.fChar == chRBracket) {
  1.4036 +                sawPropSetTerminator = TRUE;
  1.4037 +            }
  1.4038 +            break;
  1.4039 +        }
  1.4040 +    }
  1.4041 +    
  1.4042 +    if (sawPropSetTerminator) {
  1.4043 +        uset = createSetForProperty(propName, negated);
  1.4044 +    }
  1.4045 +    else
  1.4046 +    {
  1.4047 +        // No closing ":]".
  1.4048 +        //  Restore the original scan position.
  1.4049 +        //  The main scanner will retry the input as a normal set expression,
  1.4050 +        //    not a [:Property:] expression.
  1.4051 +        fScanIndex        = savedScanIndex;
  1.4052 +        fQuoteMode        = savedQuoteMode;
  1.4053 +        fInBackslashQuote = savedInBackslashQuote;
  1.4054 +        fEOLComments      = savedEOLComments;
  1.4055 +        fLineNum          = savedLineNum;
  1.4056 +        fCharNum          = savedCharNum;
  1.4057 +        fLastChar         = savedLastChar;
  1.4058 +        fPeekChar         = savedPeekChar;
  1.4059 +        fC                = savedfC;
  1.4060 +        UTEXT_SETNATIVEINDEX(fRXPat->fPattern, savedNextIndex);
  1.4061 +    }
  1.4062 +    return uset;
  1.4063 +}
  1.4064 +
  1.4065 +static inline void addIdentifierIgnorable(UnicodeSet *set, UErrorCode& ec) {
  1.4066 +    set->add(0, 8).add(0x0e, 0x1b).add(0x7f, 0x9f);
  1.4067 +    addCategory(set, U_GC_CF_MASK, ec);
  1.4068 +}
  1.4069 +
  1.4070 +//
  1.4071 +//  Create a Unicode Set from a Unicode Property expression.
  1.4072 +//     This is common code underlying both \p{...} ane [:...:] expressions.
  1.4073 +//     Includes trying the Java "properties" that aren't supported as
  1.4074 +//     normal ICU UnicodeSet properties 
  1.4075 +//
  1.4076 +static const UChar posSetPrefix[] = {0x5b, 0x5c, 0x70, 0x7b, 0}; // "[\p{"
  1.4077 +static const UChar negSetPrefix[] = {0x5b, 0x5c, 0x50, 0x7b, 0}; // "[\P{"
  1.4078 +UnicodeSet *RegexCompile::createSetForProperty(const UnicodeString &propName, UBool negated) {
  1.4079 +    UnicodeString   setExpr;
  1.4080 +    UnicodeSet      *set;
  1.4081 +    uint32_t        usetFlags = 0;
  1.4082 +    
  1.4083 +    if (U_FAILURE(*fStatus)) {
  1.4084 +        return NULL;
  1.4085 +    }
  1.4086 +
  1.4087 +    //
  1.4088 +    //  First try the property as we received it
  1.4089 +    //
  1.4090 +    if (negated) {
  1.4091 +        setExpr.append(negSetPrefix, -1);
  1.4092 +    } else {
  1.4093 +        setExpr.append(posSetPrefix, -1);
  1.4094 +    }
  1.4095 +    setExpr.append(propName);
  1.4096 +    setExpr.append(chRBrace);
  1.4097 +    setExpr.append(chRBracket);
  1.4098 +    if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
  1.4099 +        usetFlags |= USET_CASE_INSENSITIVE;
  1.4100 +    }
  1.4101 +    set = new UnicodeSet(setExpr, usetFlags, NULL, *fStatus);
  1.4102 +    if (U_SUCCESS(*fStatus)) {
  1.4103 +       return set;
  1.4104 +    }
  1.4105 +    delete set;
  1.4106 +    set = NULL;
  1.4107 +    
  1.4108 +    //
  1.4109 +    //  The property as it was didn't work.
  1.4110 +
  1.4111 +    //  Do [:word:]. It is not recognized as a property by UnicodeSet.  "word" not standard POSIX 
  1.4112 +    //     or standard Java, but many other regular expression packages do recognize it.
  1.4113 +    
  1.4114 +    if (propName.caseCompare(UNICODE_STRING_SIMPLE("word"), 0) == 0) {
  1.4115 +        *fStatus = U_ZERO_ERROR;
  1.4116 +        set = new UnicodeSet(*(fRXPat->fStaticSets[URX_ISWORD_SET]));
  1.4117 +        if (set == NULL) {
  1.4118 +            *fStatus = U_MEMORY_ALLOCATION_ERROR;
  1.4119 +            return set;
  1.4120 +        }
  1.4121 +        if (negated) {
  1.4122 +            set->complement();
  1.4123 +        }
  1.4124 +        return set;
  1.4125 +    }
  1.4126 +
  1.4127 +
  1.4128 +    //    Do Java fixes -
  1.4129 +    //       InGreek -> InGreek or Coptic, that being the official Unicode name for that block.
  1.4130 +    //       InCombiningMarksforSymbols -> InCombiningDiacriticalMarksforSymbols.
  1.4131 +    //
  1.4132 +    //       Note on Spaces:  either "InCombiningMarksForSymbols" or "InCombining Marks for Symbols"
  1.4133 +    //                        is accepted by Java.  The property part of the name is compared 
  1.4134 +    //                        case-insenstively.  The spaces must be exactly as shown, either
  1.4135 +    //                        all there, or all omitted, with exactly one at each position
  1.4136 +    //                        if they are present.  From checking against JDK 1.6
  1.4137 +    //
  1.4138 +    //       This code should be removed when ICU properties support the Java  compatibility names
  1.4139 +    //          (ICU 4.0?)
  1.4140 +    //
  1.4141 +    UnicodeString mPropName = propName;
  1.4142 +    if (mPropName.caseCompare(UNICODE_STRING_SIMPLE("InGreek"), 0) == 0) {
  1.4143 +        mPropName = UNICODE_STRING_SIMPLE("InGreek and Coptic");
  1.4144 +    }
  1.4145 +    if (mPropName.caseCompare(UNICODE_STRING_SIMPLE("InCombining Marks for Symbols"), 0) == 0 ||
  1.4146 +        mPropName.caseCompare(UNICODE_STRING_SIMPLE("InCombiningMarksforSymbols"), 0) == 0) {
  1.4147 +        mPropName = UNICODE_STRING_SIMPLE("InCombining Diacritical Marks for Symbols");
  1.4148 +    }
  1.4149 +    else if (mPropName.compare(UNICODE_STRING_SIMPLE("all")) == 0) {
  1.4150 +        mPropName = UNICODE_STRING_SIMPLE("javaValidCodePoint");
  1.4151 +    }
  1.4152 +    
  1.4153 +    //    See if the property looks like a Java "InBlockName", which
  1.4154 +    //    we will recast as "Block=BlockName"
  1.4155 +    //
  1.4156 +    static const UChar IN[] = {0x49, 0x6E, 0};  // "In"
  1.4157 +    static const UChar BLOCK[] = {0x42, 0x6C, 0x6f, 0x63, 0x6b, 0x3d, 00};  // "Block="
  1.4158 +    if (mPropName.startsWith(IN, 2) && propName.length()>=3) {
  1.4159 +        setExpr.truncate(4);   // Leaves "[\p{", or "[\P{"
  1.4160 +        setExpr.append(BLOCK, -1);
  1.4161 +        setExpr.append(UnicodeString(mPropName, 2));  // Property with the leading "In" removed.
  1.4162 +        setExpr.append(chRBrace);
  1.4163 +        setExpr.append(chRBracket);
  1.4164 +        *fStatus = U_ZERO_ERROR;
  1.4165 +        set = new UnicodeSet(setExpr, usetFlags, NULL, *fStatus);
  1.4166 +        if (U_SUCCESS(*fStatus)) {
  1.4167 +            return set;
  1.4168 +        }
  1.4169 +        delete set;
  1.4170 +        set = NULL;
  1.4171 +    }
  1.4172 +
  1.4173 +    if (propName.startsWith(UNICODE_STRING_SIMPLE("java")) ||
  1.4174 +        propName.compare(UNICODE_STRING_SIMPLE("all")) == 0)
  1.4175 +    {
  1.4176 +        UErrorCode localStatus = U_ZERO_ERROR;
  1.4177 +        //setExpr.remove();
  1.4178 +        set = new UnicodeSet();
  1.4179 +        //
  1.4180 +        //  Try the various Java specific properties.
  1.4181 +        //   These all begin with "java"
  1.4182 +        //
  1.4183 +        if (mPropName.compare(UNICODE_STRING_SIMPLE("javaDefined")) == 0) {
  1.4184 +            addCategory(set, U_GC_CN_MASK, localStatus);
  1.4185 +            set->complement();
  1.4186 +        }
  1.4187 +        else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaDigit")) == 0) {
  1.4188 +            addCategory(set, U_GC_ND_MASK, localStatus);
  1.4189 +        }
  1.4190 +        else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaIdentifierIgnorable")) == 0) {
  1.4191 +            addIdentifierIgnorable(set, localStatus);
  1.4192 +        }
  1.4193 +        else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaISOControl")) == 0) {
  1.4194 +            set->add(0, 0x1F).add(0x7F, 0x9F);
  1.4195 +        }
  1.4196 +        else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaJavaIdentifierPart")) == 0) {
  1.4197 +            addCategory(set, U_GC_L_MASK, localStatus);
  1.4198 +            addCategory(set, U_GC_SC_MASK, localStatus);
  1.4199 +            addCategory(set, U_GC_PC_MASK, localStatus);
  1.4200 +            addCategory(set, U_GC_ND_MASK, localStatus);
  1.4201 +            addCategory(set, U_GC_NL_MASK, localStatus);
  1.4202 +            addCategory(set, U_GC_MC_MASK, localStatus);
  1.4203 +            addCategory(set, U_GC_MN_MASK, localStatus);
  1.4204 +            addIdentifierIgnorable(set, localStatus);
  1.4205 +        }
  1.4206 +        else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaJavaIdentifierStart")) == 0) {
  1.4207 +            addCategory(set, U_GC_L_MASK, localStatus);
  1.4208 +            addCategory(set, U_GC_NL_MASK, localStatus);
  1.4209 +            addCategory(set, U_GC_SC_MASK, localStatus);
  1.4210 +            addCategory(set, U_GC_PC_MASK, localStatus);
  1.4211 +        }
  1.4212 +        else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLetter")) == 0) {
  1.4213 +            addCategory(set, U_GC_L_MASK, localStatus);
  1.4214 +        }
  1.4215 +        else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLetterOrDigit")) == 0) {
  1.4216 +            addCategory(set, U_GC_L_MASK, localStatus);
  1.4217 +            addCategory(set, U_GC_ND_MASK, localStatus);
  1.4218 +        }
  1.4219 +        else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLowerCase")) == 0) {
  1.4220 +            addCategory(set, U_GC_LL_MASK, localStatus);
  1.4221 +        }
  1.4222 +        else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaMirrored")) == 0) {
  1.4223 +            set->applyIntPropertyValue(UCHAR_BIDI_MIRRORED, 1, localStatus);
  1.4224 +        }
  1.4225 +        else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaSpaceChar")) == 0) {
  1.4226 +            addCategory(set, U_GC_Z_MASK, localStatus);
  1.4227 +        }
  1.4228 +        else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaSupplementaryCodePoint")) == 0) {
  1.4229 +            set->add(0x10000, UnicodeSet::MAX_VALUE);
  1.4230 +        }
  1.4231 +        else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaTitleCase")) == 0) {
  1.4232 +            addCategory(set, U_GC_LT_MASK, localStatus);
  1.4233 +        }
  1.4234 +        else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUnicodeIdentifierStart")) == 0) {
  1.4235 +            addCategory(set, U_GC_L_MASK, localStatus);
  1.4236 +            addCategory(set, U_GC_NL_MASK, localStatus);
  1.4237 +        }
  1.4238 +        else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUnicodeIdentifierPart")) == 0) {
  1.4239 +            addCategory(set, U_GC_L_MASK, localStatus);
  1.4240 +            addCategory(set, U_GC_PC_MASK, localStatus);
  1.4241 +            addCategory(set, U_GC_ND_MASK, localStatus);
  1.4242 +            addCategory(set, U_GC_NL_MASK, localStatus);
  1.4243 +            addCategory(set, U_GC_MC_MASK, localStatus);
  1.4244 +            addCategory(set, U_GC_MN_MASK, localStatus);
  1.4245 +            addIdentifierIgnorable(set, localStatus);
  1.4246 +        }
  1.4247 +        else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUpperCase")) == 0) {
  1.4248 +            addCategory(set, U_GC_LU_MASK, localStatus);
  1.4249 +        }
  1.4250 +        else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaValidCodePoint")) == 0) {
  1.4251 +            set->add(0, UnicodeSet::MAX_VALUE);
  1.4252 +        }
  1.4253 +        else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaWhitespace")) == 0) {
  1.4254 +            addCategory(set, U_GC_Z_MASK, localStatus);
  1.4255 +            set->removeAll(UnicodeSet().add(0xa0).add(0x2007).add(0x202f));
  1.4256 +            set->add(9, 0x0d).add(0x1c, 0x1f);
  1.4257 +        }
  1.4258 +        else if (mPropName.compare(UNICODE_STRING_SIMPLE("all")) == 0) {
  1.4259 +            set->add(0, UnicodeSet::MAX_VALUE);
  1.4260 +        }
  1.4261 +
  1.4262 +        if (U_SUCCESS(localStatus) && !set->isEmpty()) {
  1.4263 +            *fStatus = U_ZERO_ERROR;
  1.4264 +            if (usetFlags & USET_CASE_INSENSITIVE) {
  1.4265 +                set->closeOver(USET_CASE_INSENSITIVE);
  1.4266 +            }
  1.4267 +            if (negated) {
  1.4268 +                set->complement();
  1.4269 +            }
  1.4270 +            return set;
  1.4271 +        }
  1.4272 +        delete set;
  1.4273 +        set = NULL;
  1.4274 +    }
  1.4275 +    error(*fStatus);
  1.4276 +    return NULL; 
  1.4277 +}
  1.4278 +
  1.4279 +
  1.4280 +
  1.4281 +//
  1.4282 +//  SetEval   Part of the evaluation of [set expressions].
  1.4283 +//            Perform any pending (stacked) operations with precedence
  1.4284 +//            equal or greater to that of the next operator encountered
  1.4285 +//            in the expression.
  1.4286 +//
  1.4287 +void RegexCompile::setEval(int32_t nextOp) {
  1.4288 +    UnicodeSet *rightOperand = NULL;
  1.4289 +    UnicodeSet *leftOperand  = NULL;
  1.4290 +    for (;;) {
  1.4291 +        U_ASSERT(fSetOpStack.empty()==FALSE);
  1.4292 +        int32_t pendingSetOperation = fSetOpStack.peeki();
  1.4293 +        if ((pendingSetOperation&0xffff0000) < (nextOp&0xffff0000)) {
  1.4294 +            break;
  1.4295 +        }
  1.4296 +        fSetOpStack.popi();
  1.4297 +        U_ASSERT(fSetStack.empty() == FALSE);
  1.4298 +        rightOperand = (UnicodeSet *)fSetStack.peek();
  1.4299 +        switch (pendingSetOperation) {
  1.4300 +            case setNegation:
  1.4301 +                rightOperand->complement();
  1.4302 +                break;
  1.4303 +            case setCaseClose:
  1.4304 +                // TODO: need a simple close function.  Ticket 6065
  1.4305 +                rightOperand->closeOver(USET_CASE_INSENSITIVE);
  1.4306 +                rightOperand->removeAllStrings();
  1.4307 +                break;
  1.4308 +            case setDifference1:
  1.4309 +            case setDifference2:
  1.4310 +                fSetStack.pop();
  1.4311 +                leftOperand = (UnicodeSet *)fSetStack.peek();
  1.4312 +                leftOperand->removeAll(*rightOperand);
  1.4313 +                delete rightOperand;
  1.4314 +                break;
  1.4315 +            case setIntersection1:
  1.4316 +            case setIntersection2:
  1.4317 +                fSetStack.pop();
  1.4318 +                leftOperand = (UnicodeSet *)fSetStack.peek();
  1.4319 +                leftOperand->retainAll(*rightOperand);
  1.4320 +                delete rightOperand;
  1.4321 +                break;
  1.4322 +            case setUnion:
  1.4323 +                fSetStack.pop();
  1.4324 +                leftOperand = (UnicodeSet *)fSetStack.peek();
  1.4325 +                leftOperand->addAll(*rightOperand);
  1.4326 +                delete rightOperand;
  1.4327 +                break;
  1.4328 +            default:
  1.4329 +                U_ASSERT(FALSE);
  1.4330 +                break;
  1.4331 +            }
  1.4332 +        }
  1.4333 +    }
  1.4334 +
  1.4335 +void RegexCompile::setPushOp(int32_t op) {
  1.4336 +    setEval(op);
  1.4337 +    fSetOpStack.push(op, *fStatus);
  1.4338 +    fSetStack.push(new UnicodeSet(), *fStatus);
  1.4339 +}
  1.4340 +
  1.4341 +U_NAMESPACE_END
  1.4342 +#endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS
  1.4343 +

mercurial