1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/i18n/regexcmp.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,4340 @@ 1.4 +// 1.5 +// file: regexcmp.cpp 1.6 +// 1.7 +// Copyright (C) 2002-2013 International Business Machines Corporation and others. 1.8 +// All Rights Reserved. 1.9 +// 1.10 +// This file contains the ICU regular expression compiler, which is responsible 1.11 +// for processing a regular expression pattern into the compiled form that 1.12 +// is used by the match finding engine. 1.13 +// 1.14 + 1.15 +#include "unicode/utypes.h" 1.16 + 1.17 +#if !UCONFIG_NO_REGULAR_EXPRESSIONS 1.18 + 1.19 +#include "unicode/ustring.h" 1.20 +#include "unicode/unistr.h" 1.21 +#include "unicode/uniset.h" 1.22 +#include "unicode/uchar.h" 1.23 +#include "unicode/uchriter.h" 1.24 +#include "unicode/parsepos.h" 1.25 +#include "unicode/parseerr.h" 1.26 +#include "unicode/regex.h" 1.27 +#include "unicode/utf.h" 1.28 +#include "unicode/utf16.h" 1.29 +#include "patternprops.h" 1.30 +#include "putilimp.h" 1.31 +#include "cmemory.h" 1.32 +#include "cstring.h" 1.33 +#include "uvectr32.h" 1.34 +#include "uvectr64.h" 1.35 +#include "uassert.h" 1.36 +#include "ucln_in.h" 1.37 +#include "uinvchar.h" 1.38 + 1.39 +#include "regeximp.h" 1.40 +#include "regexcst.h" // Contains state table for the regex pattern parser. 1.41 + // generated by a Perl script. 1.42 +#include "regexcmp.h" 1.43 +#include "regexst.h" 1.44 +#include "regextxt.h" 1.45 + 1.46 + 1.47 + 1.48 +U_NAMESPACE_BEGIN 1.49 + 1.50 + 1.51 +//------------------------------------------------------------------------------ 1.52 +// 1.53 +// Constructor. 1.54 +// 1.55 +//------------------------------------------------------------------------------ 1.56 +RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) : 1.57 + fParenStack(status), fSetStack(status), fSetOpStack(status) 1.58 +{ 1.59 + // Lazy init of all shared global sets (needed for init()'s empty text) 1.60 + RegexStaticSets::initGlobals(&status); 1.61 + 1.62 + fStatus = &status; 1.63 + 1.64 + fRXPat = rxp; 1.65 + fScanIndex = 0; 1.66 + fLastChar = -1; 1.67 + fPeekChar = -1; 1.68 + fLineNum = 1; 1.69 + fCharNum = 0; 1.70 + fQuoteMode = FALSE; 1.71 + fInBackslashQuote = FALSE; 1.72 + fModeFlags = fRXPat->fFlags | 0x80000000; 1.73 + fEOLComments = TRUE; 1.74 + 1.75 + fMatchOpenParen = -1; 1.76 + fMatchCloseParen = -1; 1.77 + 1.78 + if (U_SUCCESS(status) && U_FAILURE(rxp->fDeferredStatus)) { 1.79 + status = rxp->fDeferredStatus; 1.80 + } 1.81 +} 1.82 + 1.83 +static const UChar chAmp = 0x26; // '&' 1.84 +static const UChar chDash = 0x2d; // '-' 1.85 + 1.86 + 1.87 +//------------------------------------------------------------------------------ 1.88 +// 1.89 +// Destructor 1.90 +// 1.91 +//------------------------------------------------------------------------------ 1.92 +RegexCompile::~RegexCompile() { 1.93 +} 1.94 + 1.95 +static inline void addCategory(UnicodeSet *set, int32_t value, UErrorCode& ec) { 1.96 + set->addAll(UnicodeSet().applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, value, ec)); 1.97 +} 1.98 + 1.99 +//------------------------------------------------------------------------------ 1.100 +// 1.101 +// Compile regex pattern. The state machine for rexexp pattern parsing is here. 1.102 +// The state tables are hand-written in the file regexcst.txt, 1.103 +// and converted to the form used here by a perl 1.104 +// script regexcst.pl 1.105 +// 1.106 +//------------------------------------------------------------------------------ 1.107 +void RegexCompile::compile( 1.108 + const UnicodeString &pat, // Source pat to be compiled. 1.109 + UParseError &pp, // Error position info 1.110 + UErrorCode &e) // Error Code 1.111 +{ 1.112 + fRXPat->fPatternString = new UnicodeString(pat); 1.113 + UText patternText = UTEXT_INITIALIZER; 1.114 + utext_openConstUnicodeString(&patternText, fRXPat->fPatternString, &e); 1.115 + 1.116 + if (U_SUCCESS(e)) { 1.117 + compile(&patternText, pp, e); 1.118 + utext_close(&patternText); 1.119 + } 1.120 +} 1.121 + 1.122 +// 1.123 +// compile, UText mode 1.124 +// All the work is actually done here. 1.125 +// 1.126 +void RegexCompile::compile( 1.127 + UText *pat, // Source pat to be compiled. 1.128 + UParseError &pp, // Error position info 1.129 + UErrorCode &e) // Error Code 1.130 +{ 1.131 + fStatus = &e; 1.132 + fParseErr = &pp; 1.133 + fStackPtr = 0; 1.134 + fStack[fStackPtr] = 0; 1.135 + 1.136 + if (U_FAILURE(*fStatus)) { 1.137 + return; 1.138 + } 1.139 + 1.140 + // There should be no pattern stuff in the RegexPattern object. They can not be reused. 1.141 + U_ASSERT(fRXPat->fPattern == NULL || utext_nativeLength(fRXPat->fPattern) == 0); 1.142 + 1.143 + // Prepare the RegexPattern object to receive the compiled pattern. 1.144 + fRXPat->fPattern = utext_clone(fRXPat->fPattern, pat, FALSE, TRUE, fStatus); 1.145 + fRXPat->fStaticSets = RegexStaticSets::gStaticSets->fPropSets; 1.146 + fRXPat->fStaticSets8 = RegexStaticSets::gStaticSets->fPropSets8; 1.147 + 1.148 + 1.149 + // Initialize the pattern scanning state machine 1.150 + fPatternLength = utext_nativeLength(pat); 1.151 + uint16_t state = 1; 1.152 + const RegexTableEl *tableEl; 1.153 + 1.154 + // UREGEX_LITERAL force entire pattern to be treated as a literal string. 1.155 + if (fModeFlags & UREGEX_LITERAL) { 1.156 + fQuoteMode = TRUE; 1.157 + } 1.158 + 1.159 + nextChar(fC); // Fetch the first char from the pattern string. 1.160 + 1.161 + // 1.162 + // Main loop for the regex pattern parsing state machine. 1.163 + // Runs once per state transition. 1.164 + // Each time through optionally performs, depending on the state table, 1.165 + // - an advance to the the next pattern char 1.166 + // - an action to be performed. 1.167 + // - pushing or popping a state to/from the local state return stack. 1.168 + // file regexcst.txt is the source for the state table. The logic behind 1.169 + // recongizing the pattern syntax is there, not here. 1.170 + // 1.171 + for (;;) { 1.172 + // Bail out if anything has gone wrong. 1.173 + // Regex pattern parsing stops on the first error encountered. 1.174 + if (U_FAILURE(*fStatus)) { 1.175 + break; 1.176 + } 1.177 + 1.178 + U_ASSERT(state != 0); 1.179 + 1.180 + // Find the state table element that matches the input char from the pattern, or the 1.181 + // class of the input character. Start with the first table row for this 1.182 + // state, then linearly scan forward until we find a row that matches the 1.183 + // character. The last row for each state always matches all characters, so 1.184 + // the search will stop there, if not before. 1.185 + // 1.186 + tableEl = &gRuleParseStateTable[state]; 1.187 + REGEX_SCAN_DEBUG_PRINTF(("char, line, col = (\'%c\', %d, %d) state=%s ", 1.188 + fC.fChar, fLineNum, fCharNum, RegexStateNames[state])); 1.189 + 1.190 + for (;;) { // loop through table rows belonging to this state, looking for one 1.191 + // that matches the current input char. 1.192 + REGEX_SCAN_DEBUG_PRINTF((".")); 1.193 + if (tableEl->fCharClass < 127 && fC.fQuoted == FALSE && tableEl->fCharClass == fC.fChar) { 1.194 + // Table row specified an individual character, not a set, and 1.195 + // the input character is not quoted, and 1.196 + // the input character matched it. 1.197 + break; 1.198 + } 1.199 + if (tableEl->fCharClass == 255) { 1.200 + // Table row specified default, match anything character class. 1.201 + break; 1.202 + } 1.203 + if (tableEl->fCharClass == 254 && fC.fQuoted) { 1.204 + // Table row specified "quoted" and the char was quoted. 1.205 + break; 1.206 + } 1.207 + if (tableEl->fCharClass == 253 && fC.fChar == (UChar32)-1) { 1.208 + // Table row specified eof and we hit eof on the input. 1.209 + break; 1.210 + } 1.211 + 1.212 + if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 && // Table specs a char class && 1.213 + fC.fQuoted == FALSE && // char is not escaped && 1.214 + fC.fChar != (UChar32)-1) { // char is not EOF 1.215 + U_ASSERT(tableEl->fCharClass <= 137); 1.216 + if (RegexStaticSets::gStaticSets->fRuleSets[tableEl->fCharClass-128].contains(fC.fChar)) { 1.217 + // Table row specified a character class, or set of characters, 1.218 + // and the current char matches it. 1.219 + break; 1.220 + } 1.221 + } 1.222 + 1.223 + // No match on this row, advance to the next row for this state, 1.224 + tableEl++; 1.225 + } 1.226 + REGEX_SCAN_DEBUG_PRINTF(("\n")); 1.227 + 1.228 + // 1.229 + // We've found the row of the state table that matches the current input 1.230 + // character from the rules string. 1.231 + // Perform any action specified by this row in the state table. 1.232 + if (doParseActions(tableEl->fAction) == FALSE) { 1.233 + // Break out of the state machine loop if the 1.234 + // the action signalled some kind of error, or 1.235 + // the action was to exit, occurs on normal end-of-rules-input. 1.236 + break; 1.237 + } 1.238 + 1.239 + if (tableEl->fPushState != 0) { 1.240 + fStackPtr++; 1.241 + if (fStackPtr >= kStackSize) { 1.242 + error(U_REGEX_INTERNAL_ERROR); 1.243 + REGEX_SCAN_DEBUG_PRINTF(("RegexCompile::parse() - state stack overflow.\n")); 1.244 + fStackPtr--; 1.245 + } 1.246 + fStack[fStackPtr] = tableEl->fPushState; 1.247 + } 1.248 + 1.249 + // 1.250 + // NextChar. This is where characters are actually fetched from the pattern. 1.251 + // Happens under control of the 'n' tag in the state table. 1.252 + // 1.253 + if (tableEl->fNextChar) { 1.254 + nextChar(fC); 1.255 + } 1.256 + 1.257 + // Get the next state from the table entry, or from the 1.258 + // state stack if the next state was specified as "pop". 1.259 + if (tableEl->fNextState != 255) { 1.260 + state = tableEl->fNextState; 1.261 + } else { 1.262 + state = fStack[fStackPtr]; 1.263 + fStackPtr--; 1.264 + if (fStackPtr < 0) { 1.265 + // state stack underflow 1.266 + // This will occur if the user pattern has mis-matched parentheses, 1.267 + // with extra close parens. 1.268 + // 1.269 + fStackPtr++; 1.270 + error(U_REGEX_MISMATCHED_PAREN); 1.271 + } 1.272 + } 1.273 + 1.274 + } 1.275 + 1.276 + if (U_FAILURE(*fStatus)) { 1.277 + // Bail out if the pattern had errors. 1.278 + // Set stack cleanup: a successful compile would have left it empty, 1.279 + // but errors can leave temporary sets hanging around. 1.280 + while (!fSetStack.empty()) { 1.281 + delete (UnicodeSet *)fSetStack.pop(); 1.282 + } 1.283 + return; 1.284 + } 1.285 + 1.286 + // 1.287 + // The pattern has now been read and processed, and the compiled code generated. 1.288 + // 1.289 + 1.290 + // 1.291 + // Compute the number of digits requried for the largest capture group number. 1.292 + // 1.293 + fRXPat->fMaxCaptureDigits = 1; 1.294 + int32_t n = 10; 1.295 + int32_t groupCount = fRXPat->fGroupMap->size(); 1.296 + while (n <= groupCount) { 1.297 + fRXPat->fMaxCaptureDigits++; 1.298 + n *= 10; 1.299 + } 1.300 + 1.301 + // 1.302 + // The pattern's fFrameSize so far has accumulated the requirements for 1.303 + // storage for capture parentheses, counters, etc. that are encountered 1.304 + // in the pattern. Add space for the two variables that are always 1.305 + // present in the saved state: the input string position (int64_t) and 1.306 + // the position in the compiled pattern. 1.307 + // 1.308 + fRXPat->fFrameSize+=RESTACKFRAME_HDRCOUNT; 1.309 + 1.310 + // 1.311 + // Optimization pass 1: NOPs, back-references, and case-folding 1.312 + // 1.313 + stripNOPs(); 1.314 + 1.315 + // 1.316 + // Get bounds for the minimum and maximum length of a string that this 1.317 + // pattern can match. Used to avoid looking for matches in strings that 1.318 + // are too short. 1.319 + // 1.320 + fRXPat->fMinMatchLen = minMatchLength(3, fRXPat->fCompiledPat->size()-1); 1.321 + 1.322 + // 1.323 + // Optimization pass 2: match start type 1.324 + // 1.325 + matchStartType(); 1.326 + 1.327 + // 1.328 + // Set up fast latin-1 range sets 1.329 + // 1.330 + int32_t numSets = fRXPat->fSets->size(); 1.331 + fRXPat->fSets8 = new Regex8BitSet[numSets]; 1.332 + // Null pointer check. 1.333 + if (fRXPat->fSets8 == NULL) { 1.334 + e = *fStatus = U_MEMORY_ALLOCATION_ERROR; 1.335 + return; 1.336 + } 1.337 + int32_t i; 1.338 + for (i=0; i<numSets; i++) { 1.339 + UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(i); 1.340 + fRXPat->fSets8[i].init(s); 1.341 + } 1.342 + 1.343 +} 1.344 + 1.345 + 1.346 + 1.347 + 1.348 + 1.349 +//------------------------------------------------------------------------------ 1.350 +// 1.351 +// doParseAction Do some action during regex pattern parsing. 1.352 +// Called by the parse state machine. 1.353 +// 1.354 +// Generation of the match engine PCode happens here, or 1.355 +// in functions called from the parse actions defined here. 1.356 +// 1.357 +// 1.358 +//------------------------------------------------------------------------------ 1.359 +UBool RegexCompile::doParseActions(int32_t action) 1.360 +{ 1.361 + UBool returnVal = TRUE; 1.362 + 1.363 + switch ((Regex_PatternParseAction)action) { 1.364 + 1.365 + case doPatStart: 1.366 + // Start of pattern compiles to: 1.367 + //0 SAVE 2 Fall back to position of FAIL 1.368 + //1 jmp 3 1.369 + //2 FAIL Stop if we ever reach here. 1.370 + //3 NOP Dummy, so start of pattern looks the same as 1.371 + // the start of an ( grouping. 1.372 + //4 NOP Resreved, will be replaced by a save if there are 1.373 + // OR | operators at the top level 1.374 + fRXPat->fCompiledPat->addElement(URX_BUILD(URX_STATE_SAVE, 2), *fStatus); 1.375 + fRXPat->fCompiledPat->addElement(URX_BUILD(URX_JMP, 3), *fStatus); 1.376 + fRXPat->fCompiledPat->addElement(URX_BUILD(URX_FAIL, 0), *fStatus); 1.377 + 1.378 + // Standard open nonCapture paren action emits the two NOPs and 1.379 + // sets up the paren stack frame. 1.380 + doParseActions(doOpenNonCaptureParen); 1.381 + break; 1.382 + 1.383 + case doPatFinish: 1.384 + // We've scanned to the end of the pattern 1.385 + // The end of pattern compiles to: 1.386 + // URX_END 1.387 + // which will stop the runtime match engine. 1.388 + // Encountering end of pattern also behaves like a close paren, 1.389 + // and forces fixups of the State Save at the beginning of the compiled pattern 1.390 + // and of any OR operations at the top level. 1.391 + // 1.392 + handleCloseParen(); 1.393 + if (fParenStack.size() > 0) { 1.394 + // Missing close paren in pattern. 1.395 + error(U_REGEX_MISMATCHED_PAREN); 1.396 + } 1.397 + 1.398 + // add the END operation to the compiled pattern. 1.399 + fRXPat->fCompiledPat->addElement(URX_BUILD(URX_END, 0), *fStatus); 1.400 + 1.401 + // Terminate the pattern compilation state machine. 1.402 + returnVal = FALSE; 1.403 + break; 1.404 + 1.405 + 1.406 + 1.407 + case doOrOperator: 1.408 + // Scanning a '|', as in (A|B) 1.409 + { 1.410 + // Generate code for any pending literals preceding the '|' 1.411 + fixLiterals(FALSE); 1.412 + 1.413 + // Insert a SAVE operation at the start of the pattern section preceding 1.414 + // this OR at this level. This SAVE will branch the match forward 1.415 + // to the right hand side of the OR in the event that the left hand 1.416 + // side fails to match and backtracks. Locate the position for the 1.417 + // save from the location on the top of the parentheses stack. 1.418 + int32_t savePosition = fParenStack.popi(); 1.419 + int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(savePosition); 1.420 + U_ASSERT(URX_TYPE(op) == URX_NOP); // original contents of reserved location 1.421 + op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+1); 1.422 + fRXPat->fCompiledPat->setElementAt(op, savePosition); 1.423 + 1.424 + // Append an JMP operation into the compiled pattern. The operand for 1.425 + // the JMP will eventually be the location following the ')' for the 1.426 + // group. This will be patched in later, when the ')' is encountered. 1.427 + op = URX_BUILD(URX_JMP, 0); 1.428 + fRXPat->fCompiledPat->addElement(op, *fStatus); 1.429 + 1.430 + // Push the position of the newly added JMP op onto the parentheses stack. 1.431 + // This registers if for fixup when this block's close paren is encountered. 1.432 + fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); 1.433 + 1.434 + // Append a NOP to the compiled pattern. This is the slot reserved 1.435 + // for a SAVE in the event that there is yet another '|' following 1.436 + // this one. 1.437 + fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); 1.438 + fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); 1.439 + } 1.440 + break; 1.441 + 1.442 + 1.443 + case doOpenCaptureParen: 1.444 + // Open Paren. 1.445 + // Compile to a 1.446 + // - NOP, which later may be replaced by a save-state if the 1.447 + // parenthesized group gets a * quantifier, followed by 1.448 + // - START_CAPTURE n where n is stack frame offset to the capture group variables. 1.449 + // - NOP, which may later be replaced by a save-state if there 1.450 + // is an '|' alternation within the parens. 1.451 + // 1.452 + // Each capture group gets three slots in the save stack frame: 1.453 + // 0: Capture Group start position (in input string being matched.) 1.454 + // 1: Capture Group end position. 1.455 + // 2: Start of Match-in-progress. 1.456 + // The first two locations are for a completed capture group, and are 1.457 + // referred to by back references and the like. 1.458 + // The third location stores the capture start position when an START_CAPTURE is 1.459 + // encountered. This will be promoted to a completed capture when (and if) the corresponding 1.460 + // END_CAPTURE is encountered. 1.461 + { 1.462 + fixLiterals(); 1.463 + fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); 1.464 + int32_t varsLoc = fRXPat->fFrameSize; // Reserve three slots in match stack frame. 1.465 + fRXPat->fFrameSize += 3; 1.466 + int32_t cop = URX_BUILD(URX_START_CAPTURE, varsLoc); 1.467 + fRXPat->fCompiledPat->addElement(cop, *fStatus); 1.468 + fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); 1.469 + 1.470 + // On the Parentheses stack, start a new frame and add the postions 1.471 + // of the two NOPs. Depending on what follows in the pattern, the 1.472 + // NOPs may be changed to SAVE_STATE or JMP ops, with a target 1.473 + // address of the end of the parenthesized group. 1.474 + fParenStack.push(fModeFlags, *fStatus); // Match mode state 1.475 + fParenStack.push(capturing, *fStatus); // Frame type. 1.476 + fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The first NOP location 1.477 + fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP loc 1.478 + 1.479 + // Save the mapping from group number to stack frame variable position. 1.480 + fRXPat->fGroupMap->addElement(varsLoc, *fStatus); 1.481 + } 1.482 + break; 1.483 + 1.484 + case doOpenNonCaptureParen: 1.485 + // Open non-caputuring (grouping only) Paren. 1.486 + // Compile to a 1.487 + // - NOP, which later may be replaced by a save-state if the 1.488 + // parenthesized group gets a * quantifier, followed by 1.489 + // - NOP, which may later be replaced by a save-state if there 1.490 + // is an '|' alternation within the parens. 1.491 + { 1.492 + fixLiterals(); 1.493 + fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); 1.494 + fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); 1.495 + 1.496 + // On the Parentheses stack, start a new frame and add the postions 1.497 + // of the two NOPs. 1.498 + fParenStack.push(fModeFlags, *fStatus); // Match mode state 1.499 + fParenStack.push(plain, *fStatus); // Begin a new frame. 1.500 + fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location 1.501 + fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP loc 1.502 + } 1.503 + break; 1.504 + 1.505 + 1.506 + case doOpenAtomicParen: 1.507 + // Open Atomic Paren. (?> 1.508 + // Compile to a 1.509 + // - NOP, which later may be replaced if the parenthesized group 1.510 + // has a quantifier, followed by 1.511 + // - STO_SP save state stack position, so it can be restored at the ")" 1.512 + // - NOP, which may later be replaced by a save-state if there 1.513 + // is an '|' alternation within the parens. 1.514 + { 1.515 + fixLiterals(); 1.516 + fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); 1.517 + int32_t varLoc = fRXPat->fDataSize; // Reserve a data location for saving the 1.518 + fRXPat->fDataSize += 1; // state stack ptr. 1.519 + int32_t stoOp = URX_BUILD(URX_STO_SP, varLoc); 1.520 + fRXPat->fCompiledPat->addElement(stoOp, *fStatus); 1.521 + fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); 1.522 + 1.523 + // On the Parentheses stack, start a new frame and add the postions 1.524 + // of the two NOPs. Depending on what follows in the pattern, the 1.525 + // NOPs may be changed to SAVE_STATE or JMP ops, with a target 1.526 + // address of the end of the parenthesized group. 1.527 + fParenStack.push(fModeFlags, *fStatus); // Match mode state 1.528 + fParenStack.push(atomic, *fStatus); // Frame type. 1.529 + fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The first NOP 1.530 + fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP 1.531 + } 1.532 + break; 1.533 + 1.534 + 1.535 + case doOpenLookAhead: 1.536 + // Positive Look-ahead (?= stuff ) 1.537 + // 1.538 + // Note: Addition of transparent input regions, with the need to 1.539 + // restore the original regions when failing out of a lookahead 1.540 + // block, complicated this sequence. Some conbined opcodes 1.541 + // might make sense - or might not, lookahead aren't that common. 1.542 + // 1.543 + // Caution: min match length optimization knows about this 1.544 + // sequence; don't change without making updates there too. 1.545 + // 1.546 + // Compiles to 1.547 + // 1 START_LA dataLoc Saves SP, Input Pos 1.548 + // 2. STATE_SAVE 4 on failure of lookahead, goto 4 1.549 + // 3 JMP 6 continue ... 1.550 + // 1.551 + // 4. LA_END Look Ahead failed. Restore regions. 1.552 + // 5. BACKTRACK and back track again. 1.553 + // 1.554 + // 6. NOP reserved for use by quantifiers on the block. 1.555 + // Look-ahead can't have quantifiers, but paren stack 1.556 + // compile time conventions require the slot anyhow. 1.557 + // 7. NOP may be replaced if there is are '|' ops in the block. 1.558 + // 8. code for parenthesized stuff. 1.559 + // 9. LA_END 1.560 + // 1.561 + // Two data slots are reserved, for saving the stack ptr and the input position. 1.562 + { 1.563 + fixLiterals(); 1.564 + int32_t dataLoc = fRXPat->fDataSize; 1.565 + fRXPat->fDataSize += 2; 1.566 + int32_t op = URX_BUILD(URX_LA_START, dataLoc); 1.567 + fRXPat->fCompiledPat->addElement(op, *fStatus); 1.568 + 1.569 + op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+ 2); 1.570 + fRXPat->fCompiledPat->addElement(op, *fStatus); 1.571 + 1.572 + op = URX_BUILD(URX_JMP, fRXPat->fCompiledPat->size()+ 3); 1.573 + fRXPat->fCompiledPat->addElement(op, *fStatus); 1.574 + 1.575 + op = URX_BUILD(URX_LA_END, dataLoc); 1.576 + fRXPat->fCompiledPat->addElement(op, *fStatus); 1.577 + 1.578 + op = URX_BUILD(URX_BACKTRACK, 0); 1.579 + fRXPat->fCompiledPat->addElement(op, *fStatus); 1.580 + 1.581 + op = URX_BUILD(URX_NOP, 0); 1.582 + fRXPat->fCompiledPat->addElement(op, *fStatus); 1.583 + fRXPat->fCompiledPat->addElement(op, *fStatus); 1.584 + 1.585 + // On the Parentheses stack, start a new frame and add the postions 1.586 + // of the NOPs. 1.587 + fParenStack.push(fModeFlags, *fStatus); // Match mode state 1.588 + fParenStack.push(lookAhead, *fStatus); // Frame type. 1.589 + fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location 1.590 + fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP location 1.591 + } 1.592 + break; 1.593 + 1.594 + case doOpenLookAheadNeg: 1.595 + // Negated Lookahead. (?! stuff ) 1.596 + // Compiles to 1.597 + // 1. START_LA dataloc 1.598 + // 2. SAVE_STATE 7 // Fail within look-ahead block restores to this state, 1.599 + // // which continues with the match. 1.600 + // 3. NOP // Std. Open Paren sequence, for possible '|' 1.601 + // 4. code for parenthesized stuff. 1.602 + // 5. END_LA // Cut back stack, remove saved state from step 2. 1.603 + // 6. BACKTRACK // code in block succeeded, so neg. lookahead fails. 1.604 + // 7. END_LA // Restore match region, in case look-ahead was using 1.605 + // an alternate (transparent) region. 1.606 + { 1.607 + fixLiterals(); 1.608 + int32_t dataLoc = fRXPat->fDataSize; 1.609 + fRXPat->fDataSize += 2; 1.610 + int32_t op = URX_BUILD(URX_LA_START, dataLoc); 1.611 + fRXPat->fCompiledPat->addElement(op, *fStatus); 1.612 + 1.613 + op = URX_BUILD(URX_STATE_SAVE, 0); // dest address will be patched later. 1.614 + fRXPat->fCompiledPat->addElement(op, *fStatus); 1.615 + 1.616 + op = URX_BUILD(URX_NOP, 0); 1.617 + fRXPat->fCompiledPat->addElement(op, *fStatus); 1.618 + 1.619 + // On the Parentheses stack, start a new frame and add the postions 1.620 + // of the StateSave and NOP. 1.621 + fParenStack.push(fModeFlags, *fStatus); // Match mode state 1.622 + fParenStack.push(negLookAhead, *fStatus); // Frame type 1.623 + fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The STATE_SAVE location 1.624 + fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP location 1.625 + 1.626 + // Instructions #5 - #7 will be added when the ')' is encountered. 1.627 + } 1.628 + break; 1.629 + 1.630 + case doOpenLookBehind: 1.631 + { 1.632 + // Compile a (?<= look-behind open paren. 1.633 + // 1.634 + // Compiles to 1.635 + // 0 URX_LB_START dataLoc 1.636 + // 1 URX_LB_CONT dataLoc 1.637 + // 2 MinMatchLen 1.638 + // 3 MaxMatchLen 1.639 + // 4 URX_NOP Standard '(' boilerplate. 1.640 + // 5 URX_NOP Reserved slot for use with '|' ops within (block). 1.641 + // 6 <code for LookBehind expression> 1.642 + // 7 URX_LB_END dataLoc # Check match len, restore input len 1.643 + // 8 URX_LA_END dataLoc # Restore stack, input pos 1.644 + // 1.645 + // Allocate a block of matcher data, to contain (when running a match) 1.646 + // 0: Stack ptr on entry 1.647 + // 1: Input Index on entry 1.648 + // 2: Start index of match current match attempt. 1.649 + // 3: Original Input String len. 1.650 + 1.651 + // Generate match code for any pending literals. 1.652 + fixLiterals(); 1.653 + 1.654 + // Allocate data space 1.655 + int32_t dataLoc = fRXPat->fDataSize; 1.656 + fRXPat->fDataSize += 4; 1.657 + 1.658 + // Emit URX_LB_START 1.659 + int32_t op = URX_BUILD(URX_LB_START, dataLoc); 1.660 + fRXPat->fCompiledPat->addElement(op, *fStatus); 1.661 + 1.662 + // Emit URX_LB_CONT 1.663 + op = URX_BUILD(URX_LB_CONT, dataLoc); 1.664 + fRXPat->fCompiledPat->addElement(op, *fStatus); 1.665 + fRXPat->fCompiledPat->addElement(0, *fStatus); // MinMatchLength. To be filled later. 1.666 + fRXPat->fCompiledPat->addElement(0, *fStatus); // MaxMatchLength. To be filled later. 1.667 + 1.668 + // Emit the NOP 1.669 + op = URX_BUILD(URX_NOP, 0); 1.670 + fRXPat->fCompiledPat->addElement(op, *fStatus); 1.671 + fRXPat->fCompiledPat->addElement(op, *fStatus); 1.672 + 1.673 + // On the Parentheses stack, start a new frame and add the postions 1.674 + // of the URX_LB_CONT and the NOP. 1.675 + fParenStack.push(fModeFlags, *fStatus); // Match mode state 1.676 + fParenStack.push(lookBehind, *fStatus); // Frame type 1.677 + fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location 1.678 + fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The 2nd NOP location 1.679 + 1.680 + // The final two instructions will be added when the ')' is encountered. 1.681 + } 1.682 + 1.683 + break; 1.684 + 1.685 + case doOpenLookBehindNeg: 1.686 + { 1.687 + // Compile a (?<! negated look-behind open paren. 1.688 + // 1.689 + // Compiles to 1.690 + // 0 URX_LB_START dataLoc # Save entry stack, input len 1.691 + // 1 URX_LBN_CONT dataLoc # Iterate possible match positions 1.692 + // 2 MinMatchLen 1.693 + // 3 MaxMatchLen 1.694 + // 4 continueLoc (9) 1.695 + // 5 URX_NOP Standard '(' boilerplate. 1.696 + // 6 URX_NOP Reserved slot for use with '|' ops within (block). 1.697 + // 7 <code for LookBehind expression> 1.698 + // 8 URX_LBN_END dataLoc # Check match len, cause a FAIL 1.699 + // 9 ... 1.700 + // 1.701 + // Allocate a block of matcher data, to contain (when running a match) 1.702 + // 0: Stack ptr on entry 1.703 + // 1: Input Index on entry 1.704 + // 2: Start index of match current match attempt. 1.705 + // 3: Original Input String len. 1.706 + 1.707 + // Generate match code for any pending literals. 1.708 + fixLiterals(); 1.709 + 1.710 + // Allocate data space 1.711 + int32_t dataLoc = fRXPat->fDataSize; 1.712 + fRXPat->fDataSize += 4; 1.713 + 1.714 + // Emit URX_LB_START 1.715 + int32_t op = URX_BUILD(URX_LB_START, dataLoc); 1.716 + fRXPat->fCompiledPat->addElement(op, *fStatus); 1.717 + 1.718 + // Emit URX_LBN_CONT 1.719 + op = URX_BUILD(URX_LBN_CONT, dataLoc); 1.720 + fRXPat->fCompiledPat->addElement(op, *fStatus); 1.721 + fRXPat->fCompiledPat->addElement(0, *fStatus); // MinMatchLength. To be filled later. 1.722 + fRXPat->fCompiledPat->addElement(0, *fStatus); // MaxMatchLength. To be filled later. 1.723 + fRXPat->fCompiledPat->addElement(0, *fStatus); // Continue Loc. To be filled later. 1.724 + 1.725 + // Emit the NOP 1.726 + op = URX_BUILD(URX_NOP, 0); 1.727 + fRXPat->fCompiledPat->addElement(op, *fStatus); 1.728 + fRXPat->fCompiledPat->addElement(op, *fStatus); 1.729 + 1.730 + // On the Parentheses stack, start a new frame and add the postions 1.731 + // of the URX_LB_CONT and the NOP. 1.732 + fParenStack.push(fModeFlags, *fStatus); // Match mode state 1.733 + fParenStack.push(lookBehindN, *fStatus); // Frame type 1.734 + fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location 1.735 + fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The 2nd NOP location 1.736 + 1.737 + // The final two instructions will be added when the ')' is encountered. 1.738 + } 1.739 + break; 1.740 + 1.741 + case doConditionalExpr: 1.742 + // Conditionals such as (?(1)a:b) 1.743 + case doPerlInline: 1.744 + // Perl inline-condtionals. (?{perl code}a|b) We're not perl, no way to do them. 1.745 + error(U_REGEX_UNIMPLEMENTED); 1.746 + break; 1.747 + 1.748 + 1.749 + case doCloseParen: 1.750 + handleCloseParen(); 1.751 + if (fParenStack.size() <= 0) { 1.752 + // Extra close paren, or missing open paren. 1.753 + error(U_REGEX_MISMATCHED_PAREN); 1.754 + } 1.755 + break; 1.756 + 1.757 + case doNOP: 1.758 + break; 1.759 + 1.760 + 1.761 + case doBadOpenParenType: 1.762 + case doRuleError: 1.763 + error(U_REGEX_RULE_SYNTAX); 1.764 + break; 1.765 + 1.766 + 1.767 + case doMismatchedParenErr: 1.768 + error(U_REGEX_MISMATCHED_PAREN); 1.769 + break; 1.770 + 1.771 + case doPlus: 1.772 + // Normal '+' compiles to 1.773 + // 1. stuff to be repeated (already built) 1.774 + // 2. jmp-sav 1 1.775 + // 3. ... 1.776 + // 1.777 + // Or, if the item to be repeated can match a zero length string, 1.778 + // 1. STO_INP_LOC data-loc 1.779 + // 2. body of stuff to be repeated 1.780 + // 3. JMP_SAV_X 2 1.781 + // 4. ... 1.782 + 1.783 + // 1.784 + // Or, if the item to be repeated is simple 1.785 + // 1. Item to be repeated. 1.786 + // 2. LOOP_SR_I set number (assuming repeated item is a set ref) 1.787 + // 3. LOOP_C stack location 1.788 + { 1.789 + int32_t topLoc = blockTopLoc(FALSE); // location of item #1 1.790 + int32_t frameLoc; 1.791 + 1.792 + // Check for simple constructs, which may get special optimized code. 1.793 + if (topLoc == fRXPat->fCompiledPat->size() - 1) { 1.794 + int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc); 1.795 + 1.796 + if (URX_TYPE(repeatedOp) == URX_SETREF) { 1.797 + // Emit optimized code for [char set]+ 1.798 + int32_t loopOpI = URX_BUILD(URX_LOOP_SR_I, URX_VAL(repeatedOp)); 1.799 + fRXPat->fCompiledPat->addElement(loopOpI, *fStatus); 1.800 + frameLoc = fRXPat->fFrameSize; 1.801 + fRXPat->fFrameSize++; 1.802 + int32_t loopOpC = URX_BUILD(URX_LOOP_C, frameLoc); 1.803 + fRXPat->fCompiledPat->addElement(loopOpC, *fStatus); 1.804 + break; 1.805 + } 1.806 + 1.807 + if (URX_TYPE(repeatedOp) == URX_DOTANY || 1.808 + URX_TYPE(repeatedOp) == URX_DOTANY_ALL || 1.809 + URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) { 1.810 + // Emit Optimized code for .+ operations. 1.811 + int32_t loopOpI = URX_BUILD(URX_LOOP_DOT_I, 0); 1.812 + if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) { 1.813 + // URX_LOOP_DOT_I operand is a flag indicating ". matches any" mode. 1.814 + loopOpI |= 1; 1.815 + } 1.816 + if (fModeFlags & UREGEX_UNIX_LINES) { 1.817 + loopOpI |= 2; 1.818 + } 1.819 + fRXPat->fCompiledPat->addElement(loopOpI, *fStatus); 1.820 + frameLoc = fRXPat->fFrameSize; 1.821 + fRXPat->fFrameSize++; 1.822 + int32_t loopOpC = URX_BUILD(URX_LOOP_C, frameLoc); 1.823 + fRXPat->fCompiledPat->addElement(loopOpC, *fStatus); 1.824 + break; 1.825 + } 1.826 + 1.827 + } 1.828 + 1.829 + // General case. 1.830 + 1.831 + // Check for minimum match length of zero, which requires 1.832 + // extra loop-breaking code. 1.833 + if (minMatchLength(topLoc, fRXPat->fCompiledPat->size()-1) == 0) { 1.834 + // Zero length match is possible. 1.835 + // Emit the code sequence that can handle it. 1.836 + insertOp(topLoc); 1.837 + frameLoc = fRXPat->fFrameSize; 1.838 + fRXPat->fFrameSize++; 1.839 + 1.840 + int32_t op = URX_BUILD(URX_STO_INP_LOC, frameLoc); 1.841 + fRXPat->fCompiledPat->setElementAt(op, topLoc); 1.842 + 1.843 + op = URX_BUILD(URX_JMP_SAV_X, topLoc+1); 1.844 + fRXPat->fCompiledPat->addElement(op, *fStatus); 1.845 + } else { 1.846 + // Simpler code when the repeated body must match something non-empty 1.847 + int32_t jmpOp = URX_BUILD(URX_JMP_SAV, topLoc); 1.848 + fRXPat->fCompiledPat->addElement(jmpOp, *fStatus); 1.849 + } 1.850 + } 1.851 + break; 1.852 + 1.853 + case doNGPlus: 1.854 + // Non-greedy '+?' compiles to 1.855 + // 1. stuff to be repeated (already built) 1.856 + // 2. state-save 1 1.857 + // 3. ... 1.858 + { 1.859 + int32_t topLoc = blockTopLoc(FALSE); 1.860 + int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, topLoc); 1.861 + fRXPat->fCompiledPat->addElement(saveStateOp, *fStatus); 1.862 + } 1.863 + break; 1.864 + 1.865 + 1.866 + case doOpt: 1.867 + // Normal (greedy) ? quantifier. 1.868 + // Compiles to 1.869 + // 1. state save 3 1.870 + // 2. body of optional block 1.871 + // 3. ... 1.872 + // Insert the state save into the compiled pattern, and we're done. 1.873 + { 1.874 + int32_t saveStateLoc = blockTopLoc(TRUE); 1.875 + int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()); 1.876 + fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc); 1.877 + } 1.878 + break; 1.879 + 1.880 + case doNGOpt: 1.881 + // Non-greedy ?? quantifier 1.882 + // compiles to 1.883 + // 1. jmp 4 1.884 + // 2. body of optional block 1.885 + // 3 jmp 5 1.886 + // 4. state save 2 1.887 + // 5 ... 1.888 + // This code is less than ideal, with two jmps instead of one, because we can only 1.889 + // insert one instruction at the top of the block being iterated. 1.890 + { 1.891 + int32_t jmp1_loc = blockTopLoc(TRUE); 1.892 + int32_t jmp2_loc = fRXPat->fCompiledPat->size(); 1.893 + 1.894 + int32_t jmp1_op = URX_BUILD(URX_JMP, jmp2_loc+1); 1.895 + fRXPat->fCompiledPat->setElementAt(jmp1_op, jmp1_loc); 1.896 + 1.897 + int32_t jmp2_op = URX_BUILD(URX_JMP, jmp2_loc+2); 1.898 + fRXPat->fCompiledPat->addElement(jmp2_op, *fStatus); 1.899 + 1.900 + int32_t save_op = URX_BUILD(URX_STATE_SAVE, jmp1_loc+1); 1.901 + fRXPat->fCompiledPat->addElement(save_op, *fStatus); 1.902 + } 1.903 + break; 1.904 + 1.905 + 1.906 + case doStar: 1.907 + // Normal (greedy) * quantifier. 1.908 + // Compiles to 1.909 + // 1. STATE_SAVE 4 1.910 + // 2. body of stuff being iterated over 1.911 + // 3. JMP_SAV 2 1.912 + // 4. ... 1.913 + // 1.914 + // Or, if the body is a simple [Set], 1.915 + // 1. LOOP_SR_I set number 1.916 + // 2. LOOP_C stack location 1.917 + // ... 1.918 + // 1.919 + // Or if this is a .* 1.920 + // 1. LOOP_DOT_I (. matches all mode flag) 1.921 + // 2. LOOP_C stack location 1.922 + // 1.923 + // Or, if the body can match a zero-length string, to inhibit infinite loops, 1.924 + // 1. STATE_SAVE 5 1.925 + // 2. STO_INP_LOC data-loc 1.926 + // 3. body of stuff 1.927 + // 4. JMP_SAV_X 2 1.928 + // 5. ... 1.929 + { 1.930 + // location of item #1, the STATE_SAVE 1.931 + int32_t topLoc = blockTopLoc(FALSE); 1.932 + int32_t dataLoc = -1; 1.933 + 1.934 + // Check for simple *, where the construct being repeated 1.935 + // compiled to single opcode, and might be optimizable. 1.936 + if (topLoc == fRXPat->fCompiledPat->size() - 1) { 1.937 + int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc); 1.938 + 1.939 + if (URX_TYPE(repeatedOp) == URX_SETREF) { 1.940 + // Emit optimized code for a [char set]* 1.941 + int32_t loopOpI = URX_BUILD(URX_LOOP_SR_I, URX_VAL(repeatedOp)); 1.942 + fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc); 1.943 + dataLoc = fRXPat->fFrameSize; 1.944 + fRXPat->fFrameSize++; 1.945 + int32_t loopOpC = URX_BUILD(URX_LOOP_C, dataLoc); 1.946 + fRXPat->fCompiledPat->addElement(loopOpC, *fStatus); 1.947 + break; 1.948 + } 1.949 + 1.950 + if (URX_TYPE(repeatedOp) == URX_DOTANY || 1.951 + URX_TYPE(repeatedOp) == URX_DOTANY_ALL || 1.952 + URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) { 1.953 + // Emit Optimized code for .* operations. 1.954 + int32_t loopOpI = URX_BUILD(URX_LOOP_DOT_I, 0); 1.955 + if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) { 1.956 + // URX_LOOP_DOT_I operand is a flag indicating . matches any mode. 1.957 + loopOpI |= 1; 1.958 + } 1.959 + if ((fModeFlags & UREGEX_UNIX_LINES) != 0) { 1.960 + loopOpI |= 2; 1.961 + } 1.962 + fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc); 1.963 + dataLoc = fRXPat->fFrameSize; 1.964 + fRXPat->fFrameSize++; 1.965 + int32_t loopOpC = URX_BUILD(URX_LOOP_C, dataLoc); 1.966 + fRXPat->fCompiledPat->addElement(loopOpC, *fStatus); 1.967 + break; 1.968 + } 1.969 + } 1.970 + 1.971 + // Emit general case code for this * 1.972 + // The optimizations did not apply. 1.973 + 1.974 + int32_t saveStateLoc = blockTopLoc(TRUE); 1.975 + int32_t jmpOp = URX_BUILD(URX_JMP_SAV, saveStateLoc+1); 1.976 + 1.977 + // Check for minimum match length of zero, which requires 1.978 + // extra loop-breaking code. 1.979 + if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-1) == 0) { 1.980 + insertOp(saveStateLoc); 1.981 + dataLoc = fRXPat->fFrameSize; 1.982 + fRXPat->fFrameSize++; 1.983 + 1.984 + int32_t op = URX_BUILD(URX_STO_INP_LOC, dataLoc); 1.985 + fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1); 1.986 + jmpOp = URX_BUILD(URX_JMP_SAV_X, saveStateLoc+2); 1.987 + } 1.988 + 1.989 + // Locate the position in the compiled pattern where the match will continue 1.990 + // after completing the *. (4 or 5 in the comment above) 1.991 + int32_t continueLoc = fRXPat->fCompiledPat->size()+1; 1.992 + 1.993 + // Put together the save state op store it into the compiled code. 1.994 + int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, continueLoc); 1.995 + fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc); 1.996 + 1.997 + // Append the URX_JMP_SAV or URX_JMPX operation to the compiled pattern. 1.998 + fRXPat->fCompiledPat->addElement(jmpOp, *fStatus); 1.999 + } 1.1000 + break; 1.1001 + 1.1002 + case doNGStar: 1.1003 + // Non-greedy *? quantifier 1.1004 + // compiles to 1.1005 + // 1. JMP 3 1.1006 + // 2. body of stuff being iterated over 1.1007 + // 3. STATE_SAVE 2 1.1008 + // 4 ... 1.1009 + { 1.1010 + int32_t jmpLoc = blockTopLoc(TRUE); // loc 1. 1.1011 + int32_t saveLoc = fRXPat->fCompiledPat->size(); // loc 3. 1.1012 + int32_t jmpOp = URX_BUILD(URX_JMP, saveLoc); 1.1013 + int32_t stateSaveOp = URX_BUILD(URX_STATE_SAVE, jmpLoc+1); 1.1014 + fRXPat->fCompiledPat->setElementAt(jmpOp, jmpLoc); 1.1015 + fRXPat->fCompiledPat->addElement(stateSaveOp, *fStatus); 1.1016 + } 1.1017 + break; 1.1018 + 1.1019 + 1.1020 + case doIntervalInit: 1.1021 + // The '{' opening an interval quantifier was just scanned. 1.1022 + // Init the counter varaiables that will accumulate the values as the digits 1.1023 + // are scanned. 1.1024 + fIntervalLow = 0; 1.1025 + fIntervalUpper = -1; 1.1026 + break; 1.1027 + 1.1028 + case doIntevalLowerDigit: 1.1029 + // Scanned a digit from the lower value of an {lower,upper} interval 1.1030 + { 1.1031 + int32_t digitValue = u_charDigitValue(fC.fChar); 1.1032 + U_ASSERT(digitValue >= 0); 1.1033 + fIntervalLow = fIntervalLow*10 + digitValue; 1.1034 + if (fIntervalLow < 0) { 1.1035 + error(U_REGEX_NUMBER_TOO_BIG); 1.1036 + } 1.1037 + } 1.1038 + break; 1.1039 + 1.1040 + case doIntervalUpperDigit: 1.1041 + // Scanned a digit from the upper value of an {lower,upper} interval 1.1042 + { 1.1043 + if (fIntervalUpper < 0) { 1.1044 + fIntervalUpper = 0; 1.1045 + } 1.1046 + int32_t digitValue = u_charDigitValue(fC.fChar); 1.1047 + U_ASSERT(digitValue >= 0); 1.1048 + fIntervalUpper = fIntervalUpper*10 + digitValue; 1.1049 + if (fIntervalUpper < 0) { 1.1050 + error(U_REGEX_NUMBER_TOO_BIG); 1.1051 + } 1.1052 + } 1.1053 + break; 1.1054 + 1.1055 + case doIntervalSame: 1.1056 + // Scanned a single value interval like {27}. Upper = Lower. 1.1057 + fIntervalUpper = fIntervalLow; 1.1058 + break; 1.1059 + 1.1060 + case doInterval: 1.1061 + // Finished scanning a normal {lower,upper} interval. Generate the code for it. 1.1062 + if (compileInlineInterval() == FALSE) { 1.1063 + compileInterval(URX_CTR_INIT, URX_CTR_LOOP); 1.1064 + } 1.1065 + break; 1.1066 + 1.1067 + case doPossessiveInterval: 1.1068 + // Finished scanning a Possessive {lower,upper}+ interval. Generate the code for it. 1.1069 + { 1.1070 + // Remember the loc for the top of the block being looped over. 1.1071 + // (Can not reserve a slot in the compiled pattern at this time, because 1.1072 + // compileInterval needs to reserve also, and blockTopLoc can only reserve 1.1073 + // once per block.) 1.1074 + int32_t topLoc = blockTopLoc(FALSE); 1.1075 + 1.1076 + // Produce normal looping code. 1.1077 + compileInterval(URX_CTR_INIT, URX_CTR_LOOP); 1.1078 + 1.1079 + // Surround the just-emitted normal looping code with a STO_SP ... LD_SP 1.1080 + // just as if the loop was inclosed in atomic parentheses. 1.1081 + 1.1082 + // First the STO_SP before the start of the loop 1.1083 + insertOp(topLoc); 1.1084 + int32_t varLoc = fRXPat->fDataSize; // Reserve a data location for saving the 1.1085 + fRXPat->fDataSize += 1; // state stack ptr. 1.1086 + int32_t op = URX_BUILD(URX_STO_SP, varLoc); 1.1087 + fRXPat->fCompiledPat->setElementAt(op, topLoc); 1.1088 + 1.1089 + int32_t loopOp = (int32_t)fRXPat->fCompiledPat->popi(); 1.1090 + U_ASSERT(URX_TYPE(loopOp) == URX_CTR_LOOP && URX_VAL(loopOp) == topLoc); 1.1091 + loopOp++; // point LoopOp after the just-inserted STO_SP 1.1092 + fRXPat->fCompiledPat->push(loopOp, *fStatus); 1.1093 + 1.1094 + // Then the LD_SP after the end of the loop 1.1095 + op = URX_BUILD(URX_LD_SP, varLoc); 1.1096 + fRXPat->fCompiledPat->addElement(op, *fStatus); 1.1097 + } 1.1098 + 1.1099 + break; 1.1100 + 1.1101 + case doNGInterval: 1.1102 + // Finished scanning a non-greedy {lower,upper}? interval. Generate the code for it. 1.1103 + compileInterval(URX_CTR_INIT_NG, URX_CTR_LOOP_NG); 1.1104 + break; 1.1105 + 1.1106 + case doIntervalError: 1.1107 + error(U_REGEX_BAD_INTERVAL); 1.1108 + break; 1.1109 + 1.1110 + case doLiteralChar: 1.1111 + // We've just scanned a "normal" character from the pattern, 1.1112 + literalChar(fC.fChar); 1.1113 + break; 1.1114 + 1.1115 + 1.1116 + case doEscapedLiteralChar: 1.1117 + // We've just scanned an backslashed escaped character with no 1.1118 + // special meaning. It represents itself. 1.1119 + if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 && 1.1120 + ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) || // in [A-Z] 1.1121 + (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) { // in [a-z] 1.1122 + error(U_REGEX_BAD_ESCAPE_SEQUENCE); 1.1123 + } 1.1124 + literalChar(fC.fChar); 1.1125 + break; 1.1126 + 1.1127 + 1.1128 + case doDotAny: 1.1129 + // scanned a ".", match any single character. 1.1130 + { 1.1131 + fixLiterals(FALSE); 1.1132 + int32_t op; 1.1133 + if (fModeFlags & UREGEX_DOTALL) { 1.1134 + op = URX_BUILD(URX_DOTANY_ALL, 0); 1.1135 + } else if (fModeFlags & UREGEX_UNIX_LINES) { 1.1136 + op = URX_BUILD(URX_DOTANY_UNIX, 0); 1.1137 + } else { 1.1138 + op = URX_BUILD(URX_DOTANY, 0); 1.1139 + } 1.1140 + fRXPat->fCompiledPat->addElement(op, *fStatus); 1.1141 + } 1.1142 + break; 1.1143 + 1.1144 + case doCaret: 1.1145 + { 1.1146 + fixLiterals(FALSE); 1.1147 + int32_t op = 0; 1.1148 + if ( (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) { 1.1149 + op = URX_CARET; 1.1150 + } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) { 1.1151 + op = URX_CARET_M; 1.1152 + } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) { 1.1153 + op = URX_CARET; // Only testing true start of input. 1.1154 + } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) { 1.1155 + op = URX_CARET_M_UNIX; 1.1156 + } 1.1157 + fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus); 1.1158 + } 1.1159 + break; 1.1160 + 1.1161 + case doDollar: 1.1162 + { 1.1163 + fixLiterals(FALSE); 1.1164 + int32_t op = 0; 1.1165 + if ( (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) { 1.1166 + op = URX_DOLLAR; 1.1167 + } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) { 1.1168 + op = URX_DOLLAR_M; 1.1169 + } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) { 1.1170 + op = URX_DOLLAR_D; 1.1171 + } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) { 1.1172 + op = URX_DOLLAR_MD; 1.1173 + } 1.1174 + fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus); 1.1175 + } 1.1176 + break; 1.1177 + 1.1178 + case doBackslashA: 1.1179 + fixLiterals(FALSE); 1.1180 + fRXPat->fCompiledPat->addElement(URX_BUILD(URX_CARET, 0), *fStatus); 1.1181 + break; 1.1182 + 1.1183 + case doBackslashB: 1.1184 + { 1.1185 + #if UCONFIG_NO_BREAK_ITERATION==1 1.1186 + if (fModeFlags & UREGEX_UWORD) { 1.1187 + error(U_UNSUPPORTED_ERROR); 1.1188 + } 1.1189 + #endif 1.1190 + fixLiterals(FALSE); 1.1191 + int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B; 1.1192 + fRXPat->fCompiledPat->addElement(URX_BUILD(op, 1), *fStatus); 1.1193 + } 1.1194 + break; 1.1195 + 1.1196 + case doBackslashb: 1.1197 + { 1.1198 + #if UCONFIG_NO_BREAK_ITERATION==1 1.1199 + if (fModeFlags & UREGEX_UWORD) { 1.1200 + error(U_UNSUPPORTED_ERROR); 1.1201 + } 1.1202 + #endif 1.1203 + fixLiterals(FALSE); 1.1204 + int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B; 1.1205 + fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus); 1.1206 + } 1.1207 + break; 1.1208 + 1.1209 + case doBackslashD: 1.1210 + fixLiterals(FALSE); 1.1211 + fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_D, 1), *fStatus); 1.1212 + break; 1.1213 + 1.1214 + case doBackslashd: 1.1215 + fixLiterals(FALSE); 1.1216 + fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_D, 0), *fStatus); 1.1217 + break; 1.1218 + 1.1219 + case doBackslashG: 1.1220 + fixLiterals(FALSE); 1.1221 + fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_G, 0), *fStatus); 1.1222 + break; 1.1223 + 1.1224 + case doBackslashS: 1.1225 + fixLiterals(FALSE); 1.1226 + fRXPat->fCompiledPat->addElement( 1.1227 + URX_BUILD(URX_STAT_SETREF_N, URX_ISSPACE_SET), *fStatus); 1.1228 + break; 1.1229 + 1.1230 + case doBackslashs: 1.1231 + fixLiterals(FALSE); 1.1232 + fRXPat->fCompiledPat->addElement( 1.1233 + URX_BUILD(URX_STATIC_SETREF, URX_ISSPACE_SET), *fStatus); 1.1234 + break; 1.1235 + 1.1236 + case doBackslashW: 1.1237 + fixLiterals(FALSE); 1.1238 + fRXPat->fCompiledPat->addElement( 1.1239 + URX_BUILD(URX_STAT_SETREF_N, URX_ISWORD_SET), *fStatus); 1.1240 + break; 1.1241 + 1.1242 + case doBackslashw: 1.1243 + fixLiterals(FALSE); 1.1244 + fRXPat->fCompiledPat->addElement( 1.1245 + URX_BUILD(URX_STATIC_SETREF, URX_ISWORD_SET), *fStatus); 1.1246 + break; 1.1247 + 1.1248 + case doBackslashX: 1.1249 + fixLiterals(FALSE); 1.1250 + fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_X, 0), *fStatus); 1.1251 + break; 1.1252 + 1.1253 + 1.1254 + case doBackslashZ: 1.1255 + fixLiterals(FALSE); 1.1256 + fRXPat->fCompiledPat->addElement(URX_BUILD(URX_DOLLAR, 0), *fStatus); 1.1257 + break; 1.1258 + 1.1259 + case doBackslashz: 1.1260 + fixLiterals(FALSE); 1.1261 + fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_Z, 0), *fStatus); 1.1262 + break; 1.1263 + 1.1264 + case doEscapeError: 1.1265 + error(U_REGEX_BAD_ESCAPE_SEQUENCE); 1.1266 + break; 1.1267 + 1.1268 + case doExit: 1.1269 + fixLiterals(FALSE); 1.1270 + returnVal = FALSE; 1.1271 + break; 1.1272 + 1.1273 + case doProperty: 1.1274 + { 1.1275 + fixLiterals(FALSE); 1.1276 + UnicodeSet *theSet = scanProp(); 1.1277 + compileSet(theSet); 1.1278 + } 1.1279 + break; 1.1280 + 1.1281 + case doNamedChar: 1.1282 + { 1.1283 + UChar32 c = scanNamedChar(); 1.1284 + literalChar(c); 1.1285 + } 1.1286 + break; 1.1287 + 1.1288 + 1.1289 + case doBackRef: 1.1290 + // BackReference. Somewhat unusual in that the front-end can not completely parse 1.1291 + // the regular expression, because the number of digits to be consumed 1.1292 + // depends on the number of capture groups that have been defined. So 1.1293 + // we have to do it here instead. 1.1294 + { 1.1295 + int32_t numCaptureGroups = fRXPat->fGroupMap->size(); 1.1296 + int32_t groupNum = 0; 1.1297 + UChar32 c = fC.fChar; 1.1298 + 1.1299 + for (;;) { 1.1300 + // Loop once per digit, for max allowed number of digits in a back reference. 1.1301 + int32_t digit = u_charDigitValue(c); 1.1302 + groupNum = groupNum * 10 + digit; 1.1303 + if (groupNum >= numCaptureGroups) { 1.1304 + break; 1.1305 + } 1.1306 + c = peekCharLL(); 1.1307 + if (RegexStaticSets::gStaticSets->fRuleDigitsAlias->contains(c) == FALSE) { 1.1308 + break; 1.1309 + } 1.1310 + nextCharLL(); 1.1311 + } 1.1312 + 1.1313 + // Scan of the back reference in the source regexp is complete. Now generate 1.1314 + // the compiled code for it. 1.1315 + // Because capture groups can be forward-referenced by back-references, 1.1316 + // we fill the operand with the capture group number. At the end 1.1317 + // of compilation, it will be changed to the variable's location. 1.1318 + U_ASSERT(groupNum > 0); // Shouldn't happen. '\0' begins an octal escape sequence, 1.1319 + // and shouldn't enter this code path at all. 1.1320 + fixLiterals(FALSE); 1.1321 + int32_t op; 1.1322 + if (fModeFlags & UREGEX_CASE_INSENSITIVE) { 1.1323 + op = URX_BUILD(URX_BACKREF_I, groupNum); 1.1324 + } else { 1.1325 + op = URX_BUILD(URX_BACKREF, groupNum); 1.1326 + } 1.1327 + fRXPat->fCompiledPat->addElement(op, *fStatus); 1.1328 + } 1.1329 + break; 1.1330 + 1.1331 + 1.1332 + case doPossessivePlus: 1.1333 + // Possessive ++ quantifier. 1.1334 + // Compiles to 1.1335 + // 1. STO_SP 1.1336 + // 2. body of stuff being iterated over 1.1337 + // 3. STATE_SAVE 5 1.1338 + // 4. JMP 2 1.1339 + // 5. LD_SP 1.1340 + // 6. ... 1.1341 + // 1.1342 + // Note: TODO: This is pretty inefficient. A mass of saved state is built up 1.1343 + // then unconditionally discarded. Perhaps introduce a new opcode. Ticket 6056 1.1344 + // 1.1345 + { 1.1346 + // Emit the STO_SP 1.1347 + int32_t topLoc = blockTopLoc(TRUE); 1.1348 + int32_t stoLoc = fRXPat->fDataSize; 1.1349 + fRXPat->fDataSize++; // Reserve the data location for storing save stack ptr. 1.1350 + int32_t op = URX_BUILD(URX_STO_SP, stoLoc); 1.1351 + fRXPat->fCompiledPat->setElementAt(op, topLoc); 1.1352 + 1.1353 + // Emit the STATE_SAVE 1.1354 + op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+2); 1.1355 + fRXPat->fCompiledPat->addElement(op, *fStatus); 1.1356 + 1.1357 + // Emit the JMP 1.1358 + op = URX_BUILD(URX_JMP, topLoc+1); 1.1359 + fRXPat->fCompiledPat->addElement(op, *fStatus); 1.1360 + 1.1361 + // Emit the LD_SP 1.1362 + op = URX_BUILD(URX_LD_SP, stoLoc); 1.1363 + fRXPat->fCompiledPat->addElement(op, *fStatus); 1.1364 + } 1.1365 + break; 1.1366 + 1.1367 + case doPossessiveStar: 1.1368 + // Possessive *+ quantifier. 1.1369 + // Compiles to 1.1370 + // 1. STO_SP loc 1.1371 + // 2. STATE_SAVE 5 1.1372 + // 3. body of stuff being iterated over 1.1373 + // 4. JMP 2 1.1374 + // 5. LD_SP loc 1.1375 + // 6 ... 1.1376 + // TODO: do something to cut back the state stack each time through the loop. 1.1377 + { 1.1378 + // Reserve two slots at the top of the block. 1.1379 + int32_t topLoc = blockTopLoc(TRUE); 1.1380 + insertOp(topLoc); 1.1381 + 1.1382 + // emit STO_SP loc 1.1383 + int32_t stoLoc = fRXPat->fDataSize; 1.1384 + fRXPat->fDataSize++; // Reserve the data location for storing save stack ptr. 1.1385 + int32_t op = URX_BUILD(URX_STO_SP, stoLoc); 1.1386 + fRXPat->fCompiledPat->setElementAt(op, topLoc); 1.1387 + 1.1388 + // Emit the SAVE_STATE 5 1.1389 + int32_t L7 = fRXPat->fCompiledPat->size()+1; 1.1390 + op = URX_BUILD(URX_STATE_SAVE, L7); 1.1391 + fRXPat->fCompiledPat->setElementAt(op, topLoc+1); 1.1392 + 1.1393 + // Append the JMP operation. 1.1394 + op = URX_BUILD(URX_JMP, topLoc+1); 1.1395 + fRXPat->fCompiledPat->addElement(op, *fStatus); 1.1396 + 1.1397 + // Emit the LD_SP loc 1.1398 + op = URX_BUILD(URX_LD_SP, stoLoc); 1.1399 + fRXPat->fCompiledPat->addElement(op, *fStatus); 1.1400 + } 1.1401 + break; 1.1402 + 1.1403 + case doPossessiveOpt: 1.1404 + // Possessive ?+ quantifier. 1.1405 + // Compiles to 1.1406 + // 1. STO_SP loc 1.1407 + // 2. SAVE_STATE 5 1.1408 + // 3. body of optional block 1.1409 + // 4. LD_SP loc 1.1410 + // 5. ... 1.1411 + // 1.1412 + { 1.1413 + // Reserve two slots at the top of the block. 1.1414 + int32_t topLoc = blockTopLoc(TRUE); 1.1415 + insertOp(topLoc); 1.1416 + 1.1417 + // Emit the STO_SP 1.1418 + int32_t stoLoc = fRXPat->fDataSize; 1.1419 + fRXPat->fDataSize++; // Reserve the data location for storing save stack ptr. 1.1420 + int32_t op = URX_BUILD(URX_STO_SP, stoLoc); 1.1421 + fRXPat->fCompiledPat->setElementAt(op, topLoc); 1.1422 + 1.1423 + // Emit the SAVE_STATE 1.1424 + int32_t continueLoc = fRXPat->fCompiledPat->size()+1; 1.1425 + op = URX_BUILD(URX_STATE_SAVE, continueLoc); 1.1426 + fRXPat->fCompiledPat->setElementAt(op, topLoc+1); 1.1427 + 1.1428 + // Emit the LD_SP 1.1429 + op = URX_BUILD(URX_LD_SP, stoLoc); 1.1430 + fRXPat->fCompiledPat->addElement(op, *fStatus); 1.1431 + } 1.1432 + break; 1.1433 + 1.1434 + 1.1435 + case doBeginMatchMode: 1.1436 + fNewModeFlags = fModeFlags; 1.1437 + fSetModeFlag = TRUE; 1.1438 + break; 1.1439 + 1.1440 + case doMatchMode: // (?i) and similar 1.1441 + { 1.1442 + int32_t bit = 0; 1.1443 + switch (fC.fChar) { 1.1444 + case 0x69: /* 'i' */ bit = UREGEX_CASE_INSENSITIVE; break; 1.1445 + case 0x64: /* 'd' */ bit = UREGEX_UNIX_LINES; break; 1.1446 + case 0x6d: /* 'm' */ bit = UREGEX_MULTILINE; break; 1.1447 + case 0x73: /* 's' */ bit = UREGEX_DOTALL; break; 1.1448 + case 0x75: /* 'u' */ bit = 0; /* Unicode casing */ break; 1.1449 + case 0x77: /* 'w' */ bit = UREGEX_UWORD; break; 1.1450 + case 0x78: /* 'x' */ bit = UREGEX_COMMENTS; break; 1.1451 + case 0x2d: /* '-' */ fSetModeFlag = FALSE; break; 1.1452 + default: 1.1453 + U_ASSERT(FALSE); // Should never happen. Other chars are filtered out 1.1454 + // by the scanner. 1.1455 + } 1.1456 + if (fSetModeFlag) { 1.1457 + fNewModeFlags |= bit; 1.1458 + } else { 1.1459 + fNewModeFlags &= ~bit; 1.1460 + } 1.1461 + } 1.1462 + break; 1.1463 + 1.1464 + case doSetMatchMode: 1.1465 + // Emit code to match any pending literals, using the not-yet changed match mode. 1.1466 + fixLiterals(); 1.1467 + 1.1468 + // We've got a (?i) or similar. The match mode is being changed, but 1.1469 + // the change is not scoped to a parenthesized block. 1.1470 + U_ASSERT(fNewModeFlags < 0); 1.1471 + fModeFlags = fNewModeFlags; 1.1472 + 1.1473 + break; 1.1474 + 1.1475 + 1.1476 + case doMatchModeParen: 1.1477 + // We've got a (?i: or similar. Begin a parenthesized block, save old 1.1478 + // mode flags so they can be restored at the close of the block. 1.1479 + // 1.1480 + // Compile to a 1.1481 + // - NOP, which later may be replaced by a save-state if the 1.1482 + // parenthesized group gets a * quantifier, followed by 1.1483 + // - NOP, which may later be replaced by a save-state if there 1.1484 + // is an '|' alternation within the parens. 1.1485 + { 1.1486 + fixLiterals(FALSE); 1.1487 + fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); 1.1488 + fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); 1.1489 + 1.1490 + // On the Parentheses stack, start a new frame and add the postions 1.1491 + // of the two NOPs (a normal non-capturing () frame, except for the 1.1492 + // saving of the orignal mode flags.) 1.1493 + fParenStack.push(fModeFlags, *fStatus); 1.1494 + fParenStack.push(flags, *fStatus); // Frame Marker 1.1495 + fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP 1.1496 + fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP 1.1497 + 1.1498 + // Set the current mode flags to the new values. 1.1499 + U_ASSERT(fNewModeFlags < 0); 1.1500 + fModeFlags = fNewModeFlags; 1.1501 + } 1.1502 + break; 1.1503 + 1.1504 + case doBadModeFlag: 1.1505 + error(U_REGEX_INVALID_FLAG); 1.1506 + break; 1.1507 + 1.1508 + case doSuppressComments: 1.1509 + // We have just scanned a '(?'. We now need to prevent the character scanner from 1.1510 + // treating a '#' as a to-the-end-of-line comment. 1.1511 + // (This Perl compatibility just gets uglier and uglier to do...) 1.1512 + fEOLComments = FALSE; 1.1513 + break; 1.1514 + 1.1515 + 1.1516 + case doSetAddAmp: 1.1517 + { 1.1518 + UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); 1.1519 + set->add(chAmp); 1.1520 + } 1.1521 + break; 1.1522 + 1.1523 + case doSetAddDash: 1.1524 + { 1.1525 + UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); 1.1526 + set->add(chDash); 1.1527 + } 1.1528 + break; 1.1529 + 1.1530 + case doSetBackslash_s: 1.1531 + { 1.1532 + UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); 1.1533 + set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]); 1.1534 + break; 1.1535 + } 1.1536 + 1.1537 + case doSetBackslash_S: 1.1538 + { 1.1539 + UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); 1.1540 + UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]); 1.1541 + SSet.complement(); 1.1542 + set->addAll(SSet); 1.1543 + break; 1.1544 + } 1.1545 + 1.1546 + case doSetBackslash_d: 1.1547 + { 1.1548 + UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); 1.1549 + // TODO - make a static set, ticket 6058. 1.1550 + addCategory(set, U_GC_ND_MASK, *fStatus); 1.1551 + break; 1.1552 + } 1.1553 + 1.1554 + case doSetBackslash_D: 1.1555 + { 1.1556 + UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); 1.1557 + UnicodeSet digits; 1.1558 + // TODO - make a static set, ticket 6058. 1.1559 + digits.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus); 1.1560 + digits.complement(); 1.1561 + set->addAll(digits); 1.1562 + break; 1.1563 + } 1.1564 + 1.1565 + case doSetBackslash_w: 1.1566 + { 1.1567 + UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); 1.1568 + set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]); 1.1569 + break; 1.1570 + } 1.1571 + 1.1572 + case doSetBackslash_W: 1.1573 + { 1.1574 + UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); 1.1575 + UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]); 1.1576 + SSet.complement(); 1.1577 + set->addAll(SSet); 1.1578 + break; 1.1579 + } 1.1580 + 1.1581 + case doSetBegin: 1.1582 + fixLiterals(FALSE); 1.1583 + fSetStack.push(new UnicodeSet(), *fStatus); 1.1584 + fSetOpStack.push(setStart, *fStatus); 1.1585 + if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) { 1.1586 + fSetOpStack.push(setCaseClose, *fStatus); 1.1587 + } 1.1588 + break; 1.1589 + 1.1590 + case doSetBeginDifference1: 1.1591 + // We have scanned something like [[abc]-[ 1.1592 + // Set up a new UnicodeSet for the set beginning with the just-scanned '[' 1.1593 + // Push a Difference operator, which will cause the new set to be subtracted from what 1.1594 + // went before once it is created. 1.1595 + setPushOp(setDifference1); 1.1596 + fSetOpStack.push(setStart, *fStatus); 1.1597 + if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) { 1.1598 + fSetOpStack.push(setCaseClose, *fStatus); 1.1599 + } 1.1600 + break; 1.1601 + 1.1602 + case doSetBeginIntersection1: 1.1603 + // We have scanned something like [[abc]&[ 1.1604 + // Need both the '&' operator and the open '[' operator. 1.1605 + setPushOp(setIntersection1); 1.1606 + fSetOpStack.push(setStart, *fStatus); 1.1607 + if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) { 1.1608 + fSetOpStack.push(setCaseClose, *fStatus); 1.1609 + } 1.1610 + break; 1.1611 + 1.1612 + case doSetBeginUnion: 1.1613 + // We have scanned something like [[abc][ 1.1614 + // Need to handle the union operation explicitly [[abc] | [ 1.1615 + setPushOp(setUnion); 1.1616 + fSetOpStack.push(setStart, *fStatus); 1.1617 + if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) { 1.1618 + fSetOpStack.push(setCaseClose, *fStatus); 1.1619 + } 1.1620 + break; 1.1621 + 1.1622 + case doSetDifference2: 1.1623 + // We have scanned something like [abc-- 1.1624 + // Consider this to unambiguously be a set difference operator. 1.1625 + setPushOp(setDifference2); 1.1626 + break; 1.1627 + 1.1628 + case doSetEnd: 1.1629 + // Have encountered the ']' that closes a set. 1.1630 + // Force the evaluation of any pending operations within this set, 1.1631 + // leave the completed set on the top of the set stack. 1.1632 + setEval(setEnd); 1.1633 + U_ASSERT(fSetOpStack.peeki()==setStart); 1.1634 + fSetOpStack.popi(); 1.1635 + break; 1.1636 + 1.1637 + case doSetFinish: 1.1638 + { 1.1639 + // Finished a complete set expression, including all nested sets. 1.1640 + // The close bracket has already triggered clearing out pending set operators, 1.1641 + // the operator stack should be empty and the operand stack should have just 1.1642 + // one entry, the result set. 1.1643 + U_ASSERT(fSetOpStack.empty()); 1.1644 + UnicodeSet *theSet = (UnicodeSet *)fSetStack.pop(); 1.1645 + U_ASSERT(fSetStack.empty()); 1.1646 + compileSet(theSet); 1.1647 + break; 1.1648 + } 1.1649 + 1.1650 + case doSetIntersection2: 1.1651 + // Have scanned something like [abc&& 1.1652 + setPushOp(setIntersection2); 1.1653 + break; 1.1654 + 1.1655 + case doSetLiteral: 1.1656 + // Union the just-scanned literal character into the set being built. 1.1657 + // This operation is the highest precedence set operation, so we can always do 1.1658 + // it immediately, without waiting to see what follows. It is necessary to perform 1.1659 + // any pending '-' or '&' operation first, because these have the same precedence 1.1660 + // as union-ing in a literal' 1.1661 + { 1.1662 + setEval(setUnion); 1.1663 + UnicodeSet *s = (UnicodeSet *)fSetStack.peek(); 1.1664 + s->add(fC.fChar); 1.1665 + fLastSetLiteral = fC.fChar; 1.1666 + break; 1.1667 + } 1.1668 + 1.1669 + case doSetLiteralEscaped: 1.1670 + // A back-slash escaped literal character was encountered. 1.1671 + // Processing is the same as with setLiteral, above, with the addition of 1.1672 + // the optional check for errors on escaped ASCII letters. 1.1673 + { 1.1674 + if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 && 1.1675 + ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) || // in [A-Z] 1.1676 + (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) { // in [a-z] 1.1677 + error(U_REGEX_BAD_ESCAPE_SEQUENCE); 1.1678 + } 1.1679 + setEval(setUnion); 1.1680 + UnicodeSet *s = (UnicodeSet *)fSetStack.peek(); 1.1681 + s->add(fC.fChar); 1.1682 + fLastSetLiteral = fC.fChar; 1.1683 + break; 1.1684 + } 1.1685 + 1.1686 + case doSetNamedChar: 1.1687 + // Scanning a \N{UNICODE CHARACTER NAME} 1.1688 + // Aside from the source of the character, the processing is identical to doSetLiteral, 1.1689 + // above. 1.1690 + { 1.1691 + UChar32 c = scanNamedChar(); 1.1692 + setEval(setUnion); 1.1693 + UnicodeSet *s = (UnicodeSet *)fSetStack.peek(); 1.1694 + s->add(c); 1.1695 + fLastSetLiteral = c; 1.1696 + break; 1.1697 + } 1.1698 + 1.1699 + case doSetNamedRange: 1.1700 + // We have scanned literal-\N{CHAR NAME}. Add the range to the set. 1.1701 + // The left character is already in the set, and is saved in fLastSetLiteral. 1.1702 + // The right side needs to be picked up, the scan is at the 'N'. 1.1703 + // Lower Limit > Upper limit being an error matches both Java 1.1704 + // and ICU UnicodeSet behavior. 1.1705 + { 1.1706 + UChar32 c = scanNamedChar(); 1.1707 + if (U_SUCCESS(*fStatus) && fLastSetLiteral > c) { 1.1708 + error(U_REGEX_INVALID_RANGE); 1.1709 + } 1.1710 + UnicodeSet *s = (UnicodeSet *)fSetStack.peek(); 1.1711 + s->add(fLastSetLiteral, c); 1.1712 + fLastSetLiteral = c; 1.1713 + break; 1.1714 + } 1.1715 + 1.1716 + 1.1717 + case doSetNegate: 1.1718 + // Scanned a '^' at the start of a set. 1.1719 + // Push the negation operator onto the set op stack. 1.1720 + // A twist for case-insensitive matching: 1.1721 + // the case closure operation must happen _before_ negation. 1.1722 + // But the case closure operation will already be on the stack if it's required. 1.1723 + // This requires checking for case closure, and swapping the stack order 1.1724 + // if it is present. 1.1725 + { 1.1726 + int32_t tosOp = fSetOpStack.peeki(); 1.1727 + if (tosOp == setCaseClose) { 1.1728 + fSetOpStack.popi(); 1.1729 + fSetOpStack.push(setNegation, *fStatus); 1.1730 + fSetOpStack.push(setCaseClose, *fStatus); 1.1731 + } else { 1.1732 + fSetOpStack.push(setNegation, *fStatus); 1.1733 + } 1.1734 + } 1.1735 + break; 1.1736 + 1.1737 + case doSetNoCloseError: 1.1738 + error(U_REGEX_MISSING_CLOSE_BRACKET); 1.1739 + break; 1.1740 + 1.1741 + case doSetOpError: 1.1742 + error(U_REGEX_RULE_SYNTAX); // -- or && at the end of a set. Illegal. 1.1743 + break; 1.1744 + 1.1745 + case doSetPosixProp: 1.1746 + { 1.1747 + UnicodeSet *s = scanPosixProp(); 1.1748 + if (s != NULL) { 1.1749 + UnicodeSet *tos = (UnicodeSet *)fSetStack.peek(); 1.1750 + tos->addAll(*s); 1.1751 + delete s; 1.1752 + } // else error. scanProp() reported the error status already. 1.1753 + } 1.1754 + break; 1.1755 + 1.1756 + case doSetProp: 1.1757 + // Scanned a \p \P within [brackets]. 1.1758 + { 1.1759 + UnicodeSet *s = scanProp(); 1.1760 + if (s != NULL) { 1.1761 + UnicodeSet *tos = (UnicodeSet *)fSetStack.peek(); 1.1762 + tos->addAll(*s); 1.1763 + delete s; 1.1764 + } // else error. scanProp() reported the error status already. 1.1765 + } 1.1766 + break; 1.1767 + 1.1768 + 1.1769 + case doSetRange: 1.1770 + // We have scanned literal-literal. Add the range to the set. 1.1771 + // The left character is already in the set, and is saved in fLastSetLiteral. 1.1772 + // The right side is the current character. 1.1773 + // Lower Limit > Upper limit being an error matches both Java 1.1774 + // and ICU UnicodeSet behavior. 1.1775 + { 1.1776 + if (fLastSetLiteral > fC.fChar) { 1.1777 + error(U_REGEX_INVALID_RANGE); 1.1778 + } 1.1779 + UnicodeSet *s = (UnicodeSet *)fSetStack.peek(); 1.1780 + s->add(fLastSetLiteral, fC.fChar); 1.1781 + break; 1.1782 + } 1.1783 + 1.1784 + default: 1.1785 + U_ASSERT(FALSE); 1.1786 + error(U_REGEX_INTERNAL_ERROR); 1.1787 + break; 1.1788 + } 1.1789 + 1.1790 + if (U_FAILURE(*fStatus)) { 1.1791 + returnVal = FALSE; 1.1792 + } 1.1793 + 1.1794 + return returnVal; 1.1795 +} 1.1796 + 1.1797 + 1.1798 + 1.1799 +//------------------------------------------------------------------------------ 1.1800 +// 1.1801 +// literalChar We've encountered a literal character from the pattern, 1.1802 +// or an escape sequence that reduces to a character. 1.1803 +// Add it to the string containing all literal chars/strings from 1.1804 +// the pattern. 1.1805 +// 1.1806 +//------------------------------------------------------------------------------ 1.1807 +void RegexCompile::literalChar(UChar32 c) { 1.1808 + fLiteralChars.append(c); 1.1809 +} 1.1810 + 1.1811 + 1.1812 +//------------------------------------------------------------------------------ 1.1813 +// 1.1814 +// fixLiterals When compiling something that can follow a literal 1.1815 +// string in a pattern, emit the code to match the 1.1816 +// accumulated literal string. 1.1817 +// 1.1818 +// Optionally, split the last char of the string off into 1.1819 +// a single "ONE_CHAR" operation, so that quantifiers can 1.1820 +// apply to that char alone. Example: abc* 1.1821 +// The * must apply to the 'c' only. 1.1822 +// 1.1823 +//------------------------------------------------------------------------------ 1.1824 +void RegexCompile::fixLiterals(UBool split) { 1.1825 + int32_t op = 0; // An op from/for the compiled pattern. 1.1826 + 1.1827 + // If no literal characters have been scanned but not yet had code generated 1.1828 + // for them, nothing needs to be done. 1.1829 + if (fLiteralChars.length() == 0) { 1.1830 + return; 1.1831 + } 1.1832 + 1.1833 + int32_t indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1); 1.1834 + UChar32 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint); 1.1835 + 1.1836 + // Split: We need to ensure that the last item in the compiled pattern 1.1837 + // refers only to the last literal scanned in the pattern, so that 1.1838 + // quantifiers (*, +, etc.) affect only it, and not a longer string. 1.1839 + // Split before case folding for case insensitive matches. 1.1840 + 1.1841 + if (split) { 1.1842 + fLiteralChars.truncate(indexOfLastCodePoint); 1.1843 + fixLiterals(FALSE); // Recursive call, emit code to match the first part of the string. 1.1844 + // Note that the truncated literal string may be empty, in which case 1.1845 + // nothing will be emitted. 1.1846 + 1.1847 + literalChar(lastCodePoint); // Re-add the last code point as if it were a new literal. 1.1848 + fixLiterals(FALSE); // Second recursive call, code for the final code point. 1.1849 + return; 1.1850 + } 1.1851 + 1.1852 + // If we are doing case-insensitive matching, case fold the string. This may expand 1.1853 + // the string, e.g. the German sharp-s turns into "ss" 1.1854 + if (fModeFlags & UREGEX_CASE_INSENSITIVE) { 1.1855 + fLiteralChars.foldCase(); 1.1856 + indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1); 1.1857 + lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint); 1.1858 + } 1.1859 + 1.1860 + if (indexOfLastCodePoint == 0) { 1.1861 + // Single character, emit a URX_ONECHAR op to match it. 1.1862 + if ((fModeFlags & UREGEX_CASE_INSENSITIVE) && 1.1863 + u_hasBinaryProperty(lastCodePoint, UCHAR_CASE_SENSITIVE)) { 1.1864 + op = URX_BUILD(URX_ONECHAR_I, lastCodePoint); 1.1865 + } else { 1.1866 + op = URX_BUILD(URX_ONECHAR, lastCodePoint); 1.1867 + } 1.1868 + fRXPat->fCompiledPat->addElement(op, *fStatus); 1.1869 + } else { 1.1870 + // Two or more chars, emit a URX_STRING to match them. 1.1871 + if (fModeFlags & UREGEX_CASE_INSENSITIVE) { 1.1872 + op = URX_BUILD(URX_STRING_I, fRXPat->fLiteralText.length()); 1.1873 + } else { 1.1874 + // TODO here: add optimization to split case sensitive strings of length two 1.1875 + // into two single char ops, for efficiency. 1.1876 + op = URX_BUILD(URX_STRING, fRXPat->fLiteralText.length()); 1.1877 + } 1.1878 + fRXPat->fCompiledPat->addElement(op, *fStatus); 1.1879 + op = URX_BUILD(URX_STRING_LEN, fLiteralChars.length()); 1.1880 + fRXPat->fCompiledPat->addElement(op, *fStatus); 1.1881 + 1.1882 + // Add this string into the accumulated strings of the compiled pattern. 1.1883 + fRXPat->fLiteralText.append(fLiteralChars); 1.1884 + } 1.1885 + 1.1886 + fLiteralChars.remove(); 1.1887 +} 1.1888 + 1.1889 + 1.1890 + 1.1891 + 1.1892 + 1.1893 + 1.1894 +//------------------------------------------------------------------------------ 1.1895 +// 1.1896 +// insertOp() Insert a slot for a new opcode into the already 1.1897 +// compiled pattern code. 1.1898 +// 1.1899 +// Fill the slot with a NOP. Our caller will replace it 1.1900 +// with what they really wanted. 1.1901 +// 1.1902 +//------------------------------------------------------------------------------ 1.1903 +void RegexCompile::insertOp(int32_t where) { 1.1904 + UVector64 *code = fRXPat->fCompiledPat; 1.1905 + U_ASSERT(where>0 && where < code->size()); 1.1906 + 1.1907 + int32_t nop = URX_BUILD(URX_NOP, 0); 1.1908 + code->insertElementAt(nop, where, *fStatus); 1.1909 + 1.1910 + // Walk through the pattern, looking for any ops with targets that 1.1911 + // were moved down by the insert. Fix them. 1.1912 + int32_t loc; 1.1913 + for (loc=0; loc<code->size(); loc++) { 1.1914 + int32_t op = (int32_t)code->elementAti(loc); 1.1915 + int32_t opType = URX_TYPE(op); 1.1916 + int32_t opValue = URX_VAL(op); 1.1917 + if ((opType == URX_JMP || 1.1918 + opType == URX_JMPX || 1.1919 + opType == URX_STATE_SAVE || 1.1920 + opType == URX_CTR_LOOP || 1.1921 + opType == URX_CTR_LOOP_NG || 1.1922 + opType == URX_JMP_SAV || 1.1923 + opType == URX_JMP_SAV_X || 1.1924 + opType == URX_RELOC_OPRND) && opValue > where) { 1.1925 + // Target location for this opcode is after the insertion point and 1.1926 + // needs to be incremented to adjust for the insertion. 1.1927 + opValue++; 1.1928 + op = URX_BUILD(opType, opValue); 1.1929 + code->setElementAt(op, loc); 1.1930 + } 1.1931 + } 1.1932 + 1.1933 + // Now fix up the parentheses stack. All positive values in it are locations in 1.1934 + // the compiled pattern. (Negative values are frame boundaries, and don't need fixing.) 1.1935 + for (loc=0; loc<fParenStack.size(); loc++) { 1.1936 + int32_t x = fParenStack.elementAti(loc); 1.1937 + U_ASSERT(x < code->size()); 1.1938 + if (x>where) { 1.1939 + x++; 1.1940 + fParenStack.setElementAt(x, loc); 1.1941 + } 1.1942 + } 1.1943 + 1.1944 + if (fMatchCloseParen > where) { 1.1945 + fMatchCloseParen++; 1.1946 + } 1.1947 + if (fMatchOpenParen > where) { 1.1948 + fMatchOpenParen++; 1.1949 + } 1.1950 +} 1.1951 + 1.1952 + 1.1953 + 1.1954 +//------------------------------------------------------------------------------ 1.1955 +// 1.1956 +// blockTopLoc() Find or create a location in the compiled pattern 1.1957 +// at the start of the operation or block that has 1.1958 +// just been compiled. Needed when a quantifier (* or 1.1959 +// whatever) appears, and we need to add an operation 1.1960 +// at the start of the thing being quantified. 1.1961 +// 1.1962 +// (Parenthesized Blocks) have a slot with a NOP that 1.1963 +// is reserved for this purpose. .* or similar don't 1.1964 +// and a slot needs to be added. 1.1965 +// 1.1966 +// parameter reserveLoc : TRUE - ensure that there is space to add an opcode 1.1967 +// at the returned location. 1.1968 +// FALSE - just return the address, 1.1969 +// do not reserve a location there. 1.1970 +// 1.1971 +//------------------------------------------------------------------------------ 1.1972 +int32_t RegexCompile::blockTopLoc(UBool reserveLoc) { 1.1973 + int32_t theLoc; 1.1974 + fixLiterals(TRUE); // Emit code for any pending literals. 1.1975 + // If last item was a string, emit separate op for the its last char. 1.1976 + if (fRXPat->fCompiledPat->size() == fMatchCloseParen) 1.1977 + { 1.1978 + // The item just processed is a parenthesized block. 1.1979 + theLoc = fMatchOpenParen; // A slot is already reserved for us. 1.1980 + U_ASSERT(theLoc > 0); 1.1981 + U_ASSERT(URX_TYPE(((uint32_t)fRXPat->fCompiledPat->elementAti(theLoc))) == URX_NOP); 1.1982 + } 1.1983 + else { 1.1984 + // Item just compiled is a single thing, a ".", or a single char, a string or a set reference. 1.1985 + // No slot for STATE_SAVE was pre-reserved in the compiled code. 1.1986 + // We need to make space now. 1.1987 + theLoc = fRXPat->fCompiledPat->size()-1; 1.1988 + int32_t opAtTheLoc = (int32_t)fRXPat->fCompiledPat->elementAti(theLoc); 1.1989 + if (URX_TYPE(opAtTheLoc) == URX_STRING_LEN) { 1.1990 + // Strings take two opcode, we want the position of the first one. 1.1991 + // We can have a string at this point if a single character case-folded to two. 1.1992 + theLoc--; 1.1993 + } 1.1994 + if (reserveLoc) { 1.1995 + int32_t nop = URX_BUILD(URX_NOP, 0); 1.1996 + fRXPat->fCompiledPat->insertElementAt(nop, theLoc, *fStatus); 1.1997 + } 1.1998 + } 1.1999 + return theLoc; 1.2000 +} 1.2001 + 1.2002 + 1.2003 + 1.2004 +//------------------------------------------------------------------------------ 1.2005 +// 1.2006 +// handleCloseParen When compiling a close paren, we need to go back 1.2007 +// and fix up any JMP or SAVE operations within the 1.2008 +// parenthesized block that need to target the end 1.2009 +// of the block. The locations of these are kept on 1.2010 +// the paretheses stack. 1.2011 +// 1.2012 +// This function is called both when encountering a 1.2013 +// real ) and at the end of the pattern. 1.2014 +// 1.2015 +//------------------------------------------------------------------------------ 1.2016 +void RegexCompile::handleCloseParen() { 1.2017 + int32_t patIdx; 1.2018 + int32_t patOp; 1.2019 + if (fParenStack.size() <= 0) { 1.2020 + error(U_REGEX_MISMATCHED_PAREN); 1.2021 + return; 1.2022 + } 1.2023 + 1.2024 + // Emit code for any pending literals. 1.2025 + fixLiterals(FALSE); 1.2026 + 1.2027 + // Fixup any operations within the just-closed parenthesized group 1.2028 + // that need to reference the end of the (block). 1.2029 + // (The first one popped from the stack is an unused slot for 1.2030 + // alternation (OR) state save, but applying the fixup to it does no harm.) 1.2031 + for (;;) { 1.2032 + patIdx = fParenStack.popi(); 1.2033 + if (patIdx < 0) { 1.2034 + // value < 0 flags the start of the frame on the paren stack. 1.2035 + break; 1.2036 + } 1.2037 + U_ASSERT(patIdx>0 && patIdx <= fRXPat->fCompiledPat->size()); 1.2038 + patOp = (int32_t)fRXPat->fCompiledPat->elementAti(patIdx); 1.2039 + U_ASSERT(URX_VAL(patOp) == 0); // Branch target for JMP should not be set. 1.2040 + patOp |= fRXPat->fCompiledPat->size(); // Set it now. 1.2041 + fRXPat->fCompiledPat->setElementAt(patOp, patIdx); 1.2042 + fMatchOpenParen = patIdx; 1.2043 + } 1.2044 + 1.2045 + // At the close of any parenthesized block, restore the match mode flags to 1.2046 + // the value they had at the open paren. Saved value is 1.2047 + // at the top of the paren stack. 1.2048 + fModeFlags = fParenStack.popi(); 1.2049 + U_ASSERT(fModeFlags < 0); 1.2050 + 1.2051 + // DO any additional fixups, depending on the specific kind of 1.2052 + // parentesized grouping this is 1.2053 + 1.2054 + switch (patIdx) { 1.2055 + case plain: 1.2056 + case flags: 1.2057 + // No additional fixups required. 1.2058 + // (Grouping-only parentheses) 1.2059 + break; 1.2060 + case capturing: 1.2061 + // Capturing Parentheses. 1.2062 + // Insert a End Capture op into the pattern. 1.2063 + // The frame offset of the variables for this cg is obtained from the 1.2064 + // start capture op and put it into the end-capture op. 1.2065 + { 1.2066 + int32_t captureOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1); 1.2067 + U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE); 1.2068 + 1.2069 + int32_t frameVarLocation = URX_VAL(captureOp); 1.2070 + int32_t endCaptureOp = URX_BUILD(URX_END_CAPTURE, frameVarLocation); 1.2071 + fRXPat->fCompiledPat->addElement(endCaptureOp, *fStatus); 1.2072 + } 1.2073 + break; 1.2074 + case atomic: 1.2075 + // Atomic Parenthesis. 1.2076 + // Insert a LD_SP operation to restore the state stack to the position 1.2077 + // it was when the atomic parens were entered. 1.2078 + { 1.2079 + int32_t stoOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1); 1.2080 + U_ASSERT(URX_TYPE(stoOp) == URX_STO_SP); 1.2081 + int32_t stoLoc = URX_VAL(stoOp); 1.2082 + int32_t ldOp = URX_BUILD(URX_LD_SP, stoLoc); 1.2083 + fRXPat->fCompiledPat->addElement(ldOp, *fStatus); 1.2084 + } 1.2085 + break; 1.2086 + 1.2087 + case lookAhead: 1.2088 + { 1.2089 + int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5); 1.2090 + U_ASSERT(URX_TYPE(startOp) == URX_LA_START); 1.2091 + int32_t dataLoc = URX_VAL(startOp); 1.2092 + int32_t op = URX_BUILD(URX_LA_END, dataLoc); 1.2093 + fRXPat->fCompiledPat->addElement(op, *fStatus); 1.2094 + } 1.2095 + break; 1.2096 + 1.2097 + case negLookAhead: 1.2098 + { 1.2099 + // See comment at doOpenLookAheadNeg 1.2100 + int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-1); 1.2101 + U_ASSERT(URX_TYPE(startOp) == URX_LA_START); 1.2102 + int32_t dataLoc = URX_VAL(startOp); 1.2103 + int32_t op = URX_BUILD(URX_LA_END, dataLoc); 1.2104 + fRXPat->fCompiledPat->addElement(op, *fStatus); 1.2105 + op = URX_BUILD(URX_BACKTRACK, 0); 1.2106 + fRXPat->fCompiledPat->addElement(op, *fStatus); 1.2107 + op = URX_BUILD(URX_LA_END, dataLoc); 1.2108 + fRXPat->fCompiledPat->addElement(op, *fStatus); 1.2109 + 1.2110 + // Patch the URX_SAVE near the top of the block. 1.2111 + // The destination of the SAVE is the final LA_END that was just added. 1.2112 + int32_t saveOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen); 1.2113 + U_ASSERT(URX_TYPE(saveOp) == URX_STATE_SAVE); 1.2114 + int32_t dest = fRXPat->fCompiledPat->size()-1; 1.2115 + saveOp = URX_BUILD(URX_STATE_SAVE, dest); 1.2116 + fRXPat->fCompiledPat->setElementAt(saveOp, fMatchOpenParen); 1.2117 + } 1.2118 + break; 1.2119 + 1.2120 + case lookBehind: 1.2121 + { 1.2122 + // See comment at doOpenLookBehind. 1.2123 + 1.2124 + // Append the URX_LB_END and URX_LA_END to the compiled pattern. 1.2125 + int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-4); 1.2126 + U_ASSERT(URX_TYPE(startOp) == URX_LB_START); 1.2127 + int32_t dataLoc = URX_VAL(startOp); 1.2128 + int32_t op = URX_BUILD(URX_LB_END, dataLoc); 1.2129 + fRXPat->fCompiledPat->addElement(op, *fStatus); 1.2130 + op = URX_BUILD(URX_LA_END, dataLoc); 1.2131 + fRXPat->fCompiledPat->addElement(op, *fStatus); 1.2132 + 1.2133 + // Determine the min and max bounds for the length of the 1.2134 + // string that the pattern can match. 1.2135 + // An unbounded upper limit is an error. 1.2136 + int32_t patEnd = fRXPat->fCompiledPat->size() - 1; 1.2137 + int32_t minML = minMatchLength(fMatchOpenParen, patEnd); 1.2138 + int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd); 1.2139 + if (maxML == INT32_MAX) { 1.2140 + error(U_REGEX_LOOK_BEHIND_LIMIT); 1.2141 + break; 1.2142 + } 1.2143 + U_ASSERT(minML <= maxML); 1.2144 + 1.2145 + // Insert the min and max match len bounds into the URX_LB_CONT op that 1.2146 + // appears at the top of the look-behind block, at location fMatchOpenParen+1 1.2147 + fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-2); 1.2148 + fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-1); 1.2149 + 1.2150 + } 1.2151 + break; 1.2152 + 1.2153 + 1.2154 + 1.2155 + case lookBehindN: 1.2156 + { 1.2157 + // See comment at doOpenLookBehindNeg. 1.2158 + 1.2159 + // Append the URX_LBN_END to the compiled pattern. 1.2160 + int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5); 1.2161 + U_ASSERT(URX_TYPE(startOp) == URX_LB_START); 1.2162 + int32_t dataLoc = URX_VAL(startOp); 1.2163 + int32_t op = URX_BUILD(URX_LBN_END, dataLoc); 1.2164 + fRXPat->fCompiledPat->addElement(op, *fStatus); 1.2165 + 1.2166 + // Determine the min and max bounds for the length of the 1.2167 + // string that the pattern can match. 1.2168 + // An unbounded upper limit is an error. 1.2169 + int32_t patEnd = fRXPat->fCompiledPat->size() - 1; 1.2170 + int32_t minML = minMatchLength(fMatchOpenParen, patEnd); 1.2171 + int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd); 1.2172 + if (maxML == INT32_MAX) { 1.2173 + error(U_REGEX_LOOK_BEHIND_LIMIT); 1.2174 + break; 1.2175 + } 1.2176 + U_ASSERT(minML <= maxML); 1.2177 + 1.2178 + // Insert the min and max match len bounds into the URX_LB_CONT op that 1.2179 + // appears at the top of the look-behind block, at location fMatchOpenParen+1 1.2180 + fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-3); 1.2181 + fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-2); 1.2182 + 1.2183 + // Insert the pattern location to continue at after a successful match 1.2184 + // as the last operand of the URX_LBN_CONT 1.2185 + op = URX_BUILD(URX_RELOC_OPRND, fRXPat->fCompiledPat->size()); 1.2186 + fRXPat->fCompiledPat->setElementAt(op, fMatchOpenParen-1); 1.2187 + } 1.2188 + break; 1.2189 + 1.2190 + 1.2191 + 1.2192 + default: 1.2193 + U_ASSERT(FALSE); 1.2194 + } 1.2195 + 1.2196 + // remember the next location in the compiled pattern. 1.2197 + // The compilation of Quantifiers will look at this to see whether its looping 1.2198 + // over a parenthesized block or a single item 1.2199 + fMatchCloseParen = fRXPat->fCompiledPat->size(); 1.2200 +} 1.2201 + 1.2202 + 1.2203 + 1.2204 +//------------------------------------------------------------------------------ 1.2205 +// 1.2206 +// compileSet Compile the pattern operations for a reference to a 1.2207 +// UnicodeSet. 1.2208 +// 1.2209 +//------------------------------------------------------------------------------ 1.2210 +void RegexCompile::compileSet(UnicodeSet *theSet) 1.2211 +{ 1.2212 + if (theSet == NULL) { 1.2213 + return; 1.2214 + } 1.2215 + // Remove any strings from the set. 1.2216 + // There shoudn't be any, but just in case. 1.2217 + // (Case Closure can add them; if we had a simple case closure avaialble that 1.2218 + // ignored strings, that would be better.) 1.2219 + theSet->removeAllStrings(); 1.2220 + int32_t setSize = theSet->size(); 1.2221 + 1.2222 + switch (setSize) { 1.2223 + case 0: 1.2224 + { 1.2225 + // Set of no elements. Always fails to match. 1.2226 + fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKTRACK, 0), *fStatus); 1.2227 + delete theSet; 1.2228 + } 1.2229 + break; 1.2230 + 1.2231 + case 1: 1.2232 + { 1.2233 + // The set contains only a single code point. Put it into 1.2234 + // the compiled pattern as a single char operation rather 1.2235 + // than a set, and discard the set itself. 1.2236 + literalChar(theSet->charAt(0)); 1.2237 + delete theSet; 1.2238 + } 1.2239 + break; 1.2240 + 1.2241 + default: 1.2242 + { 1.2243 + // The set contains two or more chars. (the normal case) 1.2244 + // Put it into the compiled pattern as a set. 1.2245 + int32_t setNumber = fRXPat->fSets->size(); 1.2246 + fRXPat->fSets->addElement(theSet, *fStatus); 1.2247 + int32_t setOp = URX_BUILD(URX_SETREF, setNumber); 1.2248 + fRXPat->fCompiledPat->addElement(setOp, *fStatus); 1.2249 + } 1.2250 + } 1.2251 +} 1.2252 + 1.2253 + 1.2254 +//------------------------------------------------------------------------------ 1.2255 +// 1.2256 +// compileInterval Generate the code for a {min, max} style interval quantifier. 1.2257 +// Except for the specific opcodes used, the code is the same 1.2258 +// for all three types (greedy, non-greedy, possessive) of 1.2259 +// intervals. The opcodes are supplied as parameters. 1.2260 +// (There are two sets of opcodes - greedy & possessive use the 1.2261 +// same ones, while non-greedy has it's own.) 1.2262 +// 1.2263 +// The code for interval loops has this form: 1.2264 +// 0 CTR_INIT counter loc (in stack frame) 1.2265 +// 1 5 patt address of CTR_LOOP at bottom of block 1.2266 +// 2 min count 1.2267 +// 3 max count (-1 for unbounded) 1.2268 +// 4 ... block to be iterated over 1.2269 +// 5 CTR_LOOP 1.2270 +// 1.2271 +// In 1.2272 +//------------------------------------------------------------------------------ 1.2273 +void RegexCompile::compileInterval(int32_t InitOp, int32_t LoopOp) 1.2274 +{ 1.2275 + // The CTR_INIT op at the top of the block with the {n,m} quantifier takes 1.2276 + // four slots in the compiled code. Reserve them. 1.2277 + int32_t topOfBlock = blockTopLoc(TRUE); 1.2278 + insertOp(topOfBlock); 1.2279 + insertOp(topOfBlock); 1.2280 + insertOp(topOfBlock); 1.2281 + 1.2282 + // The operands for the CTR_INIT opcode include the index in the matcher data 1.2283 + // of the counter. Allocate it now. There are two data items 1.2284 + // counterLoc --> Loop counter 1.2285 + // +1 --> Input index (for breaking non-progressing loops) 1.2286 + // (Only present if unbounded upper limit on loop) 1.2287 + int32_t counterLoc = fRXPat->fFrameSize; 1.2288 + fRXPat->fFrameSize++; 1.2289 + if (fIntervalUpper < 0) { 1.2290 + fRXPat->fFrameSize++; 1.2291 + } 1.2292 + 1.2293 + int32_t op = URX_BUILD(InitOp, counterLoc); 1.2294 + fRXPat->fCompiledPat->setElementAt(op, topOfBlock); 1.2295 + 1.2296 + // The second operand of CTR_INIT is the location following the end of the loop. 1.2297 + // Must put in as a URX_RELOC_OPRND so that the value will be adjusted if the 1.2298 + // compilation of something later on causes the code to grow and the target 1.2299 + // position to move. 1.2300 + int32_t loopEnd = fRXPat->fCompiledPat->size(); 1.2301 + op = URX_BUILD(URX_RELOC_OPRND, loopEnd); 1.2302 + fRXPat->fCompiledPat->setElementAt(op, topOfBlock+1); 1.2303 + 1.2304 + // Followed by the min and max counts. 1.2305 + fRXPat->fCompiledPat->setElementAt(fIntervalLow, topOfBlock+2); 1.2306 + fRXPat->fCompiledPat->setElementAt(fIntervalUpper, topOfBlock+3); 1.2307 + 1.2308 + // Apend the CTR_LOOP op. The operand is the location of the CTR_INIT op. 1.2309 + // Goes at end of the block being looped over, so just append to the code so far. 1.2310 + op = URX_BUILD(LoopOp, topOfBlock); 1.2311 + fRXPat->fCompiledPat->addElement(op, *fStatus); 1.2312 + 1.2313 + if ((fIntervalLow & 0xff000000) != 0 || 1.2314 + (fIntervalUpper > 0 && (fIntervalUpper & 0xff000000) != 0)) { 1.2315 + error(U_REGEX_NUMBER_TOO_BIG); 1.2316 + } 1.2317 + 1.2318 + if (fIntervalLow > fIntervalUpper && fIntervalUpper != -1) { 1.2319 + error(U_REGEX_MAX_LT_MIN); 1.2320 + } 1.2321 +} 1.2322 + 1.2323 + 1.2324 + 1.2325 +UBool RegexCompile::compileInlineInterval() { 1.2326 + if (fIntervalUpper > 10 || fIntervalUpper < fIntervalLow) { 1.2327 + // Too big to inline. Fail, which will cause looping code to be generated. 1.2328 + // (Upper < Lower picks up unbounded upper and errors, both.) 1.2329 + return FALSE; 1.2330 + } 1.2331 + 1.2332 + int32_t topOfBlock = blockTopLoc(FALSE); 1.2333 + if (fIntervalUpper == 0) { 1.2334 + // Pathological case. Attempt no matches, as if the block doesn't exist. 1.2335 + fRXPat->fCompiledPat->setSize(topOfBlock); 1.2336 + return TRUE; 1.2337 + } 1.2338 + 1.2339 + if (topOfBlock != fRXPat->fCompiledPat->size()-1 && fIntervalUpper != 1) { 1.2340 + // The thing being repeated is not a single op, but some 1.2341 + // more complex block. Do it as a loop, not inlines. 1.2342 + // Note that things "repeated" a max of once are handled as inline, because 1.2343 + // the one copy of the code already generated is just fine. 1.2344 + return FALSE; 1.2345 + } 1.2346 + 1.2347 + // Pick up the opcode that is to be repeated 1.2348 + // 1.2349 + int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(topOfBlock); 1.2350 + 1.2351 + // Compute the pattern location where the inline sequence 1.2352 + // will end, and set up the state save op that will be needed. 1.2353 + // 1.2354 + int32_t endOfSequenceLoc = fRXPat->fCompiledPat->size()-1 1.2355 + + fIntervalUpper + (fIntervalUpper-fIntervalLow); 1.2356 + int32_t saveOp = URX_BUILD(URX_STATE_SAVE, endOfSequenceLoc); 1.2357 + if (fIntervalLow == 0) { 1.2358 + insertOp(topOfBlock); 1.2359 + fRXPat->fCompiledPat->setElementAt(saveOp, topOfBlock); 1.2360 + } 1.2361 + 1.2362 + 1.2363 + 1.2364 + // Loop, emitting the op for the thing being repeated each time. 1.2365 + // Loop starts at 1 because one instance of the op already exists in the pattern, 1.2366 + // it was put there when it was originally encountered. 1.2367 + int32_t i; 1.2368 + for (i=1; i<fIntervalUpper; i++ ) { 1.2369 + if (i == fIntervalLow) { 1.2370 + fRXPat->fCompiledPat->addElement(saveOp, *fStatus); 1.2371 + } 1.2372 + if (i > fIntervalLow) { 1.2373 + fRXPat->fCompiledPat->addElement(saveOp, *fStatus); 1.2374 + } 1.2375 + fRXPat->fCompiledPat->addElement(op, *fStatus); 1.2376 + } 1.2377 + return TRUE; 1.2378 +} 1.2379 + 1.2380 + 1.2381 + 1.2382 +//------------------------------------------------------------------------------ 1.2383 +// 1.2384 +// matchStartType Determine how a match can start. 1.2385 +// Used to optimize find() operations. 1.2386 +// 1.2387 +// Operation is very similar to minMatchLength(). Walk the compiled 1.2388 +// pattern, keeping an on-going minimum-match-length. For any 1.2389 +// op where the min match coming in is zero, add that ops possible 1.2390 +// starting matches to the possible starts for the overall pattern. 1.2391 +// 1.2392 +//------------------------------------------------------------------------------ 1.2393 +void RegexCompile::matchStartType() { 1.2394 + if (U_FAILURE(*fStatus)) { 1.2395 + return; 1.2396 + } 1.2397 + 1.2398 + 1.2399 + int32_t loc; // Location in the pattern of the current op being processed. 1.2400 + int32_t op; // The op being processed 1.2401 + int32_t opType; // The opcode type of the op 1.2402 + int32_t currentLen = 0; // Minimum length of a match to this point (loc) in the pattern 1.2403 + int32_t numInitialStrings = 0; // Number of strings encountered that could match at start. 1.2404 + 1.2405 + UBool atStart = TRUE; // True if no part of the pattern yet encountered 1.2406 + // could have advanced the position in a match. 1.2407 + // (Maximum match length so far == 0) 1.2408 + 1.2409 + // forwardedLength is a vector holding minimum-match-length values that 1.2410 + // are propagated forward in the pattern by JMP or STATE_SAVE operations. 1.2411 + // It must be one longer than the pattern being checked because some ops 1.2412 + // will jmp to a end-of-block+1 location from within a block, and we must 1.2413 + // count those when checking the block. 1.2414 + int32_t end = fRXPat->fCompiledPat->size(); 1.2415 + UVector32 forwardedLength(end+1, *fStatus); 1.2416 + forwardedLength.setSize(end+1); 1.2417 + for (loc=3; loc<end; loc++) { 1.2418 + forwardedLength.setElementAt(INT32_MAX, loc); 1.2419 + } 1.2420 + 1.2421 + for (loc = 3; loc<end; loc++) { 1.2422 + op = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 1.2423 + opType = URX_TYPE(op); 1.2424 + 1.2425 + // The loop is advancing linearly through the pattern. 1.2426 + // If the op we are now at was the destination of a branch in the pattern, 1.2427 + // and that path has a shorter minimum length than the current accumulated value, 1.2428 + // replace the current accumulated value. 1.2429 + if (forwardedLength.elementAti(loc) < currentLen) { 1.2430 + currentLen = forwardedLength.elementAti(loc); 1.2431 + U_ASSERT(currentLen>=0 && currentLen < INT32_MAX); 1.2432 + } 1.2433 + 1.2434 + switch (opType) { 1.2435 + // Ops that don't change the total length matched 1.2436 + case URX_RESERVED_OP: 1.2437 + case URX_END: 1.2438 + case URX_FAIL: 1.2439 + case URX_STRING_LEN: 1.2440 + case URX_NOP: 1.2441 + case URX_START_CAPTURE: 1.2442 + case URX_END_CAPTURE: 1.2443 + case URX_BACKSLASH_B: 1.2444 + case URX_BACKSLASH_BU: 1.2445 + case URX_BACKSLASH_G: 1.2446 + case URX_BACKSLASH_Z: 1.2447 + case URX_DOLLAR: 1.2448 + case URX_DOLLAR_M: 1.2449 + case URX_DOLLAR_D: 1.2450 + case URX_DOLLAR_MD: 1.2451 + case URX_RELOC_OPRND: 1.2452 + case URX_STO_INP_LOC: 1.2453 + case URX_BACKREF: // BackRef. Must assume that it might be a zero length match 1.2454 + case URX_BACKREF_I: 1.2455 + 1.2456 + case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match. 1.2457 + case URX_LD_SP: 1.2458 + break; 1.2459 + 1.2460 + case URX_CARET: 1.2461 + if (atStart) { 1.2462 + fRXPat->fStartType = START_START; 1.2463 + } 1.2464 + break; 1.2465 + 1.2466 + case URX_CARET_M: 1.2467 + case URX_CARET_M_UNIX: 1.2468 + if (atStart) { 1.2469 + fRXPat->fStartType = START_LINE; 1.2470 + } 1.2471 + break; 1.2472 + 1.2473 + case URX_ONECHAR: 1.2474 + if (currentLen == 0) { 1.2475 + // This character could appear at the start of a match. 1.2476 + // Add it to the set of possible starting characters. 1.2477 + fRXPat->fInitialChars->add(URX_VAL(op)); 1.2478 + numInitialStrings += 2; 1.2479 + } 1.2480 + currentLen++; 1.2481 + atStart = FALSE; 1.2482 + break; 1.2483 + 1.2484 + 1.2485 + case URX_SETREF: 1.2486 + if (currentLen == 0) { 1.2487 + int32_t sn = URX_VAL(op); 1.2488 + U_ASSERT(sn > 0 && sn < fRXPat->fSets->size()); 1.2489 + const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn); 1.2490 + fRXPat->fInitialChars->addAll(*s); 1.2491 + numInitialStrings += 2; 1.2492 + } 1.2493 + currentLen++; 1.2494 + atStart = FALSE; 1.2495 + break; 1.2496 + 1.2497 + case URX_LOOP_SR_I: 1.2498 + // [Set]*, like a SETREF, above, in what it can match, 1.2499 + // but may not match at all, so currentLen is not incremented. 1.2500 + if (currentLen == 0) { 1.2501 + int32_t sn = URX_VAL(op); 1.2502 + U_ASSERT(sn > 0 && sn < fRXPat->fSets->size()); 1.2503 + const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn); 1.2504 + fRXPat->fInitialChars->addAll(*s); 1.2505 + numInitialStrings += 2; 1.2506 + } 1.2507 + atStart = FALSE; 1.2508 + break; 1.2509 + 1.2510 + case URX_LOOP_DOT_I: 1.2511 + if (currentLen == 0) { 1.2512 + // .* at the start of a pattern. 1.2513 + // Any character can begin the match. 1.2514 + fRXPat->fInitialChars->clear(); 1.2515 + fRXPat->fInitialChars->complement(); 1.2516 + numInitialStrings += 2; 1.2517 + } 1.2518 + atStart = FALSE; 1.2519 + break; 1.2520 + 1.2521 + 1.2522 + case URX_STATIC_SETREF: 1.2523 + if (currentLen == 0) { 1.2524 + int32_t sn = URX_VAL(op); 1.2525 + U_ASSERT(sn>0 && sn<URX_LAST_SET); 1.2526 + const UnicodeSet *s = fRXPat->fStaticSets[sn]; 1.2527 + fRXPat->fInitialChars->addAll(*s); 1.2528 + numInitialStrings += 2; 1.2529 + } 1.2530 + currentLen++; 1.2531 + atStart = FALSE; 1.2532 + break; 1.2533 + 1.2534 + 1.2535 + 1.2536 + case URX_STAT_SETREF_N: 1.2537 + if (currentLen == 0) { 1.2538 + int32_t sn = URX_VAL(op); 1.2539 + const UnicodeSet *s = fRXPat->fStaticSets[sn]; 1.2540 + UnicodeSet sc(*s); 1.2541 + sc.complement(); 1.2542 + fRXPat->fInitialChars->addAll(sc); 1.2543 + numInitialStrings += 2; 1.2544 + } 1.2545 + currentLen++; 1.2546 + atStart = FALSE; 1.2547 + break; 1.2548 + 1.2549 + 1.2550 + 1.2551 + case URX_BACKSLASH_D: 1.2552 + // Digit Char 1.2553 + if (currentLen == 0) { 1.2554 + UnicodeSet s; 1.2555 + s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus); 1.2556 + if (URX_VAL(op) != 0) { 1.2557 + s.complement(); 1.2558 + } 1.2559 + fRXPat->fInitialChars->addAll(s); 1.2560 + numInitialStrings += 2; 1.2561 + } 1.2562 + currentLen++; 1.2563 + atStart = FALSE; 1.2564 + break; 1.2565 + 1.2566 + 1.2567 + case URX_ONECHAR_I: 1.2568 + // Case Insensitive Single Character. 1.2569 + if (currentLen == 0) { 1.2570 + UChar32 c = URX_VAL(op); 1.2571 + if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) { 1.2572 + 1.2573 + // Disable optimizations on first char of match. 1.2574 + // TODO: Compute the set of chars that case fold to this char, or to 1.2575 + // a string that begins with this char. 1.2576 + // For simple case folding, this code worked: 1.2577 + // UnicodeSet s(c, c); 1.2578 + // s.closeOver(USET_CASE_INSENSITIVE); 1.2579 + // fRXPat->fInitialChars->addAll(s); 1.2580 + 1.2581 + fRXPat->fInitialChars->clear(); 1.2582 + fRXPat->fInitialChars->complement(); 1.2583 + } else { 1.2584 + // Char has no case variants. Just add it as-is to the 1.2585 + // set of possible starting chars. 1.2586 + fRXPat->fInitialChars->add(c); 1.2587 + } 1.2588 + numInitialStrings += 2; 1.2589 + } 1.2590 + currentLen++; 1.2591 + atStart = FALSE; 1.2592 + break; 1.2593 + 1.2594 + 1.2595 + case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded. 1.2596 + case URX_DOTANY_ALL: // . matches one or two. 1.2597 + case URX_DOTANY: 1.2598 + case URX_DOTANY_UNIX: 1.2599 + if (currentLen == 0) { 1.2600 + // These constructs are all bad news when they appear at the start 1.2601 + // of a match. Any character can begin the match. 1.2602 + fRXPat->fInitialChars->clear(); 1.2603 + fRXPat->fInitialChars->complement(); 1.2604 + numInitialStrings += 2; 1.2605 + } 1.2606 + currentLen++; 1.2607 + atStart = FALSE; 1.2608 + break; 1.2609 + 1.2610 + 1.2611 + case URX_JMPX: 1.2612 + loc++; // Except for extra operand on URX_JMPX, same as URX_JMP. 1.2613 + case URX_JMP: 1.2614 + { 1.2615 + int32_t jmpDest = URX_VAL(op); 1.2616 + if (jmpDest < loc) { 1.2617 + // Loop of some kind. Can safely ignore, the worst that will happen 1.2618 + // is that we understate the true minimum length 1.2619 + currentLen = forwardedLength.elementAti(loc+1); 1.2620 + 1.2621 + } else { 1.2622 + // Forward jump. Propagate the current min length to the target loc of the jump. 1.2623 + U_ASSERT(jmpDest <= end+1); 1.2624 + if (forwardedLength.elementAti(jmpDest) > currentLen) { 1.2625 + forwardedLength.setElementAt(currentLen, jmpDest); 1.2626 + } 1.2627 + } 1.2628 + } 1.2629 + atStart = FALSE; 1.2630 + break; 1.2631 + 1.2632 + case URX_JMP_SAV: 1.2633 + case URX_JMP_SAV_X: 1.2634 + // Combo of state save to the next loc, + jmp backwards. 1.2635 + // Net effect on min. length computation is nothing. 1.2636 + atStart = FALSE; 1.2637 + break; 1.2638 + 1.2639 + case URX_BACKTRACK: 1.2640 + // Fails are kind of like a branch, except that the min length was 1.2641 + // propagated already, by the state save. 1.2642 + currentLen = forwardedLength.elementAti(loc+1); 1.2643 + atStart = FALSE; 1.2644 + break; 1.2645 + 1.2646 + 1.2647 + case URX_STATE_SAVE: 1.2648 + { 1.2649 + // State Save, for forward jumps, propagate the current minimum. 1.2650 + // of the state save. 1.2651 + int32_t jmpDest = URX_VAL(op); 1.2652 + if (jmpDest > loc) { 1.2653 + if (currentLen < forwardedLength.elementAti(jmpDest)) { 1.2654 + forwardedLength.setElementAt(currentLen, jmpDest); 1.2655 + } 1.2656 + } 1.2657 + } 1.2658 + atStart = FALSE; 1.2659 + break; 1.2660 + 1.2661 + 1.2662 + 1.2663 + 1.2664 + case URX_STRING: 1.2665 + { 1.2666 + loc++; 1.2667 + int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 1.2668 + int32_t stringLen = URX_VAL(stringLenOp); 1.2669 + U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN); 1.2670 + U_ASSERT(stringLenOp >= 2); 1.2671 + if (currentLen == 0) { 1.2672 + // Add the starting character of this string to the set of possible starting 1.2673 + // characters for this pattern. 1.2674 + int32_t stringStartIdx = URX_VAL(op); 1.2675 + UChar32 c = fRXPat->fLiteralText.char32At(stringStartIdx); 1.2676 + fRXPat->fInitialChars->add(c); 1.2677 + 1.2678 + // Remember this string. After the entire pattern has been checked, 1.2679 + // if nothing else is identified that can start a match, we'll use it. 1.2680 + numInitialStrings++; 1.2681 + fRXPat->fInitialStringIdx = stringStartIdx; 1.2682 + fRXPat->fInitialStringLen = stringLen; 1.2683 + } 1.2684 + 1.2685 + currentLen += stringLen; 1.2686 + atStart = FALSE; 1.2687 + } 1.2688 + break; 1.2689 + 1.2690 + case URX_STRING_I: 1.2691 + { 1.2692 + // Case-insensitive string. Unlike exact-match strings, we won't 1.2693 + // attempt a string search for possible match positions. But we 1.2694 + // do update the set of possible starting characters. 1.2695 + loc++; 1.2696 + int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 1.2697 + int32_t stringLen = URX_VAL(stringLenOp); 1.2698 + U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN); 1.2699 + U_ASSERT(stringLenOp >= 2); 1.2700 + if (currentLen == 0) { 1.2701 + // Add the starting character of this string to the set of possible starting 1.2702 + // characters for this pattern. 1.2703 + int32_t stringStartIdx = URX_VAL(op); 1.2704 + UChar32 c = fRXPat->fLiteralText.char32At(stringStartIdx); 1.2705 + UnicodeSet s(c, c); 1.2706 + 1.2707 + // TODO: compute correct set of starting chars for full case folding. 1.2708 + // For the moment, say any char can start. 1.2709 + // s.closeOver(USET_CASE_INSENSITIVE); 1.2710 + s.clear(); 1.2711 + s.complement(); 1.2712 + 1.2713 + fRXPat->fInitialChars->addAll(s); 1.2714 + numInitialStrings += 2; // Matching on an initial string not possible. 1.2715 + } 1.2716 + currentLen += stringLen; 1.2717 + atStart = FALSE; 1.2718 + } 1.2719 + break; 1.2720 + 1.2721 + case URX_CTR_INIT: 1.2722 + case URX_CTR_INIT_NG: 1.2723 + { 1.2724 + // Loop Init Ops. These don't change the min length, but they are 4 word ops 1.2725 + // so location must be updated accordingly. 1.2726 + // Loop Init Ops. 1.2727 + // If the min loop count == 0 1.2728 + // move loc forwards to the end of the loop, skipping over the body. 1.2729 + // If the min count is > 0, 1.2730 + // continue normal processing of the body of the loop. 1.2731 + int32_t loopEndLoc = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1); 1.2732 + loopEndLoc = URX_VAL(loopEndLoc); 1.2733 + int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2); 1.2734 + if (minLoopCount == 0) { 1.2735 + // Min Loop Count of 0, treat like a forward branch and 1.2736 + // move the current minimum length up to the target 1.2737 + // (end of loop) location. 1.2738 + U_ASSERT(loopEndLoc <= end+1); 1.2739 + if (forwardedLength.elementAti(loopEndLoc) > currentLen) { 1.2740 + forwardedLength.setElementAt(currentLen, loopEndLoc); 1.2741 + } 1.2742 + } 1.2743 + loc+=3; // Skips over operands of CTR_INIT 1.2744 + } 1.2745 + atStart = FALSE; 1.2746 + break; 1.2747 + 1.2748 + 1.2749 + case URX_CTR_LOOP: 1.2750 + case URX_CTR_LOOP_NG: 1.2751 + // Loop ops. 1.2752 + // The jump is conditional, backwards only. 1.2753 + atStart = FALSE; 1.2754 + break; 1.2755 + 1.2756 + case URX_LOOP_C: 1.2757 + // More loop ops. These state-save to themselves. 1.2758 + // don't change the minimum match 1.2759 + atStart = FALSE; 1.2760 + break; 1.2761 + 1.2762 + 1.2763 + case URX_LA_START: 1.2764 + case URX_LB_START: 1.2765 + { 1.2766 + // Look-around. Scan forward until the matching look-ahead end, 1.2767 + // without processing the look-around block. This is overly pessimistic. 1.2768 + 1.2769 + // Keep track of the nesting depth of look-around blocks. Boilerplate code for 1.2770 + // lookahead contains two LA_END instructions, so count goes up by two 1.2771 + // for each LA_START. 1.2772 + int32_t depth = (opType == URX_LA_START? 2: 1); 1.2773 + for (;;) { 1.2774 + loc++; 1.2775 + op = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 1.2776 + if (URX_TYPE(op) == URX_LA_START) { 1.2777 + depth+=2; 1.2778 + } 1.2779 + if (URX_TYPE(op) == URX_LB_START) { 1.2780 + depth++; 1.2781 + } 1.2782 + if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) { 1.2783 + depth--; 1.2784 + if (depth == 0) { 1.2785 + break; 1.2786 + } 1.2787 + } 1.2788 + if (URX_TYPE(op) == URX_STATE_SAVE) { 1.2789 + // Need this because neg lookahead blocks will FAIL to outside 1.2790 + // of the block. 1.2791 + int32_t jmpDest = URX_VAL(op); 1.2792 + if (jmpDest > loc) { 1.2793 + if (currentLen < forwardedLength.elementAti(jmpDest)) { 1.2794 + forwardedLength.setElementAt(currentLen, jmpDest); 1.2795 + } 1.2796 + } 1.2797 + } 1.2798 + U_ASSERT(loc <= end); 1.2799 + } 1.2800 + } 1.2801 + break; 1.2802 + 1.2803 + case URX_LA_END: 1.2804 + case URX_LB_CONT: 1.2805 + case URX_LB_END: 1.2806 + case URX_LBN_CONT: 1.2807 + case URX_LBN_END: 1.2808 + U_ASSERT(FALSE); // Shouldn't get here. These ops should be 1.2809 + // consumed by the scan in URX_LA_START and LB_START 1.2810 + 1.2811 + break; 1.2812 + 1.2813 + default: 1.2814 + U_ASSERT(FALSE); 1.2815 + } 1.2816 + 1.2817 + } 1.2818 + 1.2819 + 1.2820 + // We have finished walking through the ops. Check whether some forward jump 1.2821 + // propagated a shorter length to location end+1. 1.2822 + if (forwardedLength.elementAti(end+1) < currentLen) { 1.2823 + currentLen = forwardedLength.elementAti(end+1); 1.2824 + } 1.2825 + 1.2826 + 1.2827 + fRXPat->fInitialChars8->init(fRXPat->fInitialChars); 1.2828 + 1.2829 + 1.2830 + // Sort out what we should check for when looking for candidate match start positions. 1.2831 + // In order of preference, 1.2832 + // 1. Start of input text buffer. 1.2833 + // 2. A literal string. 1.2834 + // 3. Start of line in multi-line mode. 1.2835 + // 4. A single literal character. 1.2836 + // 5. A character from a set of characters. 1.2837 + // 1.2838 + if (fRXPat->fStartType == START_START) { 1.2839 + // Match only at the start of an input text string. 1.2840 + // start type is already set. We're done. 1.2841 + } else if (numInitialStrings == 1 && fRXPat->fMinMatchLen > 0) { 1.2842 + // Match beginning only with a literal string. 1.2843 + UChar32 c = fRXPat->fLiteralText.char32At(fRXPat->fInitialStringIdx); 1.2844 + U_ASSERT(fRXPat->fInitialChars->contains(c)); 1.2845 + fRXPat->fStartType = START_STRING; 1.2846 + fRXPat->fInitialChar = c; 1.2847 + } else if (fRXPat->fStartType == START_LINE) { 1.2848 + // Match at start of line in Multi-Line mode. 1.2849 + // Nothing to do here; everything is already set. 1.2850 + } else if (fRXPat->fMinMatchLen == 0) { 1.2851 + // Zero length match possible. We could start anywhere. 1.2852 + fRXPat->fStartType = START_NO_INFO; 1.2853 + } else if (fRXPat->fInitialChars->size() == 1) { 1.2854 + // All matches begin with the same char. 1.2855 + fRXPat->fStartType = START_CHAR; 1.2856 + fRXPat->fInitialChar = fRXPat->fInitialChars->charAt(0); 1.2857 + U_ASSERT(fRXPat->fInitialChar != (UChar32)-1); 1.2858 + } else if (fRXPat->fInitialChars->contains((UChar32)0, (UChar32)0x10ffff) == FALSE && 1.2859 + fRXPat->fMinMatchLen > 0) { 1.2860 + // Matches start with a set of character smaller than the set of all chars. 1.2861 + fRXPat->fStartType = START_SET; 1.2862 + } else { 1.2863 + // Matches can start with anything 1.2864 + fRXPat->fStartType = START_NO_INFO; 1.2865 + } 1.2866 + 1.2867 + return; 1.2868 +} 1.2869 + 1.2870 + 1.2871 + 1.2872 +//------------------------------------------------------------------------------ 1.2873 +// 1.2874 +// minMatchLength Calculate the length of the shortest string that could 1.2875 +// match the specified pattern. 1.2876 +// Length is in 16 bit code units, not code points. 1.2877 +// 1.2878 +// The calculated length may not be exact. The returned 1.2879 +// value may be shorter than the actual minimum; it must 1.2880 +// never be longer. 1.2881 +// 1.2882 +// start and end are the range of p-code operations to be 1.2883 +// examined. The endpoints are included in the range. 1.2884 +// 1.2885 +//------------------------------------------------------------------------------ 1.2886 +int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) { 1.2887 + if (U_FAILURE(*fStatus)) { 1.2888 + return 0; 1.2889 + } 1.2890 + 1.2891 + U_ASSERT(start <= end); 1.2892 + U_ASSERT(end < fRXPat->fCompiledPat->size()); 1.2893 + 1.2894 + 1.2895 + int32_t loc; 1.2896 + int32_t op; 1.2897 + int32_t opType; 1.2898 + int32_t currentLen = 0; 1.2899 + 1.2900 + 1.2901 + // forwardedLength is a vector holding minimum-match-length values that 1.2902 + // are propagated forward in the pattern by JMP or STATE_SAVE operations. 1.2903 + // It must be one longer than the pattern being checked because some ops 1.2904 + // will jmp to a end-of-block+1 location from within a block, and we must 1.2905 + // count those when checking the block. 1.2906 + UVector32 forwardedLength(end+2, *fStatus); 1.2907 + forwardedLength.setSize(end+2); 1.2908 + for (loc=start; loc<=end+1; loc++) { 1.2909 + forwardedLength.setElementAt(INT32_MAX, loc); 1.2910 + } 1.2911 + 1.2912 + for (loc = start; loc<=end; loc++) { 1.2913 + op = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 1.2914 + opType = URX_TYPE(op); 1.2915 + 1.2916 + // The loop is advancing linearly through the pattern. 1.2917 + // If the op we are now at was the destination of a branch in the pattern, 1.2918 + // and that path has a shorter minimum length than the current accumulated value, 1.2919 + // replace the current accumulated value. 1.2920 + // U_ASSERT(currentLen>=0 && currentLen < INT32_MAX); // MinLength == INT32_MAX for some 1.2921 + // no-match-possible cases. 1.2922 + if (forwardedLength.elementAti(loc) < currentLen) { 1.2923 + currentLen = forwardedLength.elementAti(loc); 1.2924 + U_ASSERT(currentLen>=0 && currentLen < INT32_MAX); 1.2925 + } 1.2926 + 1.2927 + switch (opType) { 1.2928 + // Ops that don't change the total length matched 1.2929 + case URX_RESERVED_OP: 1.2930 + case URX_END: 1.2931 + case URX_STRING_LEN: 1.2932 + case URX_NOP: 1.2933 + case URX_START_CAPTURE: 1.2934 + case URX_END_CAPTURE: 1.2935 + case URX_BACKSLASH_B: 1.2936 + case URX_BACKSLASH_BU: 1.2937 + case URX_BACKSLASH_G: 1.2938 + case URX_BACKSLASH_Z: 1.2939 + case URX_CARET: 1.2940 + case URX_DOLLAR: 1.2941 + case URX_DOLLAR_M: 1.2942 + case URX_DOLLAR_D: 1.2943 + case URX_DOLLAR_MD: 1.2944 + case URX_RELOC_OPRND: 1.2945 + case URX_STO_INP_LOC: 1.2946 + case URX_CARET_M: 1.2947 + case URX_CARET_M_UNIX: 1.2948 + case URX_BACKREF: // BackRef. Must assume that it might be a zero length match 1.2949 + case URX_BACKREF_I: 1.2950 + 1.2951 + case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match. 1.2952 + case URX_LD_SP: 1.2953 + 1.2954 + case URX_JMP_SAV: 1.2955 + case URX_JMP_SAV_X: 1.2956 + break; 1.2957 + 1.2958 + 1.2959 + // Ops that match a minimum of one character (one or two 16 bit code units.) 1.2960 + // 1.2961 + case URX_ONECHAR: 1.2962 + case URX_STATIC_SETREF: 1.2963 + case URX_STAT_SETREF_N: 1.2964 + case URX_SETREF: 1.2965 + case URX_BACKSLASH_D: 1.2966 + case URX_ONECHAR_I: 1.2967 + case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded. 1.2968 + case URX_DOTANY_ALL: // . matches one or two. 1.2969 + case URX_DOTANY: 1.2970 + case URX_DOTANY_UNIX: 1.2971 + currentLen++; 1.2972 + break; 1.2973 + 1.2974 + 1.2975 + case URX_JMPX: 1.2976 + loc++; // URX_JMPX has an extra operand, ignored here, 1.2977 + // otherwise processed identically to URX_JMP. 1.2978 + case URX_JMP: 1.2979 + { 1.2980 + int32_t jmpDest = URX_VAL(op); 1.2981 + if (jmpDest < loc) { 1.2982 + // Loop of some kind. Can safely ignore, the worst that will happen 1.2983 + // is that we understate the true minimum length 1.2984 + currentLen = forwardedLength.elementAti(loc+1); 1.2985 + } else { 1.2986 + // Forward jump. Propagate the current min length to the target loc of the jump. 1.2987 + U_ASSERT(jmpDest <= end+1); 1.2988 + if (forwardedLength.elementAti(jmpDest) > currentLen) { 1.2989 + forwardedLength.setElementAt(currentLen, jmpDest); 1.2990 + } 1.2991 + } 1.2992 + } 1.2993 + break; 1.2994 + 1.2995 + case URX_BACKTRACK: 1.2996 + { 1.2997 + // Back-tracks are kind of like a branch, except that the min length was 1.2998 + // propagated already, by the state save. 1.2999 + currentLen = forwardedLength.elementAti(loc+1); 1.3000 + } 1.3001 + break; 1.3002 + 1.3003 + 1.3004 + case URX_STATE_SAVE: 1.3005 + { 1.3006 + // State Save, for forward jumps, propagate the current minimum. 1.3007 + // of the state save. 1.3008 + int32_t jmpDest = URX_VAL(op); 1.3009 + if (jmpDest > loc) { 1.3010 + if (currentLen < forwardedLength.elementAti(jmpDest)) { 1.3011 + forwardedLength.setElementAt(currentLen, jmpDest); 1.3012 + } 1.3013 + } 1.3014 + } 1.3015 + break; 1.3016 + 1.3017 + 1.3018 + case URX_STRING: 1.3019 + { 1.3020 + loc++; 1.3021 + int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 1.3022 + currentLen += URX_VAL(stringLenOp); 1.3023 + } 1.3024 + break; 1.3025 + 1.3026 + 1.3027 + case URX_STRING_I: 1.3028 + { 1.3029 + loc++; 1.3030 + // TODO: with full case folding, matching input text may be shorter than 1.3031 + // the string we have here. More smarts could put some bounds on it. 1.3032 + // Assume a min length of one for now. A min length of zero causes 1.3033 + // optimization failures for a pattern like "string"+ 1.3034 + // currentLen += URX_VAL(stringLenOp); 1.3035 + currentLen += 1; 1.3036 + } 1.3037 + break; 1.3038 + 1.3039 + case URX_CTR_INIT: 1.3040 + case URX_CTR_INIT_NG: 1.3041 + { 1.3042 + // Loop Init Ops. 1.3043 + // If the min loop count == 0 1.3044 + // move loc forwards to the end of the loop, skipping over the body. 1.3045 + // If the min count is > 0, 1.3046 + // continue normal processing of the body of the loop. 1.3047 + int32_t loopEndLoc = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1); 1.3048 + loopEndLoc = URX_VAL(loopEndLoc); 1.3049 + int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2); 1.3050 + if (minLoopCount == 0) { 1.3051 + loc = loopEndLoc; 1.3052 + } else { 1.3053 + loc+=3; // Skips over operands of CTR_INIT 1.3054 + } 1.3055 + } 1.3056 + break; 1.3057 + 1.3058 + 1.3059 + case URX_CTR_LOOP: 1.3060 + case URX_CTR_LOOP_NG: 1.3061 + // Loop ops. 1.3062 + // The jump is conditional, backwards only. 1.3063 + break; 1.3064 + 1.3065 + case URX_LOOP_SR_I: 1.3066 + case URX_LOOP_DOT_I: 1.3067 + case URX_LOOP_C: 1.3068 + // More loop ops. These state-save to themselves. 1.3069 + // don't change the minimum match - could match nothing at all. 1.3070 + break; 1.3071 + 1.3072 + 1.3073 + case URX_LA_START: 1.3074 + case URX_LB_START: 1.3075 + { 1.3076 + // Look-around. Scan forward until the matching look-ahead end, 1.3077 + // without processing the look-around block. This is overly pessimistic for look-ahead, 1.3078 + // it assumes that the look-ahead match might be zero-length. 1.3079 + // TODO: Positive lookahead could recursively do the block, then continue 1.3080 + // with the longer of the block or the value coming in. Ticket 6060 1.3081 + int32_t depth = (opType == URX_LA_START? 2: 1);; 1.3082 + for (;;) { 1.3083 + loc++; 1.3084 + op = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 1.3085 + if (URX_TYPE(op) == URX_LA_START) { 1.3086 + // The boilerplate for look-ahead includes two LA_END insturctions, 1.3087 + // Depth will be decremented by each one when it is seen. 1.3088 + depth += 2; 1.3089 + } 1.3090 + if (URX_TYPE(op) == URX_LB_START) { 1.3091 + depth++; 1.3092 + } 1.3093 + if (URX_TYPE(op) == URX_LA_END) { 1.3094 + depth--; 1.3095 + if (depth == 0) { 1.3096 + break; 1.3097 + } 1.3098 + } 1.3099 + if (URX_TYPE(op)==URX_LBN_END) { 1.3100 + depth--; 1.3101 + if (depth == 0) { 1.3102 + break; 1.3103 + } 1.3104 + } 1.3105 + if (URX_TYPE(op) == URX_STATE_SAVE) { 1.3106 + // Need this because neg lookahead blocks will FAIL to outside 1.3107 + // of the block. 1.3108 + int32_t jmpDest = URX_VAL(op); 1.3109 + if (jmpDest > loc) { 1.3110 + if (currentLen < forwardedLength.elementAti(jmpDest)) { 1.3111 + forwardedLength.setElementAt(currentLen, jmpDest); 1.3112 + } 1.3113 + } 1.3114 + } 1.3115 + U_ASSERT(loc <= end); 1.3116 + } 1.3117 + } 1.3118 + break; 1.3119 + 1.3120 + case URX_LA_END: 1.3121 + case URX_LB_CONT: 1.3122 + case URX_LB_END: 1.3123 + case URX_LBN_CONT: 1.3124 + case URX_LBN_END: 1.3125 + // Only come here if the matching URX_LA_START or URX_LB_START was not in the 1.3126 + // range being sized, which happens when measuring size of look-behind blocks. 1.3127 + break; 1.3128 + 1.3129 + default: 1.3130 + U_ASSERT(FALSE); 1.3131 + } 1.3132 + 1.3133 + } 1.3134 + 1.3135 + // We have finished walking through the ops. Check whether some forward jump 1.3136 + // propagated a shorter length to location end+1. 1.3137 + if (forwardedLength.elementAti(end+1) < currentLen) { 1.3138 + currentLen = forwardedLength.elementAti(end+1); 1.3139 + U_ASSERT(currentLen>=0 && currentLen < INT32_MAX); 1.3140 + } 1.3141 + 1.3142 + return currentLen; 1.3143 +} 1.3144 + 1.3145 +// Increment with overflow check. 1.3146 +// val and delta will both be positive. 1.3147 + 1.3148 +static int32_t safeIncrement(int32_t val, int32_t delta) { 1.3149 + if (INT32_MAX - val > delta) { 1.3150 + return val + delta; 1.3151 + } else { 1.3152 + return INT32_MAX; 1.3153 + } 1.3154 +} 1.3155 + 1.3156 + 1.3157 +//------------------------------------------------------------------------------ 1.3158 +// 1.3159 +// maxMatchLength Calculate the length of the longest string that could 1.3160 +// match the specified pattern. 1.3161 +// Length is in 16 bit code units, not code points. 1.3162 +// 1.3163 +// The calculated length may not be exact. The returned 1.3164 +// value may be longer than the actual maximum; it must 1.3165 +// never be shorter. 1.3166 +// 1.3167 +//------------------------------------------------------------------------------ 1.3168 +int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) { 1.3169 + if (U_FAILURE(*fStatus)) { 1.3170 + return 0; 1.3171 + } 1.3172 + U_ASSERT(start <= end); 1.3173 + U_ASSERT(end < fRXPat->fCompiledPat->size()); 1.3174 + 1.3175 + 1.3176 + int32_t loc; 1.3177 + int32_t op; 1.3178 + int32_t opType; 1.3179 + int32_t currentLen = 0; 1.3180 + UVector32 forwardedLength(end+1, *fStatus); 1.3181 + forwardedLength.setSize(end+1); 1.3182 + 1.3183 + for (loc=start; loc<=end; loc++) { 1.3184 + forwardedLength.setElementAt(0, loc); 1.3185 + } 1.3186 + 1.3187 + for (loc = start; loc<=end; loc++) { 1.3188 + op = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 1.3189 + opType = URX_TYPE(op); 1.3190 + 1.3191 + // The loop is advancing linearly through the pattern. 1.3192 + // If the op we are now at was the destination of a branch in the pattern, 1.3193 + // and that path has a longer maximum length than the current accumulated value, 1.3194 + // replace the current accumulated value. 1.3195 + if (forwardedLength.elementAti(loc) > currentLen) { 1.3196 + currentLen = forwardedLength.elementAti(loc); 1.3197 + } 1.3198 + 1.3199 + switch (opType) { 1.3200 + // Ops that don't change the total length matched 1.3201 + case URX_RESERVED_OP: 1.3202 + case URX_END: 1.3203 + case URX_STRING_LEN: 1.3204 + case URX_NOP: 1.3205 + case URX_START_CAPTURE: 1.3206 + case URX_END_CAPTURE: 1.3207 + case URX_BACKSLASH_B: 1.3208 + case URX_BACKSLASH_BU: 1.3209 + case URX_BACKSLASH_G: 1.3210 + case URX_BACKSLASH_Z: 1.3211 + case URX_CARET: 1.3212 + case URX_DOLLAR: 1.3213 + case URX_DOLLAR_M: 1.3214 + case URX_DOLLAR_D: 1.3215 + case URX_DOLLAR_MD: 1.3216 + case URX_RELOC_OPRND: 1.3217 + case URX_STO_INP_LOC: 1.3218 + case URX_CARET_M: 1.3219 + case URX_CARET_M_UNIX: 1.3220 + 1.3221 + case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match. 1.3222 + case URX_LD_SP: 1.3223 + 1.3224 + case URX_LB_END: 1.3225 + case URX_LB_CONT: 1.3226 + case URX_LBN_CONT: 1.3227 + case URX_LBN_END: 1.3228 + break; 1.3229 + 1.3230 + 1.3231 + // Ops that increase that cause an unbounded increase in the length 1.3232 + // of a matched string, or that increase it a hard to characterize way. 1.3233 + // Call the max length unbounded, and stop further checking. 1.3234 + case URX_BACKREF: // BackRef. Must assume that it might be a zero length match 1.3235 + case URX_BACKREF_I: 1.3236 + case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded. 1.3237 + currentLen = INT32_MAX; 1.3238 + break; 1.3239 + 1.3240 + 1.3241 + // Ops that match a max of one character (possibly two 16 bit code units.) 1.3242 + // 1.3243 + case URX_STATIC_SETREF: 1.3244 + case URX_STAT_SETREF_N: 1.3245 + case URX_SETREF: 1.3246 + case URX_BACKSLASH_D: 1.3247 + case URX_ONECHAR_I: 1.3248 + case URX_DOTANY_ALL: 1.3249 + case URX_DOTANY: 1.3250 + case URX_DOTANY_UNIX: 1.3251 + currentLen = safeIncrement(currentLen, 2); 1.3252 + break; 1.3253 + 1.3254 + // Single literal character. Increase current max length by one or two, 1.3255 + // depending on whether the char is in the supplementary range. 1.3256 + case URX_ONECHAR: 1.3257 + currentLen = safeIncrement(currentLen, 1); 1.3258 + if (URX_VAL(op) > 0x10000) { 1.3259 + currentLen = safeIncrement(currentLen, 1); 1.3260 + } 1.3261 + break; 1.3262 + 1.3263 + // Jumps. 1.3264 + // 1.3265 + case URX_JMP: 1.3266 + case URX_JMPX: 1.3267 + case URX_JMP_SAV: 1.3268 + case URX_JMP_SAV_X: 1.3269 + { 1.3270 + int32_t jmpDest = URX_VAL(op); 1.3271 + if (jmpDest < loc) { 1.3272 + // Loop of some kind. Max match length is unbounded. 1.3273 + currentLen = INT32_MAX; 1.3274 + } else { 1.3275 + // Forward jump. Propagate the current min length to the target loc of the jump. 1.3276 + if (forwardedLength.elementAti(jmpDest) < currentLen) { 1.3277 + forwardedLength.setElementAt(currentLen, jmpDest); 1.3278 + } 1.3279 + currentLen = 0; 1.3280 + } 1.3281 + } 1.3282 + break; 1.3283 + 1.3284 + case URX_BACKTRACK: 1.3285 + // back-tracks are kind of like a branch, except that the max length was 1.3286 + // propagated already, by the state save. 1.3287 + currentLen = forwardedLength.elementAti(loc+1); 1.3288 + break; 1.3289 + 1.3290 + 1.3291 + case URX_STATE_SAVE: 1.3292 + { 1.3293 + // State Save, for forward jumps, propagate the current minimum. 1.3294 + // of the state save. 1.3295 + // For backwards jumps, they create a loop, maximum 1.3296 + // match length is unbounded. 1.3297 + int32_t jmpDest = URX_VAL(op); 1.3298 + if (jmpDest > loc) { 1.3299 + if (currentLen > forwardedLength.elementAti(jmpDest)) { 1.3300 + forwardedLength.setElementAt(currentLen, jmpDest); 1.3301 + } 1.3302 + } else { 1.3303 + currentLen = INT32_MAX; 1.3304 + } 1.3305 + } 1.3306 + break; 1.3307 + 1.3308 + 1.3309 + 1.3310 + 1.3311 + case URX_STRING: 1.3312 + { 1.3313 + loc++; 1.3314 + int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 1.3315 + currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp)); 1.3316 + break; 1.3317 + } 1.3318 + 1.3319 + case URX_STRING_I: 1.3320 + // TODO: This code assumes that any user string that matches will be no longer 1.3321 + // than our compiled string, with case insensitive matching. 1.3322 + // Our compiled string has been case-folded already. 1.3323 + // 1.3324 + // Any matching user string will have no more code points than our 1.3325 + // compiled (folded) string. Folding may add code points, but 1.3326 + // not remove them. 1.3327 + // 1.3328 + // There is a potential problem if a supplemental code point 1.3329 + // case-folds to a BMP code point. In this case our compiled string 1.3330 + // could be shorter (in code units) than a matching user string. 1.3331 + // 1.3332 + // At this time (Unicode 6.1) there are no such characters, and this case 1.3333 + // is not being handled. A test, intltest regex/Bug9283, will fail if 1.3334 + // any problematic characters are added to Unicode. 1.3335 + // 1.3336 + // If this happens, we can make a set of the BMP chars that the 1.3337 + // troublesome supplementals fold to, scan our string, and bump the 1.3338 + // currentLen one extra for each that is found. 1.3339 + // 1.3340 + { 1.3341 + loc++; 1.3342 + int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 1.3343 + currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp)); 1.3344 + } 1.3345 + break; 1.3346 + 1.3347 + case URX_CTR_INIT: 1.3348 + case URX_CTR_INIT_NG: 1.3349 + // For Loops, recursively call this function on the pattern for the loop body, 1.3350 + // then multiply the result by the maximum loop count. 1.3351 + { 1.3352 + int32_t loopEndLoc = URX_VAL(fRXPat->fCompiledPat->elementAti(loc+1)); 1.3353 + if (loopEndLoc == loc+4) { 1.3354 + // Loop has an empty body. No affect on max match length. 1.3355 + // Continue processing with code after the loop end. 1.3356 + loc = loopEndLoc; 1.3357 + break; 1.3358 + } 1.3359 + 1.3360 + int32_t maxLoopCount = fRXPat->fCompiledPat->elementAti(loc+3); 1.3361 + if (maxLoopCount == -1) { 1.3362 + // Unbounded Loop. No upper bound on match length. 1.3363 + currentLen = INT32_MAX; 1.3364 + break; 1.3365 + } 1.3366 + 1.3367 + U_ASSERT(loopEndLoc >= loc+4); 1.3368 + int32_t blockLen = maxMatchLength(loc+4, loopEndLoc-1); // Recursive call. 1.3369 + if (blockLen == INT32_MAX) { 1.3370 + currentLen = blockLen; 1.3371 + break; 1.3372 + } 1.3373 + currentLen += blockLen * maxLoopCount; 1.3374 + loc = loopEndLoc; 1.3375 + break; 1.3376 + } 1.3377 + 1.3378 + case URX_CTR_LOOP: 1.3379 + case URX_CTR_LOOP_NG: 1.3380 + // These opcodes will be skipped over by code for URX_CRT_INIT. 1.3381 + // We shouldn't encounter them here. 1.3382 + U_ASSERT(FALSE); 1.3383 + break; 1.3384 + 1.3385 + case URX_LOOP_SR_I: 1.3386 + case URX_LOOP_DOT_I: 1.3387 + case URX_LOOP_C: 1.3388 + // For anything to do with loops, make the match length unbounded. 1.3389 + currentLen = INT32_MAX; 1.3390 + break; 1.3391 + 1.3392 + 1.3393 + 1.3394 + case URX_LA_START: 1.3395 + case URX_LA_END: 1.3396 + // Look-ahead. Just ignore, treat the look-ahead block as if 1.3397 + // it were normal pattern. Gives a too-long match length, 1.3398 + // but good enough for now. 1.3399 + break; 1.3400 + 1.3401 + // End of look-ahead ops should always be consumed by the processing at 1.3402 + // the URX_LA_START op. 1.3403 + // U_ASSERT(FALSE); 1.3404 + // break; 1.3405 + 1.3406 + case URX_LB_START: 1.3407 + { 1.3408 + // Look-behind. Scan forward until the matching look-around end, 1.3409 + // without processing the look-behind block. 1.3410 + int32_t depth = 0; 1.3411 + for (;;) { 1.3412 + loc++; 1.3413 + op = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 1.3414 + if (URX_TYPE(op) == URX_LA_START || URX_TYPE(op) == URX_LB_START) { 1.3415 + depth++; 1.3416 + } 1.3417 + if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) { 1.3418 + if (depth == 0) { 1.3419 + break; 1.3420 + } 1.3421 + depth--; 1.3422 + } 1.3423 + U_ASSERT(loc < end); 1.3424 + } 1.3425 + } 1.3426 + break; 1.3427 + 1.3428 + default: 1.3429 + U_ASSERT(FALSE); 1.3430 + } 1.3431 + 1.3432 + 1.3433 + if (currentLen == INT32_MAX) { 1.3434 + // The maximum length is unbounded. 1.3435 + // Stop further processing of the pattern. 1.3436 + break; 1.3437 + } 1.3438 + 1.3439 + } 1.3440 + return currentLen; 1.3441 + 1.3442 +} 1.3443 + 1.3444 + 1.3445 +//------------------------------------------------------------------------------ 1.3446 +// 1.3447 +// stripNOPs Remove any NOP operations from the compiled pattern code. 1.3448 +// Extra NOPs are inserted for some constructs during the initial 1.3449 +// code generation to provide locations that may be patched later. 1.3450 +// Many end up unneeded, and are removed by this function. 1.3451 +// 1.3452 +// In order to minimize the number of passes through the pattern, 1.3453 +// back-reference fixup is also performed here (adjusting 1.3454 +// back-reference operands to point to the correct frame offsets). 1.3455 +// 1.3456 +//------------------------------------------------------------------------------ 1.3457 +void RegexCompile::stripNOPs() { 1.3458 + 1.3459 + if (U_FAILURE(*fStatus)) { 1.3460 + return; 1.3461 + } 1.3462 + 1.3463 + int32_t end = fRXPat->fCompiledPat->size(); 1.3464 + UVector32 deltas(end, *fStatus); 1.3465 + 1.3466 + // Make a first pass over the code, computing the amount that things 1.3467 + // will be offset at each location in the original code. 1.3468 + int32_t loc; 1.3469 + int32_t d = 0; 1.3470 + for (loc=0; loc<end; loc++) { 1.3471 + deltas.addElement(d, *fStatus); 1.3472 + int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 1.3473 + if (URX_TYPE(op) == URX_NOP) { 1.3474 + d++; 1.3475 + } 1.3476 + } 1.3477 + 1.3478 + UnicodeString caseStringBuffer; 1.3479 + 1.3480 + // Make a second pass over the code, removing the NOPs by moving following 1.3481 + // code up, and patching operands that refer to code locations that 1.3482 + // are being moved. The array of offsets from the first step is used 1.3483 + // to compute the new operand values. 1.3484 + int32_t src; 1.3485 + int32_t dst = 0; 1.3486 + for (src=0; src<end; src++) { 1.3487 + int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(src); 1.3488 + int32_t opType = URX_TYPE(op); 1.3489 + switch (opType) { 1.3490 + case URX_NOP: 1.3491 + break; 1.3492 + 1.3493 + case URX_STATE_SAVE: 1.3494 + case URX_JMP: 1.3495 + case URX_CTR_LOOP: 1.3496 + case URX_CTR_LOOP_NG: 1.3497 + case URX_RELOC_OPRND: 1.3498 + case URX_JMPX: 1.3499 + case URX_JMP_SAV: 1.3500 + case URX_JMP_SAV_X: 1.3501 + // These are instructions with operands that refer to code locations. 1.3502 + { 1.3503 + int32_t operandAddress = URX_VAL(op); 1.3504 + U_ASSERT(operandAddress>=0 && operandAddress<deltas.size()); 1.3505 + int32_t fixedOperandAddress = operandAddress - deltas.elementAti(operandAddress); 1.3506 + op = URX_BUILD(opType, fixedOperandAddress); 1.3507 + fRXPat->fCompiledPat->setElementAt(op, dst); 1.3508 + dst++; 1.3509 + break; 1.3510 + } 1.3511 + 1.3512 + case URX_BACKREF: 1.3513 + case URX_BACKREF_I: 1.3514 + { 1.3515 + int32_t where = URX_VAL(op); 1.3516 + if (where > fRXPat->fGroupMap->size()) { 1.3517 + error(U_REGEX_INVALID_BACK_REF); 1.3518 + break; 1.3519 + } 1.3520 + where = fRXPat->fGroupMap->elementAti(where-1); 1.3521 + op = URX_BUILD(opType, where); 1.3522 + fRXPat->fCompiledPat->setElementAt(op, dst); 1.3523 + dst++; 1.3524 + 1.3525 + fRXPat->fNeedsAltInput = TRUE; 1.3526 + break; 1.3527 + } 1.3528 + case URX_RESERVED_OP: 1.3529 + case URX_RESERVED_OP_N: 1.3530 + case URX_BACKTRACK: 1.3531 + case URX_END: 1.3532 + case URX_ONECHAR: 1.3533 + case URX_STRING: 1.3534 + case URX_STRING_LEN: 1.3535 + case URX_START_CAPTURE: 1.3536 + case URX_END_CAPTURE: 1.3537 + case URX_STATIC_SETREF: 1.3538 + case URX_STAT_SETREF_N: 1.3539 + case URX_SETREF: 1.3540 + case URX_DOTANY: 1.3541 + case URX_FAIL: 1.3542 + case URX_BACKSLASH_B: 1.3543 + case URX_BACKSLASH_BU: 1.3544 + case URX_BACKSLASH_G: 1.3545 + case URX_BACKSLASH_X: 1.3546 + case URX_BACKSLASH_Z: 1.3547 + case URX_DOTANY_ALL: 1.3548 + case URX_BACKSLASH_D: 1.3549 + case URX_CARET: 1.3550 + case URX_DOLLAR: 1.3551 + case URX_CTR_INIT: 1.3552 + case URX_CTR_INIT_NG: 1.3553 + case URX_DOTANY_UNIX: 1.3554 + case URX_STO_SP: 1.3555 + case URX_LD_SP: 1.3556 + case URX_STO_INP_LOC: 1.3557 + case URX_LA_START: 1.3558 + case URX_LA_END: 1.3559 + case URX_ONECHAR_I: 1.3560 + case URX_STRING_I: 1.3561 + case URX_DOLLAR_M: 1.3562 + case URX_CARET_M: 1.3563 + case URX_CARET_M_UNIX: 1.3564 + case URX_LB_START: 1.3565 + case URX_LB_CONT: 1.3566 + case URX_LB_END: 1.3567 + case URX_LBN_CONT: 1.3568 + case URX_LBN_END: 1.3569 + case URX_LOOP_SR_I: 1.3570 + case URX_LOOP_DOT_I: 1.3571 + case URX_LOOP_C: 1.3572 + case URX_DOLLAR_D: 1.3573 + case URX_DOLLAR_MD: 1.3574 + // These instructions are unaltered by the relocation. 1.3575 + fRXPat->fCompiledPat->setElementAt(op, dst); 1.3576 + dst++; 1.3577 + break; 1.3578 + 1.3579 + default: 1.3580 + // Some op is unaccounted for. 1.3581 + U_ASSERT(FALSE); 1.3582 + error(U_REGEX_INTERNAL_ERROR); 1.3583 + } 1.3584 + } 1.3585 + 1.3586 + fRXPat->fCompiledPat->setSize(dst); 1.3587 +} 1.3588 + 1.3589 + 1.3590 + 1.3591 + 1.3592 +//------------------------------------------------------------------------------ 1.3593 +// 1.3594 +// Error Report a rule parse error. 1.3595 +// Only report it if no previous error has been recorded. 1.3596 +// 1.3597 +//------------------------------------------------------------------------------ 1.3598 +void RegexCompile::error(UErrorCode e) { 1.3599 + if (U_SUCCESS(*fStatus)) { 1.3600 + *fStatus = e; 1.3601 + // Hmm. fParseErr (UParseError) line & offset fields are int32_t in public 1.3602 + // API (see common/unicode/parseerr.h), while fLineNum and fCharNum are 1.3603 + // int64_t. If the values of the latter are out of range for the former, 1.3604 + // set them to the appropriate "field not supported" values. 1.3605 + if (fLineNum > 0x7FFFFFFF) { 1.3606 + fParseErr->line = 0; 1.3607 + fParseErr->offset = -1; 1.3608 + } else if (fCharNum > 0x7FFFFFFF) { 1.3609 + fParseErr->line = (int32_t)fLineNum; 1.3610 + fParseErr->offset = -1; 1.3611 + } else { 1.3612 + fParseErr->line = (int32_t)fLineNum; 1.3613 + fParseErr->offset = (int32_t)fCharNum; 1.3614 + } 1.3615 + 1.3616 + UErrorCode status = U_ZERO_ERROR; // throwaway status for extracting context 1.3617 + 1.3618 + // Fill in the context. 1.3619 + // Note: extractBetween() pins supplied indicies to the string bounds. 1.3620 + uprv_memset(fParseErr->preContext, 0, sizeof(fParseErr->preContext)); 1.3621 + uprv_memset(fParseErr->postContext, 0, sizeof(fParseErr->postContext)); 1.3622 + utext_extract(fRXPat->fPattern, fScanIndex-U_PARSE_CONTEXT_LEN+1, fScanIndex, fParseErr->preContext, U_PARSE_CONTEXT_LEN, &status); 1.3623 + utext_extract(fRXPat->fPattern, fScanIndex, fScanIndex+U_PARSE_CONTEXT_LEN-1, fParseErr->postContext, U_PARSE_CONTEXT_LEN, &status); 1.3624 + } 1.3625 +} 1.3626 + 1.3627 + 1.3628 +// 1.3629 +// Assorted Unicode character constants. 1.3630 +// Numeric because there is no portable way to enter them as literals. 1.3631 +// (Think EBCDIC). 1.3632 +// 1.3633 +static const UChar chCR = 0x0d; // New lines, for terminating comments. 1.3634 +static const UChar chLF = 0x0a; // Line Feed 1.3635 +static const UChar chPound = 0x23; // '#', introduces a comment. 1.3636 +static const UChar chDigit0 = 0x30; // '0' 1.3637 +static const UChar chDigit7 = 0x37; // '9' 1.3638 +static const UChar chColon = 0x3A; // ':' 1.3639 +static const UChar chE = 0x45; // 'E' 1.3640 +static const UChar chQ = 0x51; // 'Q' 1.3641 +//static const UChar chN = 0x4E; // 'N' 1.3642 +static const UChar chP = 0x50; // 'P' 1.3643 +static const UChar chBackSlash = 0x5c; // '\' introduces a char escape 1.3644 +//static const UChar chLBracket = 0x5b; // '[' 1.3645 +static const UChar chRBracket = 0x5d; // ']' 1.3646 +static const UChar chUp = 0x5e; // '^' 1.3647 +static const UChar chLowerP = 0x70; 1.3648 +static const UChar chLBrace = 0x7b; // '{' 1.3649 +static const UChar chRBrace = 0x7d; // '}' 1.3650 +static const UChar chNEL = 0x85; // NEL newline variant 1.3651 +static const UChar chLS = 0x2028; // Unicode Line Separator 1.3652 + 1.3653 + 1.3654 +//------------------------------------------------------------------------------ 1.3655 +// 1.3656 +// nextCharLL Low Level Next Char from the regex pattern. 1.3657 +// Get a char from the string, keep track of input position 1.3658 +// for error reporting. 1.3659 +// 1.3660 +//------------------------------------------------------------------------------ 1.3661 +UChar32 RegexCompile::nextCharLL() { 1.3662 + UChar32 ch; 1.3663 + 1.3664 + if (fPeekChar != -1) { 1.3665 + ch = fPeekChar; 1.3666 + fPeekChar = -1; 1.3667 + return ch; 1.3668 + } 1.3669 + 1.3670 + // assume we're already in the right place 1.3671 + ch = UTEXT_NEXT32(fRXPat->fPattern); 1.3672 + if (ch == U_SENTINEL) { 1.3673 + return ch; 1.3674 + } 1.3675 + 1.3676 + if (ch == chCR || 1.3677 + ch == chNEL || 1.3678 + ch == chLS || 1.3679 + (ch == chLF && fLastChar != chCR)) { 1.3680 + // Character is starting a new line. Bump up the line number, and 1.3681 + // reset the column to 0. 1.3682 + fLineNum++; 1.3683 + fCharNum=0; 1.3684 + } 1.3685 + else { 1.3686 + // Character is not starting a new line. Except in the case of a 1.3687 + // LF following a CR, increment the column position. 1.3688 + if (ch != chLF) { 1.3689 + fCharNum++; 1.3690 + } 1.3691 + } 1.3692 + fLastChar = ch; 1.3693 + return ch; 1.3694 +} 1.3695 + 1.3696 +//------------------------------------------------------------------------------ 1.3697 +// 1.3698 +// peekCharLL Low Level Character Scanning, sneak a peek at the next 1.3699 +// character without actually getting it. 1.3700 +// 1.3701 +//------------------------------------------------------------------------------ 1.3702 +UChar32 RegexCompile::peekCharLL() { 1.3703 + if (fPeekChar == -1) { 1.3704 + fPeekChar = nextCharLL(); 1.3705 + } 1.3706 + return fPeekChar; 1.3707 +} 1.3708 + 1.3709 + 1.3710 +//------------------------------------------------------------------------------ 1.3711 +// 1.3712 +// nextChar for pattern scanning. At this level, we handle stripping 1.3713 +// out comments and processing some backslash character escapes. 1.3714 +// The rest of the pattern grammar is handled at the next level up. 1.3715 +// 1.3716 +//------------------------------------------------------------------------------ 1.3717 +void RegexCompile::nextChar(RegexPatternChar &c) { 1.3718 + 1.3719 + fScanIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern); 1.3720 + c.fChar = nextCharLL(); 1.3721 + c.fQuoted = FALSE; 1.3722 + 1.3723 + if (fQuoteMode) { 1.3724 + c.fQuoted = TRUE; 1.3725 + if ((c.fChar==chBackSlash && peekCharLL()==chE && ((fModeFlags & UREGEX_LITERAL) == 0)) || 1.3726 + c.fChar == (UChar32)-1) { 1.3727 + fQuoteMode = FALSE; // Exit quote mode, 1.3728 + nextCharLL(); // discard the E 1.3729 + nextChar(c); // recurse to get the real next char 1.3730 + } 1.3731 + } 1.3732 + else if (fInBackslashQuote) { 1.3733 + // The current character immediately follows a '\' 1.3734 + // Don't check for any further escapes, just return it as-is. 1.3735 + // Don't set c.fQuoted, because that would prevent the state machine from 1.3736 + // dispatching on the character. 1.3737 + fInBackslashQuote = FALSE; 1.3738 + } 1.3739 + else 1.3740 + { 1.3741 + // We are not in a \Q quoted region \E of the source. 1.3742 + // 1.3743 + if (fModeFlags & UREGEX_COMMENTS) { 1.3744 + // 1.3745 + // We are in free-spacing and comments mode. 1.3746 + // Scan through any white space and comments, until we 1.3747 + // reach a significant character or the end of inut. 1.3748 + for (;;) { 1.3749 + if (c.fChar == (UChar32)-1) { 1.3750 + break; // End of Input 1.3751 + } 1.3752 + if (c.fChar == chPound && fEOLComments == TRUE) { 1.3753 + // Start of a comment. Consume the rest of it, until EOF or a new line 1.3754 + for (;;) { 1.3755 + c.fChar = nextCharLL(); 1.3756 + if (c.fChar == (UChar32)-1 || // EOF 1.3757 + c.fChar == chCR || 1.3758 + c.fChar == chLF || 1.3759 + c.fChar == chNEL || 1.3760 + c.fChar == chLS) { 1.3761 + break; 1.3762 + } 1.3763 + } 1.3764 + } 1.3765 + // TODO: check what Java & Perl do with non-ASCII white spaces. Ticket 6061. 1.3766 + if (PatternProps::isWhiteSpace(c.fChar) == FALSE) { 1.3767 + break; 1.3768 + } 1.3769 + c.fChar = nextCharLL(); 1.3770 + } 1.3771 + } 1.3772 + 1.3773 + // 1.3774 + // check for backslash escaped characters. 1.3775 + // 1.3776 + if (c.fChar == chBackSlash) { 1.3777 + int64_t pos = UTEXT_GETNATIVEINDEX(fRXPat->fPattern); 1.3778 + if (RegexStaticSets::gStaticSets->fUnescapeCharSet.contains(peekCharLL())) { 1.3779 + // 1.3780 + // A '\' sequence that is handled by ICU's standard unescapeAt function. 1.3781 + // Includes \uxxxx, \n, \r, many others. 1.3782 + // Return the single equivalent character. 1.3783 + // 1.3784 + nextCharLL(); // get & discard the peeked char. 1.3785 + c.fQuoted = TRUE; 1.3786 + 1.3787 + if (UTEXT_FULL_TEXT_IN_CHUNK(fRXPat->fPattern, fPatternLength)) { 1.3788 + int32_t endIndex = (int32_t)pos; 1.3789 + c.fChar = u_unescapeAt(uregex_ucstr_unescape_charAt, &endIndex, (int32_t)fPatternLength, (void *)fRXPat->fPattern->chunkContents); 1.3790 + 1.3791 + if (endIndex == pos) { 1.3792 + error(U_REGEX_BAD_ESCAPE_SEQUENCE); 1.3793 + } 1.3794 + fCharNum += endIndex - pos; 1.3795 + UTEXT_SETNATIVEINDEX(fRXPat->fPattern, endIndex); 1.3796 + } else { 1.3797 + int32_t offset = 0; 1.3798 + struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(fRXPat->fPattern); 1.3799 + 1.3800 + UTEXT_SETNATIVEINDEX(fRXPat->fPattern, pos); 1.3801 + c.fChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context); 1.3802 + 1.3803 + if (offset == 0) { 1.3804 + error(U_REGEX_BAD_ESCAPE_SEQUENCE); 1.3805 + } else if (context.lastOffset == offset) { 1.3806 + UTEXT_PREVIOUS32(fRXPat->fPattern); 1.3807 + } else if (context.lastOffset != offset-1) { 1.3808 + utext_moveIndex32(fRXPat->fPattern, offset - context.lastOffset - 1); 1.3809 + } 1.3810 + fCharNum += offset; 1.3811 + } 1.3812 + } 1.3813 + else if (peekCharLL() == chDigit0) { 1.3814 + // Octal Escape, using Java Regexp Conventions 1.3815 + // which are \0 followed by 1-3 octal digits. 1.3816 + // Different from ICU Unescape handling of Octal, which does not 1.3817 + // require the leading 0. 1.3818 + // Java also has the convention of only consuming 2 octal digits if 1.3819 + // the three digit number would be > 0xff 1.3820 + // 1.3821 + c.fChar = 0; 1.3822 + nextCharLL(); // Consume the initial 0. 1.3823 + int index; 1.3824 + for (index=0; index<3; index++) { 1.3825 + int32_t ch = peekCharLL(); 1.3826 + if (ch<chDigit0 || ch>chDigit7) { 1.3827 + if (index==0) { 1.3828 + // \0 is not followed by any octal digits. 1.3829 + error(U_REGEX_BAD_ESCAPE_SEQUENCE); 1.3830 + } 1.3831 + break; 1.3832 + } 1.3833 + c.fChar <<= 3; 1.3834 + c.fChar += ch&7; 1.3835 + if (c.fChar <= 255) { 1.3836 + nextCharLL(); 1.3837 + } else { 1.3838 + // The last digit made the number too big. Forget we saw it. 1.3839 + c.fChar >>= 3; 1.3840 + } 1.3841 + } 1.3842 + c.fQuoted = TRUE; 1.3843 + } 1.3844 + else if (peekCharLL() == chQ) { 1.3845 + // "\Q" enter quote mode, which will continue until "\E" 1.3846 + fQuoteMode = TRUE; 1.3847 + nextCharLL(); // discard the 'Q'. 1.3848 + nextChar(c); // recurse to get the real next char. 1.3849 + } 1.3850 + else 1.3851 + { 1.3852 + // We are in a '\' escape that will be handled by the state table scanner. 1.3853 + // Just return the backslash, but remember that the following char is to 1.3854 + // be taken literally. 1.3855 + fInBackslashQuote = TRUE; 1.3856 + } 1.3857 + } 1.3858 + } 1.3859 + 1.3860 + // re-enable # to end-of-line comments, in case they were disabled. 1.3861 + // They are disabled by the parser upon seeing '(?', but this lasts for 1.3862 + // the fetching of the next character only. 1.3863 + fEOLComments = TRUE; 1.3864 + 1.3865 + // putc(c.fChar, stdout); 1.3866 +} 1.3867 + 1.3868 + 1.3869 + 1.3870 +//------------------------------------------------------------------------------ 1.3871 +// 1.3872 +// scanNamedChar 1.3873 + // Get a UChar32 from a \N{UNICODE CHARACTER NAME} in the pattern. 1.3874 +// 1.3875 +// The scan position will be at the 'N'. On return 1.3876 +// the scan position should be just after the '}' 1.3877 +// 1.3878 +// Return the UChar32 1.3879 +// 1.3880 +//------------------------------------------------------------------------------ 1.3881 +UChar32 RegexCompile::scanNamedChar() { 1.3882 + if (U_FAILURE(*fStatus)) { 1.3883 + return 0; 1.3884 + } 1.3885 + 1.3886 + nextChar(fC); 1.3887 + if (fC.fChar != chLBrace) { 1.3888 + error(U_REGEX_PROPERTY_SYNTAX); 1.3889 + return 0; 1.3890 + } 1.3891 + 1.3892 + UnicodeString charName; 1.3893 + for (;;) { 1.3894 + nextChar(fC); 1.3895 + if (fC.fChar == chRBrace) { 1.3896 + break; 1.3897 + } 1.3898 + if (fC.fChar == -1) { 1.3899 + error(U_REGEX_PROPERTY_SYNTAX); 1.3900 + return 0; 1.3901 + } 1.3902 + charName.append(fC.fChar); 1.3903 + } 1.3904 + 1.3905 + char name[100]; 1.3906 + if (!uprv_isInvariantUString(charName.getBuffer(), charName.length()) || 1.3907 + (uint32_t)charName.length()>=sizeof(name)) { 1.3908 + // All Unicode character names have only invariant characters. 1.3909 + // The API to get a character, given a name, accepts only char *, forcing us to convert, 1.3910 + // which requires this error check 1.3911 + error(U_REGEX_PROPERTY_SYNTAX); 1.3912 + return 0; 1.3913 + } 1.3914 + charName.extract(0, charName.length(), name, sizeof(name), US_INV); 1.3915 + 1.3916 + UChar32 theChar = u_charFromName(U_UNICODE_CHAR_NAME, name, fStatus); 1.3917 + if (U_FAILURE(*fStatus)) { 1.3918 + error(U_REGEX_PROPERTY_SYNTAX); 1.3919 + } 1.3920 + 1.3921 + nextChar(fC); // Continue overall regex pattern processing with char after the '}' 1.3922 + return theChar; 1.3923 +} 1.3924 + 1.3925 +//------------------------------------------------------------------------------ 1.3926 +// 1.3927 +// scanProp Construct a UnicodeSet from the text at the current scan 1.3928 +// position, which will be of the form \p{whaterver} 1.3929 +// 1.3930 +// The scan position will be at the 'p' or 'P'. On return 1.3931 +// the scan position should be just after the '}' 1.3932 +// 1.3933 +// Return a UnicodeSet, constructed from the \P pattern, 1.3934 +// or NULL if the pattern is invalid. 1.3935 +// 1.3936 +//------------------------------------------------------------------------------ 1.3937 +UnicodeSet *RegexCompile::scanProp() { 1.3938 + UnicodeSet *uset = NULL; 1.3939 + 1.3940 + if (U_FAILURE(*fStatus)) { 1.3941 + return NULL; 1.3942 + } 1.3943 + U_ASSERT(fC.fChar == chLowerP || fC.fChar == chP); 1.3944 + UBool negated = (fC.fChar == chP); 1.3945 + 1.3946 + UnicodeString propertyName; 1.3947 + nextChar(fC); 1.3948 + if (fC.fChar != chLBrace) { 1.3949 + error(U_REGEX_PROPERTY_SYNTAX); 1.3950 + return NULL; 1.3951 + } 1.3952 + for (;;) { 1.3953 + nextChar(fC); 1.3954 + if (fC.fChar == chRBrace) { 1.3955 + break; 1.3956 + } 1.3957 + if (fC.fChar == -1) { 1.3958 + // Hit the end of the input string without finding the closing '}' 1.3959 + error(U_REGEX_PROPERTY_SYNTAX); 1.3960 + return NULL; 1.3961 + } 1.3962 + propertyName.append(fC.fChar); 1.3963 + } 1.3964 + uset = createSetForProperty(propertyName, negated); 1.3965 + nextChar(fC); // Move input scan to position following the closing '}' 1.3966 + return uset; 1.3967 +} 1.3968 + 1.3969 +//------------------------------------------------------------------------------ 1.3970 +// 1.3971 +// scanPosixProp Construct a UnicodeSet from the text at the current scan 1.3972 +// position, which is expected be of the form [:property expression:] 1.3973 +// 1.3974 +// The scan position will be at the opening ':'. On return 1.3975 +// the scan position must be on the closing ']' 1.3976 +// 1.3977 +// Return a UnicodeSet constructed from the pattern, 1.3978 +// or NULL if this is not a valid POSIX-style set expression. 1.3979 +// If not a property expression, restore the initial scan position 1.3980 +// (to the opening ':') 1.3981 +// 1.3982 +// Note: the opening '[:' is not sufficient to guarantee that 1.3983 +// this is a [:property:] expression. 1.3984 +// [:'+=,] is a perfectly good ordinary set expression that 1.3985 +// happens to include ':' as one of its characters. 1.3986 +// 1.3987 +//------------------------------------------------------------------------------ 1.3988 +UnicodeSet *RegexCompile::scanPosixProp() { 1.3989 + UnicodeSet *uset = NULL; 1.3990 + 1.3991 + if (U_FAILURE(*fStatus)) { 1.3992 + return NULL; 1.3993 + } 1.3994 + 1.3995 + U_ASSERT(fC.fChar == chColon); 1.3996 + 1.3997 + // Save the scanner state. 1.3998 + // TODO: move this into the scanner, with the state encapsulated in some way. Ticket 6062 1.3999 + int64_t savedScanIndex = fScanIndex; 1.4000 + int64_t savedNextIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern); 1.4001 + UBool savedQuoteMode = fQuoteMode; 1.4002 + UBool savedInBackslashQuote = fInBackslashQuote; 1.4003 + UBool savedEOLComments = fEOLComments; 1.4004 + int64_t savedLineNum = fLineNum; 1.4005 + int64_t savedCharNum = fCharNum; 1.4006 + UChar32 savedLastChar = fLastChar; 1.4007 + UChar32 savedPeekChar = fPeekChar; 1.4008 + RegexPatternChar savedfC = fC; 1.4009 + 1.4010 + // Scan for a closing ]. A little tricky because there are some perverse 1.4011 + // edge cases possible. "[:abc\Qdef:] \E]" is a valid non-property expression, 1.4012 + // ending on the second closing ]. 1.4013 + 1.4014 + UnicodeString propName; 1.4015 + UBool negated = FALSE; 1.4016 + 1.4017 + // Check for and consume the '^' in a negated POSIX property, e.g. [:^Letter:] 1.4018 + nextChar(fC); 1.4019 + if (fC.fChar == chUp) { 1.4020 + negated = TRUE; 1.4021 + nextChar(fC); 1.4022 + } 1.4023 + 1.4024 + // Scan for the closing ":]", collecting the property name along the way. 1.4025 + UBool sawPropSetTerminator = FALSE; 1.4026 + for (;;) { 1.4027 + propName.append(fC.fChar); 1.4028 + nextChar(fC); 1.4029 + if (fC.fQuoted || fC.fChar == -1) { 1.4030 + // Escaped characters or end of input - either says this isn't a [:Property:] 1.4031 + break; 1.4032 + } 1.4033 + if (fC.fChar == chColon) { 1.4034 + nextChar(fC); 1.4035 + if (fC.fChar == chRBracket) { 1.4036 + sawPropSetTerminator = TRUE; 1.4037 + } 1.4038 + break; 1.4039 + } 1.4040 + } 1.4041 + 1.4042 + if (sawPropSetTerminator) { 1.4043 + uset = createSetForProperty(propName, negated); 1.4044 + } 1.4045 + else 1.4046 + { 1.4047 + // No closing ":]". 1.4048 + // Restore the original scan position. 1.4049 + // The main scanner will retry the input as a normal set expression, 1.4050 + // not a [:Property:] expression. 1.4051 + fScanIndex = savedScanIndex; 1.4052 + fQuoteMode = savedQuoteMode; 1.4053 + fInBackslashQuote = savedInBackslashQuote; 1.4054 + fEOLComments = savedEOLComments; 1.4055 + fLineNum = savedLineNum; 1.4056 + fCharNum = savedCharNum; 1.4057 + fLastChar = savedLastChar; 1.4058 + fPeekChar = savedPeekChar; 1.4059 + fC = savedfC; 1.4060 + UTEXT_SETNATIVEINDEX(fRXPat->fPattern, savedNextIndex); 1.4061 + } 1.4062 + return uset; 1.4063 +} 1.4064 + 1.4065 +static inline void addIdentifierIgnorable(UnicodeSet *set, UErrorCode& ec) { 1.4066 + set->add(0, 8).add(0x0e, 0x1b).add(0x7f, 0x9f); 1.4067 + addCategory(set, U_GC_CF_MASK, ec); 1.4068 +} 1.4069 + 1.4070 +// 1.4071 +// Create a Unicode Set from a Unicode Property expression. 1.4072 +// This is common code underlying both \p{...} ane [:...:] expressions. 1.4073 +// Includes trying the Java "properties" that aren't supported as 1.4074 +// normal ICU UnicodeSet properties 1.4075 +// 1.4076 +static const UChar posSetPrefix[] = {0x5b, 0x5c, 0x70, 0x7b, 0}; // "[\p{" 1.4077 +static const UChar negSetPrefix[] = {0x5b, 0x5c, 0x50, 0x7b, 0}; // "[\P{" 1.4078 +UnicodeSet *RegexCompile::createSetForProperty(const UnicodeString &propName, UBool negated) { 1.4079 + UnicodeString setExpr; 1.4080 + UnicodeSet *set; 1.4081 + uint32_t usetFlags = 0; 1.4082 + 1.4083 + if (U_FAILURE(*fStatus)) { 1.4084 + return NULL; 1.4085 + } 1.4086 + 1.4087 + // 1.4088 + // First try the property as we received it 1.4089 + // 1.4090 + if (negated) { 1.4091 + setExpr.append(negSetPrefix, -1); 1.4092 + } else { 1.4093 + setExpr.append(posSetPrefix, -1); 1.4094 + } 1.4095 + setExpr.append(propName); 1.4096 + setExpr.append(chRBrace); 1.4097 + setExpr.append(chRBracket); 1.4098 + if (fModeFlags & UREGEX_CASE_INSENSITIVE) { 1.4099 + usetFlags |= USET_CASE_INSENSITIVE; 1.4100 + } 1.4101 + set = new UnicodeSet(setExpr, usetFlags, NULL, *fStatus); 1.4102 + if (U_SUCCESS(*fStatus)) { 1.4103 + return set; 1.4104 + } 1.4105 + delete set; 1.4106 + set = NULL; 1.4107 + 1.4108 + // 1.4109 + // The property as it was didn't work. 1.4110 + 1.4111 + // Do [:word:]. It is not recognized as a property by UnicodeSet. "word" not standard POSIX 1.4112 + // or standard Java, but many other regular expression packages do recognize it. 1.4113 + 1.4114 + if (propName.caseCompare(UNICODE_STRING_SIMPLE("word"), 0) == 0) { 1.4115 + *fStatus = U_ZERO_ERROR; 1.4116 + set = new UnicodeSet(*(fRXPat->fStaticSets[URX_ISWORD_SET])); 1.4117 + if (set == NULL) { 1.4118 + *fStatus = U_MEMORY_ALLOCATION_ERROR; 1.4119 + return set; 1.4120 + } 1.4121 + if (negated) { 1.4122 + set->complement(); 1.4123 + } 1.4124 + return set; 1.4125 + } 1.4126 + 1.4127 + 1.4128 + // Do Java fixes - 1.4129 + // InGreek -> InGreek or Coptic, that being the official Unicode name for that block. 1.4130 + // InCombiningMarksforSymbols -> InCombiningDiacriticalMarksforSymbols. 1.4131 + // 1.4132 + // Note on Spaces: either "InCombiningMarksForSymbols" or "InCombining Marks for Symbols" 1.4133 + // is accepted by Java. The property part of the name is compared 1.4134 + // case-insenstively. The spaces must be exactly as shown, either 1.4135 + // all there, or all omitted, with exactly one at each position 1.4136 + // if they are present. From checking against JDK 1.6 1.4137 + // 1.4138 + // This code should be removed when ICU properties support the Java compatibility names 1.4139 + // (ICU 4.0?) 1.4140 + // 1.4141 + UnicodeString mPropName = propName; 1.4142 + if (mPropName.caseCompare(UNICODE_STRING_SIMPLE("InGreek"), 0) == 0) { 1.4143 + mPropName = UNICODE_STRING_SIMPLE("InGreek and Coptic"); 1.4144 + } 1.4145 + if (mPropName.caseCompare(UNICODE_STRING_SIMPLE("InCombining Marks for Symbols"), 0) == 0 || 1.4146 + mPropName.caseCompare(UNICODE_STRING_SIMPLE("InCombiningMarksforSymbols"), 0) == 0) { 1.4147 + mPropName = UNICODE_STRING_SIMPLE("InCombining Diacritical Marks for Symbols"); 1.4148 + } 1.4149 + else if (mPropName.compare(UNICODE_STRING_SIMPLE("all")) == 0) { 1.4150 + mPropName = UNICODE_STRING_SIMPLE("javaValidCodePoint"); 1.4151 + } 1.4152 + 1.4153 + // See if the property looks like a Java "InBlockName", which 1.4154 + // we will recast as "Block=BlockName" 1.4155 + // 1.4156 + static const UChar IN[] = {0x49, 0x6E, 0}; // "In" 1.4157 + static const UChar BLOCK[] = {0x42, 0x6C, 0x6f, 0x63, 0x6b, 0x3d, 00}; // "Block=" 1.4158 + if (mPropName.startsWith(IN, 2) && propName.length()>=3) { 1.4159 + setExpr.truncate(4); // Leaves "[\p{", or "[\P{" 1.4160 + setExpr.append(BLOCK, -1); 1.4161 + setExpr.append(UnicodeString(mPropName, 2)); // Property with the leading "In" removed. 1.4162 + setExpr.append(chRBrace); 1.4163 + setExpr.append(chRBracket); 1.4164 + *fStatus = U_ZERO_ERROR; 1.4165 + set = new UnicodeSet(setExpr, usetFlags, NULL, *fStatus); 1.4166 + if (U_SUCCESS(*fStatus)) { 1.4167 + return set; 1.4168 + } 1.4169 + delete set; 1.4170 + set = NULL; 1.4171 + } 1.4172 + 1.4173 + if (propName.startsWith(UNICODE_STRING_SIMPLE("java")) || 1.4174 + propName.compare(UNICODE_STRING_SIMPLE("all")) == 0) 1.4175 + { 1.4176 + UErrorCode localStatus = U_ZERO_ERROR; 1.4177 + //setExpr.remove(); 1.4178 + set = new UnicodeSet(); 1.4179 + // 1.4180 + // Try the various Java specific properties. 1.4181 + // These all begin with "java" 1.4182 + // 1.4183 + if (mPropName.compare(UNICODE_STRING_SIMPLE("javaDefined")) == 0) { 1.4184 + addCategory(set, U_GC_CN_MASK, localStatus); 1.4185 + set->complement(); 1.4186 + } 1.4187 + else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaDigit")) == 0) { 1.4188 + addCategory(set, U_GC_ND_MASK, localStatus); 1.4189 + } 1.4190 + else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaIdentifierIgnorable")) == 0) { 1.4191 + addIdentifierIgnorable(set, localStatus); 1.4192 + } 1.4193 + else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaISOControl")) == 0) { 1.4194 + set->add(0, 0x1F).add(0x7F, 0x9F); 1.4195 + } 1.4196 + else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaJavaIdentifierPart")) == 0) { 1.4197 + addCategory(set, U_GC_L_MASK, localStatus); 1.4198 + addCategory(set, U_GC_SC_MASK, localStatus); 1.4199 + addCategory(set, U_GC_PC_MASK, localStatus); 1.4200 + addCategory(set, U_GC_ND_MASK, localStatus); 1.4201 + addCategory(set, U_GC_NL_MASK, localStatus); 1.4202 + addCategory(set, U_GC_MC_MASK, localStatus); 1.4203 + addCategory(set, U_GC_MN_MASK, localStatus); 1.4204 + addIdentifierIgnorable(set, localStatus); 1.4205 + } 1.4206 + else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaJavaIdentifierStart")) == 0) { 1.4207 + addCategory(set, U_GC_L_MASK, localStatus); 1.4208 + addCategory(set, U_GC_NL_MASK, localStatus); 1.4209 + addCategory(set, U_GC_SC_MASK, localStatus); 1.4210 + addCategory(set, U_GC_PC_MASK, localStatus); 1.4211 + } 1.4212 + else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLetter")) == 0) { 1.4213 + addCategory(set, U_GC_L_MASK, localStatus); 1.4214 + } 1.4215 + else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLetterOrDigit")) == 0) { 1.4216 + addCategory(set, U_GC_L_MASK, localStatus); 1.4217 + addCategory(set, U_GC_ND_MASK, localStatus); 1.4218 + } 1.4219 + else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLowerCase")) == 0) { 1.4220 + addCategory(set, U_GC_LL_MASK, localStatus); 1.4221 + } 1.4222 + else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaMirrored")) == 0) { 1.4223 + set->applyIntPropertyValue(UCHAR_BIDI_MIRRORED, 1, localStatus); 1.4224 + } 1.4225 + else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaSpaceChar")) == 0) { 1.4226 + addCategory(set, U_GC_Z_MASK, localStatus); 1.4227 + } 1.4228 + else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaSupplementaryCodePoint")) == 0) { 1.4229 + set->add(0x10000, UnicodeSet::MAX_VALUE); 1.4230 + } 1.4231 + else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaTitleCase")) == 0) { 1.4232 + addCategory(set, U_GC_LT_MASK, localStatus); 1.4233 + } 1.4234 + else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUnicodeIdentifierStart")) == 0) { 1.4235 + addCategory(set, U_GC_L_MASK, localStatus); 1.4236 + addCategory(set, U_GC_NL_MASK, localStatus); 1.4237 + } 1.4238 + else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUnicodeIdentifierPart")) == 0) { 1.4239 + addCategory(set, U_GC_L_MASK, localStatus); 1.4240 + addCategory(set, U_GC_PC_MASK, localStatus); 1.4241 + addCategory(set, U_GC_ND_MASK, localStatus); 1.4242 + addCategory(set, U_GC_NL_MASK, localStatus); 1.4243 + addCategory(set, U_GC_MC_MASK, localStatus); 1.4244 + addCategory(set, U_GC_MN_MASK, localStatus); 1.4245 + addIdentifierIgnorable(set, localStatus); 1.4246 + } 1.4247 + else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUpperCase")) == 0) { 1.4248 + addCategory(set, U_GC_LU_MASK, localStatus); 1.4249 + } 1.4250 + else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaValidCodePoint")) == 0) { 1.4251 + set->add(0, UnicodeSet::MAX_VALUE); 1.4252 + } 1.4253 + else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaWhitespace")) == 0) { 1.4254 + addCategory(set, U_GC_Z_MASK, localStatus); 1.4255 + set->removeAll(UnicodeSet().add(0xa0).add(0x2007).add(0x202f)); 1.4256 + set->add(9, 0x0d).add(0x1c, 0x1f); 1.4257 + } 1.4258 + else if (mPropName.compare(UNICODE_STRING_SIMPLE("all")) == 0) { 1.4259 + set->add(0, UnicodeSet::MAX_VALUE); 1.4260 + } 1.4261 + 1.4262 + if (U_SUCCESS(localStatus) && !set->isEmpty()) { 1.4263 + *fStatus = U_ZERO_ERROR; 1.4264 + if (usetFlags & USET_CASE_INSENSITIVE) { 1.4265 + set->closeOver(USET_CASE_INSENSITIVE); 1.4266 + } 1.4267 + if (negated) { 1.4268 + set->complement(); 1.4269 + } 1.4270 + return set; 1.4271 + } 1.4272 + delete set; 1.4273 + set = NULL; 1.4274 + } 1.4275 + error(*fStatus); 1.4276 + return NULL; 1.4277 +} 1.4278 + 1.4279 + 1.4280 + 1.4281 +// 1.4282 +// SetEval Part of the evaluation of [set expressions]. 1.4283 +// Perform any pending (stacked) operations with precedence 1.4284 +// equal or greater to that of the next operator encountered 1.4285 +// in the expression. 1.4286 +// 1.4287 +void RegexCompile::setEval(int32_t nextOp) { 1.4288 + UnicodeSet *rightOperand = NULL; 1.4289 + UnicodeSet *leftOperand = NULL; 1.4290 + for (;;) { 1.4291 + U_ASSERT(fSetOpStack.empty()==FALSE); 1.4292 + int32_t pendingSetOperation = fSetOpStack.peeki(); 1.4293 + if ((pendingSetOperation&0xffff0000) < (nextOp&0xffff0000)) { 1.4294 + break; 1.4295 + } 1.4296 + fSetOpStack.popi(); 1.4297 + U_ASSERT(fSetStack.empty() == FALSE); 1.4298 + rightOperand = (UnicodeSet *)fSetStack.peek(); 1.4299 + switch (pendingSetOperation) { 1.4300 + case setNegation: 1.4301 + rightOperand->complement(); 1.4302 + break; 1.4303 + case setCaseClose: 1.4304 + // TODO: need a simple close function. Ticket 6065 1.4305 + rightOperand->closeOver(USET_CASE_INSENSITIVE); 1.4306 + rightOperand->removeAllStrings(); 1.4307 + break; 1.4308 + case setDifference1: 1.4309 + case setDifference2: 1.4310 + fSetStack.pop(); 1.4311 + leftOperand = (UnicodeSet *)fSetStack.peek(); 1.4312 + leftOperand->removeAll(*rightOperand); 1.4313 + delete rightOperand; 1.4314 + break; 1.4315 + case setIntersection1: 1.4316 + case setIntersection2: 1.4317 + fSetStack.pop(); 1.4318 + leftOperand = (UnicodeSet *)fSetStack.peek(); 1.4319 + leftOperand->retainAll(*rightOperand); 1.4320 + delete rightOperand; 1.4321 + break; 1.4322 + case setUnion: 1.4323 + fSetStack.pop(); 1.4324 + leftOperand = (UnicodeSet *)fSetStack.peek(); 1.4325 + leftOperand->addAll(*rightOperand); 1.4326 + delete rightOperand; 1.4327 + break; 1.4328 + default: 1.4329 + U_ASSERT(FALSE); 1.4330 + break; 1.4331 + } 1.4332 + } 1.4333 + } 1.4334 + 1.4335 +void RegexCompile::setPushOp(int32_t op) { 1.4336 + setEval(op); 1.4337 + fSetOpStack.push(op, *fStatus); 1.4338 + fSetStack.push(new UnicodeSet(), *fStatus); 1.4339 +} 1.4340 + 1.4341 +U_NAMESPACE_END 1.4342 +#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS 1.4343 +