intl/icu/source/i18n/regexcmp.cpp

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

michael@0 1 //
michael@0 2 // file: regexcmp.cpp
michael@0 3 //
michael@0 4 // Copyright (C) 2002-2013 International Business Machines Corporation and others.
michael@0 5 // All Rights Reserved.
michael@0 6 //
michael@0 7 // This file contains the ICU regular expression compiler, which is responsible
michael@0 8 // for processing a regular expression pattern into the compiled form that
michael@0 9 // is used by the match finding engine.
michael@0 10 //
michael@0 11
michael@0 12 #include "unicode/utypes.h"
michael@0 13
michael@0 14 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
michael@0 15
michael@0 16 #include "unicode/ustring.h"
michael@0 17 #include "unicode/unistr.h"
michael@0 18 #include "unicode/uniset.h"
michael@0 19 #include "unicode/uchar.h"
michael@0 20 #include "unicode/uchriter.h"
michael@0 21 #include "unicode/parsepos.h"
michael@0 22 #include "unicode/parseerr.h"
michael@0 23 #include "unicode/regex.h"
michael@0 24 #include "unicode/utf.h"
michael@0 25 #include "unicode/utf16.h"
michael@0 26 #include "patternprops.h"
michael@0 27 #include "putilimp.h"
michael@0 28 #include "cmemory.h"
michael@0 29 #include "cstring.h"
michael@0 30 #include "uvectr32.h"
michael@0 31 #include "uvectr64.h"
michael@0 32 #include "uassert.h"
michael@0 33 #include "ucln_in.h"
michael@0 34 #include "uinvchar.h"
michael@0 35
michael@0 36 #include "regeximp.h"
michael@0 37 #include "regexcst.h" // Contains state table for the regex pattern parser.
michael@0 38 // generated by a Perl script.
michael@0 39 #include "regexcmp.h"
michael@0 40 #include "regexst.h"
michael@0 41 #include "regextxt.h"
michael@0 42
michael@0 43
michael@0 44
michael@0 45 U_NAMESPACE_BEGIN
michael@0 46
michael@0 47
michael@0 48 //------------------------------------------------------------------------------
michael@0 49 //
michael@0 50 // Constructor.
michael@0 51 //
michael@0 52 //------------------------------------------------------------------------------
michael@0 53 RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) :
michael@0 54 fParenStack(status), fSetStack(status), fSetOpStack(status)
michael@0 55 {
michael@0 56 // Lazy init of all shared global sets (needed for init()'s empty text)
michael@0 57 RegexStaticSets::initGlobals(&status);
michael@0 58
michael@0 59 fStatus = &status;
michael@0 60
michael@0 61 fRXPat = rxp;
michael@0 62 fScanIndex = 0;
michael@0 63 fLastChar = -1;
michael@0 64 fPeekChar = -1;
michael@0 65 fLineNum = 1;
michael@0 66 fCharNum = 0;
michael@0 67 fQuoteMode = FALSE;
michael@0 68 fInBackslashQuote = FALSE;
michael@0 69 fModeFlags = fRXPat->fFlags | 0x80000000;
michael@0 70 fEOLComments = TRUE;
michael@0 71
michael@0 72 fMatchOpenParen = -1;
michael@0 73 fMatchCloseParen = -1;
michael@0 74
michael@0 75 if (U_SUCCESS(status) && U_FAILURE(rxp->fDeferredStatus)) {
michael@0 76 status = rxp->fDeferredStatus;
michael@0 77 }
michael@0 78 }
michael@0 79
michael@0 80 static const UChar chAmp = 0x26; // '&'
michael@0 81 static const UChar chDash = 0x2d; // '-'
michael@0 82
michael@0 83
michael@0 84 //------------------------------------------------------------------------------
michael@0 85 //
michael@0 86 // Destructor
michael@0 87 //
michael@0 88 //------------------------------------------------------------------------------
michael@0 89 RegexCompile::~RegexCompile() {
michael@0 90 }
michael@0 91
michael@0 92 static inline void addCategory(UnicodeSet *set, int32_t value, UErrorCode& ec) {
michael@0 93 set->addAll(UnicodeSet().applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, value, ec));
michael@0 94 }
michael@0 95
michael@0 96 //------------------------------------------------------------------------------
michael@0 97 //
michael@0 98 // Compile regex pattern. The state machine for rexexp pattern parsing is here.
michael@0 99 // The state tables are hand-written in the file regexcst.txt,
michael@0 100 // and converted to the form used here by a perl
michael@0 101 // script regexcst.pl
michael@0 102 //
michael@0 103 //------------------------------------------------------------------------------
michael@0 104 void RegexCompile::compile(
michael@0 105 const UnicodeString &pat, // Source pat to be compiled.
michael@0 106 UParseError &pp, // Error position info
michael@0 107 UErrorCode &e) // Error Code
michael@0 108 {
michael@0 109 fRXPat->fPatternString = new UnicodeString(pat);
michael@0 110 UText patternText = UTEXT_INITIALIZER;
michael@0 111 utext_openConstUnicodeString(&patternText, fRXPat->fPatternString, &e);
michael@0 112
michael@0 113 if (U_SUCCESS(e)) {
michael@0 114 compile(&patternText, pp, e);
michael@0 115 utext_close(&patternText);
michael@0 116 }
michael@0 117 }
michael@0 118
michael@0 119 //
michael@0 120 // compile, UText mode
michael@0 121 // All the work is actually done here.
michael@0 122 //
michael@0 123 void RegexCompile::compile(
michael@0 124 UText *pat, // Source pat to be compiled.
michael@0 125 UParseError &pp, // Error position info
michael@0 126 UErrorCode &e) // Error Code
michael@0 127 {
michael@0 128 fStatus = &e;
michael@0 129 fParseErr = &pp;
michael@0 130 fStackPtr = 0;
michael@0 131 fStack[fStackPtr] = 0;
michael@0 132
michael@0 133 if (U_FAILURE(*fStatus)) {
michael@0 134 return;
michael@0 135 }
michael@0 136
michael@0 137 // There should be no pattern stuff in the RegexPattern object. They can not be reused.
michael@0 138 U_ASSERT(fRXPat->fPattern == NULL || utext_nativeLength(fRXPat->fPattern) == 0);
michael@0 139
michael@0 140 // Prepare the RegexPattern object to receive the compiled pattern.
michael@0 141 fRXPat->fPattern = utext_clone(fRXPat->fPattern, pat, FALSE, TRUE, fStatus);
michael@0 142 fRXPat->fStaticSets = RegexStaticSets::gStaticSets->fPropSets;
michael@0 143 fRXPat->fStaticSets8 = RegexStaticSets::gStaticSets->fPropSets8;
michael@0 144
michael@0 145
michael@0 146 // Initialize the pattern scanning state machine
michael@0 147 fPatternLength = utext_nativeLength(pat);
michael@0 148 uint16_t state = 1;
michael@0 149 const RegexTableEl *tableEl;
michael@0 150
michael@0 151 // UREGEX_LITERAL force entire pattern to be treated as a literal string.
michael@0 152 if (fModeFlags & UREGEX_LITERAL) {
michael@0 153 fQuoteMode = TRUE;
michael@0 154 }
michael@0 155
michael@0 156 nextChar(fC); // Fetch the first char from the pattern string.
michael@0 157
michael@0 158 //
michael@0 159 // Main loop for the regex pattern parsing state machine.
michael@0 160 // Runs once per state transition.
michael@0 161 // Each time through optionally performs, depending on the state table,
michael@0 162 // - an advance to the the next pattern char
michael@0 163 // - an action to be performed.
michael@0 164 // - pushing or popping a state to/from the local state return stack.
michael@0 165 // file regexcst.txt is the source for the state table. The logic behind
michael@0 166 // recongizing the pattern syntax is there, not here.
michael@0 167 //
michael@0 168 for (;;) {
michael@0 169 // Bail out if anything has gone wrong.
michael@0 170 // Regex pattern parsing stops on the first error encountered.
michael@0 171 if (U_FAILURE(*fStatus)) {
michael@0 172 break;
michael@0 173 }
michael@0 174
michael@0 175 U_ASSERT(state != 0);
michael@0 176
michael@0 177 // Find the state table element that matches the input char from the pattern, or the
michael@0 178 // class of the input character. Start with the first table row for this
michael@0 179 // state, then linearly scan forward until we find a row that matches the
michael@0 180 // character. The last row for each state always matches all characters, so
michael@0 181 // the search will stop there, if not before.
michael@0 182 //
michael@0 183 tableEl = &gRuleParseStateTable[state];
michael@0 184 REGEX_SCAN_DEBUG_PRINTF(("char, line, col = (\'%c\', %d, %d) state=%s ",
michael@0 185 fC.fChar, fLineNum, fCharNum, RegexStateNames[state]));
michael@0 186
michael@0 187 for (;;) { // loop through table rows belonging to this state, looking for one
michael@0 188 // that matches the current input char.
michael@0 189 REGEX_SCAN_DEBUG_PRINTF(("."));
michael@0 190 if (tableEl->fCharClass < 127 && fC.fQuoted == FALSE && tableEl->fCharClass == fC.fChar) {
michael@0 191 // Table row specified an individual character, not a set, and
michael@0 192 // the input character is not quoted, and
michael@0 193 // the input character matched it.
michael@0 194 break;
michael@0 195 }
michael@0 196 if (tableEl->fCharClass == 255) {
michael@0 197 // Table row specified default, match anything character class.
michael@0 198 break;
michael@0 199 }
michael@0 200 if (tableEl->fCharClass == 254 && fC.fQuoted) {
michael@0 201 // Table row specified "quoted" and the char was quoted.
michael@0 202 break;
michael@0 203 }
michael@0 204 if (tableEl->fCharClass == 253 && fC.fChar == (UChar32)-1) {
michael@0 205 // Table row specified eof and we hit eof on the input.
michael@0 206 break;
michael@0 207 }
michael@0 208
michael@0 209 if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 && // Table specs a char class &&
michael@0 210 fC.fQuoted == FALSE && // char is not escaped &&
michael@0 211 fC.fChar != (UChar32)-1) { // char is not EOF
michael@0 212 U_ASSERT(tableEl->fCharClass <= 137);
michael@0 213 if (RegexStaticSets::gStaticSets->fRuleSets[tableEl->fCharClass-128].contains(fC.fChar)) {
michael@0 214 // Table row specified a character class, or set of characters,
michael@0 215 // and the current char matches it.
michael@0 216 break;
michael@0 217 }
michael@0 218 }
michael@0 219
michael@0 220 // No match on this row, advance to the next row for this state,
michael@0 221 tableEl++;
michael@0 222 }
michael@0 223 REGEX_SCAN_DEBUG_PRINTF(("\n"));
michael@0 224
michael@0 225 //
michael@0 226 // We've found the row of the state table that matches the current input
michael@0 227 // character from the rules string.
michael@0 228 // Perform any action specified by this row in the state table.
michael@0 229 if (doParseActions(tableEl->fAction) == FALSE) {
michael@0 230 // Break out of the state machine loop if the
michael@0 231 // the action signalled some kind of error, or
michael@0 232 // the action was to exit, occurs on normal end-of-rules-input.
michael@0 233 break;
michael@0 234 }
michael@0 235
michael@0 236 if (tableEl->fPushState != 0) {
michael@0 237 fStackPtr++;
michael@0 238 if (fStackPtr >= kStackSize) {
michael@0 239 error(U_REGEX_INTERNAL_ERROR);
michael@0 240 REGEX_SCAN_DEBUG_PRINTF(("RegexCompile::parse() - state stack overflow.\n"));
michael@0 241 fStackPtr--;
michael@0 242 }
michael@0 243 fStack[fStackPtr] = tableEl->fPushState;
michael@0 244 }
michael@0 245
michael@0 246 //
michael@0 247 // NextChar. This is where characters are actually fetched from the pattern.
michael@0 248 // Happens under control of the 'n' tag in the state table.
michael@0 249 //
michael@0 250 if (tableEl->fNextChar) {
michael@0 251 nextChar(fC);
michael@0 252 }
michael@0 253
michael@0 254 // Get the next state from the table entry, or from the
michael@0 255 // state stack if the next state was specified as "pop".
michael@0 256 if (tableEl->fNextState != 255) {
michael@0 257 state = tableEl->fNextState;
michael@0 258 } else {
michael@0 259 state = fStack[fStackPtr];
michael@0 260 fStackPtr--;
michael@0 261 if (fStackPtr < 0) {
michael@0 262 // state stack underflow
michael@0 263 // This will occur if the user pattern has mis-matched parentheses,
michael@0 264 // with extra close parens.
michael@0 265 //
michael@0 266 fStackPtr++;
michael@0 267 error(U_REGEX_MISMATCHED_PAREN);
michael@0 268 }
michael@0 269 }
michael@0 270
michael@0 271 }
michael@0 272
michael@0 273 if (U_FAILURE(*fStatus)) {
michael@0 274 // Bail out if the pattern had errors.
michael@0 275 // Set stack cleanup: a successful compile would have left it empty,
michael@0 276 // but errors can leave temporary sets hanging around.
michael@0 277 while (!fSetStack.empty()) {
michael@0 278 delete (UnicodeSet *)fSetStack.pop();
michael@0 279 }
michael@0 280 return;
michael@0 281 }
michael@0 282
michael@0 283 //
michael@0 284 // The pattern has now been read and processed, and the compiled code generated.
michael@0 285 //
michael@0 286
michael@0 287 //
michael@0 288 // Compute the number of digits requried for the largest capture group number.
michael@0 289 //
michael@0 290 fRXPat->fMaxCaptureDigits = 1;
michael@0 291 int32_t n = 10;
michael@0 292 int32_t groupCount = fRXPat->fGroupMap->size();
michael@0 293 while (n <= groupCount) {
michael@0 294 fRXPat->fMaxCaptureDigits++;
michael@0 295 n *= 10;
michael@0 296 }
michael@0 297
michael@0 298 //
michael@0 299 // The pattern's fFrameSize so far has accumulated the requirements for
michael@0 300 // storage for capture parentheses, counters, etc. that are encountered
michael@0 301 // in the pattern. Add space for the two variables that are always
michael@0 302 // present in the saved state: the input string position (int64_t) and
michael@0 303 // the position in the compiled pattern.
michael@0 304 //
michael@0 305 fRXPat->fFrameSize+=RESTACKFRAME_HDRCOUNT;
michael@0 306
michael@0 307 //
michael@0 308 // Optimization pass 1: NOPs, back-references, and case-folding
michael@0 309 //
michael@0 310 stripNOPs();
michael@0 311
michael@0 312 //
michael@0 313 // Get bounds for the minimum and maximum length of a string that this
michael@0 314 // pattern can match. Used to avoid looking for matches in strings that
michael@0 315 // are too short.
michael@0 316 //
michael@0 317 fRXPat->fMinMatchLen = minMatchLength(3, fRXPat->fCompiledPat->size()-1);
michael@0 318
michael@0 319 //
michael@0 320 // Optimization pass 2: match start type
michael@0 321 //
michael@0 322 matchStartType();
michael@0 323
michael@0 324 //
michael@0 325 // Set up fast latin-1 range sets
michael@0 326 //
michael@0 327 int32_t numSets = fRXPat->fSets->size();
michael@0 328 fRXPat->fSets8 = new Regex8BitSet[numSets];
michael@0 329 // Null pointer check.
michael@0 330 if (fRXPat->fSets8 == NULL) {
michael@0 331 e = *fStatus = U_MEMORY_ALLOCATION_ERROR;
michael@0 332 return;
michael@0 333 }
michael@0 334 int32_t i;
michael@0 335 for (i=0; i<numSets; i++) {
michael@0 336 UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(i);
michael@0 337 fRXPat->fSets8[i].init(s);
michael@0 338 }
michael@0 339
michael@0 340 }
michael@0 341
michael@0 342
michael@0 343
michael@0 344
michael@0 345
michael@0 346 //------------------------------------------------------------------------------
michael@0 347 //
michael@0 348 // doParseAction Do some action during regex pattern parsing.
michael@0 349 // Called by the parse state machine.
michael@0 350 //
michael@0 351 // Generation of the match engine PCode happens here, or
michael@0 352 // in functions called from the parse actions defined here.
michael@0 353 //
michael@0 354 //
michael@0 355 //------------------------------------------------------------------------------
michael@0 356 UBool RegexCompile::doParseActions(int32_t action)
michael@0 357 {
michael@0 358 UBool returnVal = TRUE;
michael@0 359
michael@0 360 switch ((Regex_PatternParseAction)action) {
michael@0 361
michael@0 362 case doPatStart:
michael@0 363 // Start of pattern compiles to:
michael@0 364 //0 SAVE 2 Fall back to position of FAIL
michael@0 365 //1 jmp 3
michael@0 366 //2 FAIL Stop if we ever reach here.
michael@0 367 //3 NOP Dummy, so start of pattern looks the same as
michael@0 368 // the start of an ( grouping.
michael@0 369 //4 NOP Resreved, will be replaced by a save if there are
michael@0 370 // OR | operators at the top level
michael@0 371 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_STATE_SAVE, 2), *fStatus);
michael@0 372 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_JMP, 3), *fStatus);
michael@0 373 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_FAIL, 0), *fStatus);
michael@0 374
michael@0 375 // Standard open nonCapture paren action emits the two NOPs and
michael@0 376 // sets up the paren stack frame.
michael@0 377 doParseActions(doOpenNonCaptureParen);
michael@0 378 break;
michael@0 379
michael@0 380 case doPatFinish:
michael@0 381 // We've scanned to the end of the pattern
michael@0 382 // The end of pattern compiles to:
michael@0 383 // URX_END
michael@0 384 // which will stop the runtime match engine.
michael@0 385 // Encountering end of pattern also behaves like a close paren,
michael@0 386 // and forces fixups of the State Save at the beginning of the compiled pattern
michael@0 387 // and of any OR operations at the top level.
michael@0 388 //
michael@0 389 handleCloseParen();
michael@0 390 if (fParenStack.size() > 0) {
michael@0 391 // Missing close paren in pattern.
michael@0 392 error(U_REGEX_MISMATCHED_PAREN);
michael@0 393 }
michael@0 394
michael@0 395 // add the END operation to the compiled pattern.
michael@0 396 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_END, 0), *fStatus);
michael@0 397
michael@0 398 // Terminate the pattern compilation state machine.
michael@0 399 returnVal = FALSE;
michael@0 400 break;
michael@0 401
michael@0 402
michael@0 403
michael@0 404 case doOrOperator:
michael@0 405 // Scanning a '|', as in (A|B)
michael@0 406 {
michael@0 407 // Generate code for any pending literals preceding the '|'
michael@0 408 fixLiterals(FALSE);
michael@0 409
michael@0 410 // Insert a SAVE operation at the start of the pattern section preceding
michael@0 411 // this OR at this level. This SAVE will branch the match forward
michael@0 412 // to the right hand side of the OR in the event that the left hand
michael@0 413 // side fails to match and backtracks. Locate the position for the
michael@0 414 // save from the location on the top of the parentheses stack.
michael@0 415 int32_t savePosition = fParenStack.popi();
michael@0 416 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(savePosition);
michael@0 417 U_ASSERT(URX_TYPE(op) == URX_NOP); // original contents of reserved location
michael@0 418 op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+1);
michael@0 419 fRXPat->fCompiledPat->setElementAt(op, savePosition);
michael@0 420
michael@0 421 // Append an JMP operation into the compiled pattern. The operand for
michael@0 422 // the JMP will eventually be the location following the ')' for the
michael@0 423 // group. This will be patched in later, when the ')' is encountered.
michael@0 424 op = URX_BUILD(URX_JMP, 0);
michael@0 425 fRXPat->fCompiledPat->addElement(op, *fStatus);
michael@0 426
michael@0 427 // Push the position of the newly added JMP op onto the parentheses stack.
michael@0 428 // This registers if for fixup when this block's close paren is encountered.
michael@0 429 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
michael@0 430
michael@0 431 // Append a NOP to the compiled pattern. This is the slot reserved
michael@0 432 // for a SAVE in the event that there is yet another '|' following
michael@0 433 // this one.
michael@0 434 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
michael@0 435 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
michael@0 436 }
michael@0 437 break;
michael@0 438
michael@0 439
michael@0 440 case doOpenCaptureParen:
michael@0 441 // Open Paren.
michael@0 442 // Compile to a
michael@0 443 // - NOP, which later may be replaced by a save-state if the
michael@0 444 // parenthesized group gets a * quantifier, followed by
michael@0 445 // - START_CAPTURE n where n is stack frame offset to the capture group variables.
michael@0 446 // - NOP, which may later be replaced by a save-state if there
michael@0 447 // is an '|' alternation within the parens.
michael@0 448 //
michael@0 449 // Each capture group gets three slots in the save stack frame:
michael@0 450 // 0: Capture Group start position (in input string being matched.)
michael@0 451 // 1: Capture Group end position.
michael@0 452 // 2: Start of Match-in-progress.
michael@0 453 // The first two locations are for a completed capture group, and are
michael@0 454 // referred to by back references and the like.
michael@0 455 // The third location stores the capture start position when an START_CAPTURE is
michael@0 456 // encountered. This will be promoted to a completed capture when (and if) the corresponding
michael@0 457 // END_CAPTURE is encountered.
michael@0 458 {
michael@0 459 fixLiterals();
michael@0 460 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
michael@0 461 int32_t varsLoc = fRXPat->fFrameSize; // Reserve three slots in match stack frame.
michael@0 462 fRXPat->fFrameSize += 3;
michael@0 463 int32_t cop = URX_BUILD(URX_START_CAPTURE, varsLoc);
michael@0 464 fRXPat->fCompiledPat->addElement(cop, *fStatus);
michael@0 465 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
michael@0 466
michael@0 467 // On the Parentheses stack, start a new frame and add the postions
michael@0 468 // of the two NOPs. Depending on what follows in the pattern, the
michael@0 469 // NOPs may be changed to SAVE_STATE or JMP ops, with a target
michael@0 470 // address of the end of the parenthesized group.
michael@0 471 fParenStack.push(fModeFlags, *fStatus); // Match mode state
michael@0 472 fParenStack.push(capturing, *fStatus); // Frame type.
michael@0 473 fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The first NOP location
michael@0 474 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP loc
michael@0 475
michael@0 476 // Save the mapping from group number to stack frame variable position.
michael@0 477 fRXPat->fGroupMap->addElement(varsLoc, *fStatus);
michael@0 478 }
michael@0 479 break;
michael@0 480
michael@0 481 case doOpenNonCaptureParen:
michael@0 482 // Open non-caputuring (grouping only) Paren.
michael@0 483 // Compile to a
michael@0 484 // - NOP, which later may be replaced by a save-state if the
michael@0 485 // parenthesized group gets a * quantifier, followed by
michael@0 486 // - NOP, which may later be replaced by a save-state if there
michael@0 487 // is an '|' alternation within the parens.
michael@0 488 {
michael@0 489 fixLiterals();
michael@0 490 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
michael@0 491 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
michael@0 492
michael@0 493 // On the Parentheses stack, start a new frame and add the postions
michael@0 494 // of the two NOPs.
michael@0 495 fParenStack.push(fModeFlags, *fStatus); // Match mode state
michael@0 496 fParenStack.push(plain, *fStatus); // Begin a new frame.
michael@0 497 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location
michael@0 498 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP loc
michael@0 499 }
michael@0 500 break;
michael@0 501
michael@0 502
michael@0 503 case doOpenAtomicParen:
michael@0 504 // Open Atomic Paren. (?>
michael@0 505 // Compile to a
michael@0 506 // - NOP, which later may be replaced if the parenthesized group
michael@0 507 // has a quantifier, followed by
michael@0 508 // - STO_SP save state stack position, so it can be restored at the ")"
michael@0 509 // - NOP, which may later be replaced by a save-state if there
michael@0 510 // is an '|' alternation within the parens.
michael@0 511 {
michael@0 512 fixLiterals();
michael@0 513 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
michael@0 514 int32_t varLoc = fRXPat->fDataSize; // Reserve a data location for saving the
michael@0 515 fRXPat->fDataSize += 1; // state stack ptr.
michael@0 516 int32_t stoOp = URX_BUILD(URX_STO_SP, varLoc);
michael@0 517 fRXPat->fCompiledPat->addElement(stoOp, *fStatus);
michael@0 518 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
michael@0 519
michael@0 520 // On the Parentheses stack, start a new frame and add the postions
michael@0 521 // of the two NOPs. Depending on what follows in the pattern, the
michael@0 522 // NOPs may be changed to SAVE_STATE or JMP ops, with a target
michael@0 523 // address of the end of the parenthesized group.
michael@0 524 fParenStack.push(fModeFlags, *fStatus); // Match mode state
michael@0 525 fParenStack.push(atomic, *fStatus); // Frame type.
michael@0 526 fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The first NOP
michael@0 527 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP
michael@0 528 }
michael@0 529 break;
michael@0 530
michael@0 531
michael@0 532 case doOpenLookAhead:
michael@0 533 // Positive Look-ahead (?= stuff )
michael@0 534 //
michael@0 535 // Note: Addition of transparent input regions, with the need to
michael@0 536 // restore the original regions when failing out of a lookahead
michael@0 537 // block, complicated this sequence. Some conbined opcodes
michael@0 538 // might make sense - or might not, lookahead aren't that common.
michael@0 539 //
michael@0 540 // Caution: min match length optimization knows about this
michael@0 541 // sequence; don't change without making updates there too.
michael@0 542 //
michael@0 543 // Compiles to
michael@0 544 // 1 START_LA dataLoc Saves SP, Input Pos
michael@0 545 // 2. STATE_SAVE 4 on failure of lookahead, goto 4
michael@0 546 // 3 JMP 6 continue ...
michael@0 547 //
michael@0 548 // 4. LA_END Look Ahead failed. Restore regions.
michael@0 549 // 5. BACKTRACK and back track again.
michael@0 550 //
michael@0 551 // 6. NOP reserved for use by quantifiers on the block.
michael@0 552 // Look-ahead can't have quantifiers, but paren stack
michael@0 553 // compile time conventions require the slot anyhow.
michael@0 554 // 7. NOP may be replaced if there is are '|' ops in the block.
michael@0 555 // 8. code for parenthesized stuff.
michael@0 556 // 9. LA_END
michael@0 557 //
michael@0 558 // Two data slots are reserved, for saving the stack ptr and the input position.
michael@0 559 {
michael@0 560 fixLiterals();
michael@0 561 int32_t dataLoc = fRXPat->fDataSize;
michael@0 562 fRXPat->fDataSize += 2;
michael@0 563 int32_t op = URX_BUILD(URX_LA_START, dataLoc);
michael@0 564 fRXPat->fCompiledPat->addElement(op, *fStatus);
michael@0 565
michael@0 566 op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+ 2);
michael@0 567 fRXPat->fCompiledPat->addElement(op, *fStatus);
michael@0 568
michael@0 569 op = URX_BUILD(URX_JMP, fRXPat->fCompiledPat->size()+ 3);
michael@0 570 fRXPat->fCompiledPat->addElement(op, *fStatus);
michael@0 571
michael@0 572 op = URX_BUILD(URX_LA_END, dataLoc);
michael@0 573 fRXPat->fCompiledPat->addElement(op, *fStatus);
michael@0 574
michael@0 575 op = URX_BUILD(URX_BACKTRACK, 0);
michael@0 576 fRXPat->fCompiledPat->addElement(op, *fStatus);
michael@0 577
michael@0 578 op = URX_BUILD(URX_NOP, 0);
michael@0 579 fRXPat->fCompiledPat->addElement(op, *fStatus);
michael@0 580 fRXPat->fCompiledPat->addElement(op, *fStatus);
michael@0 581
michael@0 582 // On the Parentheses stack, start a new frame and add the postions
michael@0 583 // of the NOPs.
michael@0 584 fParenStack.push(fModeFlags, *fStatus); // Match mode state
michael@0 585 fParenStack.push(lookAhead, *fStatus); // Frame type.
michael@0 586 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location
michael@0 587 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP location
michael@0 588 }
michael@0 589 break;
michael@0 590
michael@0 591 case doOpenLookAheadNeg:
michael@0 592 // Negated Lookahead. (?! stuff )
michael@0 593 // Compiles to
michael@0 594 // 1. START_LA dataloc
michael@0 595 // 2. SAVE_STATE 7 // Fail within look-ahead block restores to this state,
michael@0 596 // // which continues with the match.
michael@0 597 // 3. NOP // Std. Open Paren sequence, for possible '|'
michael@0 598 // 4. code for parenthesized stuff.
michael@0 599 // 5. END_LA // Cut back stack, remove saved state from step 2.
michael@0 600 // 6. BACKTRACK // code in block succeeded, so neg. lookahead fails.
michael@0 601 // 7. END_LA // Restore match region, in case look-ahead was using
michael@0 602 // an alternate (transparent) region.
michael@0 603 {
michael@0 604 fixLiterals();
michael@0 605 int32_t dataLoc = fRXPat->fDataSize;
michael@0 606 fRXPat->fDataSize += 2;
michael@0 607 int32_t op = URX_BUILD(URX_LA_START, dataLoc);
michael@0 608 fRXPat->fCompiledPat->addElement(op, *fStatus);
michael@0 609
michael@0 610 op = URX_BUILD(URX_STATE_SAVE, 0); // dest address will be patched later.
michael@0 611 fRXPat->fCompiledPat->addElement(op, *fStatus);
michael@0 612
michael@0 613 op = URX_BUILD(URX_NOP, 0);
michael@0 614 fRXPat->fCompiledPat->addElement(op, *fStatus);
michael@0 615
michael@0 616 // On the Parentheses stack, start a new frame and add the postions
michael@0 617 // of the StateSave and NOP.
michael@0 618 fParenStack.push(fModeFlags, *fStatus); // Match mode state
michael@0 619 fParenStack.push(negLookAhead, *fStatus); // Frame type
michael@0 620 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The STATE_SAVE location
michael@0 621 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP location
michael@0 622
michael@0 623 // Instructions #5 - #7 will be added when the ')' is encountered.
michael@0 624 }
michael@0 625 break;
michael@0 626
michael@0 627 case doOpenLookBehind:
michael@0 628 {
michael@0 629 // Compile a (?<= look-behind open paren.
michael@0 630 //
michael@0 631 // Compiles to
michael@0 632 // 0 URX_LB_START dataLoc
michael@0 633 // 1 URX_LB_CONT dataLoc
michael@0 634 // 2 MinMatchLen
michael@0 635 // 3 MaxMatchLen
michael@0 636 // 4 URX_NOP Standard '(' boilerplate.
michael@0 637 // 5 URX_NOP Reserved slot for use with '|' ops within (block).
michael@0 638 // 6 <code for LookBehind expression>
michael@0 639 // 7 URX_LB_END dataLoc # Check match len, restore input len
michael@0 640 // 8 URX_LA_END dataLoc # Restore stack, input pos
michael@0 641 //
michael@0 642 // Allocate a block of matcher data, to contain (when running a match)
michael@0 643 // 0: Stack ptr on entry
michael@0 644 // 1: Input Index on entry
michael@0 645 // 2: Start index of match current match attempt.
michael@0 646 // 3: Original Input String len.
michael@0 647
michael@0 648 // Generate match code for any pending literals.
michael@0 649 fixLiterals();
michael@0 650
michael@0 651 // Allocate data space
michael@0 652 int32_t dataLoc = fRXPat->fDataSize;
michael@0 653 fRXPat->fDataSize += 4;
michael@0 654
michael@0 655 // Emit URX_LB_START
michael@0 656 int32_t op = URX_BUILD(URX_LB_START, dataLoc);
michael@0 657 fRXPat->fCompiledPat->addElement(op, *fStatus);
michael@0 658
michael@0 659 // Emit URX_LB_CONT
michael@0 660 op = URX_BUILD(URX_LB_CONT, dataLoc);
michael@0 661 fRXPat->fCompiledPat->addElement(op, *fStatus);
michael@0 662 fRXPat->fCompiledPat->addElement(0, *fStatus); // MinMatchLength. To be filled later.
michael@0 663 fRXPat->fCompiledPat->addElement(0, *fStatus); // MaxMatchLength. To be filled later.
michael@0 664
michael@0 665 // Emit the NOP
michael@0 666 op = URX_BUILD(URX_NOP, 0);
michael@0 667 fRXPat->fCompiledPat->addElement(op, *fStatus);
michael@0 668 fRXPat->fCompiledPat->addElement(op, *fStatus);
michael@0 669
michael@0 670 // On the Parentheses stack, start a new frame and add the postions
michael@0 671 // of the URX_LB_CONT and the NOP.
michael@0 672 fParenStack.push(fModeFlags, *fStatus); // Match mode state
michael@0 673 fParenStack.push(lookBehind, *fStatus); // Frame type
michael@0 674 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location
michael@0 675 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The 2nd NOP location
michael@0 676
michael@0 677 // The final two instructions will be added when the ')' is encountered.
michael@0 678 }
michael@0 679
michael@0 680 break;
michael@0 681
michael@0 682 case doOpenLookBehindNeg:
michael@0 683 {
michael@0 684 // Compile a (?<! negated look-behind open paren.
michael@0 685 //
michael@0 686 // Compiles to
michael@0 687 // 0 URX_LB_START dataLoc # Save entry stack, input len
michael@0 688 // 1 URX_LBN_CONT dataLoc # Iterate possible match positions
michael@0 689 // 2 MinMatchLen
michael@0 690 // 3 MaxMatchLen
michael@0 691 // 4 continueLoc (9)
michael@0 692 // 5 URX_NOP Standard '(' boilerplate.
michael@0 693 // 6 URX_NOP Reserved slot for use with '|' ops within (block).
michael@0 694 // 7 <code for LookBehind expression>
michael@0 695 // 8 URX_LBN_END dataLoc # Check match len, cause a FAIL
michael@0 696 // 9 ...
michael@0 697 //
michael@0 698 // Allocate a block of matcher data, to contain (when running a match)
michael@0 699 // 0: Stack ptr on entry
michael@0 700 // 1: Input Index on entry
michael@0 701 // 2: Start index of match current match attempt.
michael@0 702 // 3: Original Input String len.
michael@0 703
michael@0 704 // Generate match code for any pending literals.
michael@0 705 fixLiterals();
michael@0 706
michael@0 707 // Allocate data space
michael@0 708 int32_t dataLoc = fRXPat->fDataSize;
michael@0 709 fRXPat->fDataSize += 4;
michael@0 710
michael@0 711 // Emit URX_LB_START
michael@0 712 int32_t op = URX_BUILD(URX_LB_START, dataLoc);
michael@0 713 fRXPat->fCompiledPat->addElement(op, *fStatus);
michael@0 714
michael@0 715 // Emit URX_LBN_CONT
michael@0 716 op = URX_BUILD(URX_LBN_CONT, dataLoc);
michael@0 717 fRXPat->fCompiledPat->addElement(op, *fStatus);
michael@0 718 fRXPat->fCompiledPat->addElement(0, *fStatus); // MinMatchLength. To be filled later.
michael@0 719 fRXPat->fCompiledPat->addElement(0, *fStatus); // MaxMatchLength. To be filled later.
michael@0 720 fRXPat->fCompiledPat->addElement(0, *fStatus); // Continue Loc. To be filled later.
michael@0 721
michael@0 722 // Emit the NOP
michael@0 723 op = URX_BUILD(URX_NOP, 0);
michael@0 724 fRXPat->fCompiledPat->addElement(op, *fStatus);
michael@0 725 fRXPat->fCompiledPat->addElement(op, *fStatus);
michael@0 726
michael@0 727 // On the Parentheses stack, start a new frame and add the postions
michael@0 728 // of the URX_LB_CONT and the NOP.
michael@0 729 fParenStack.push(fModeFlags, *fStatus); // Match mode state
michael@0 730 fParenStack.push(lookBehindN, *fStatus); // Frame type
michael@0 731 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location
michael@0 732 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The 2nd NOP location
michael@0 733
michael@0 734 // The final two instructions will be added when the ')' is encountered.
michael@0 735 }
michael@0 736 break;
michael@0 737
michael@0 738 case doConditionalExpr:
michael@0 739 // Conditionals such as (?(1)a:b)
michael@0 740 case doPerlInline:
michael@0 741 // Perl inline-condtionals. (?{perl code}a|b) We're not perl, no way to do them.
michael@0 742 error(U_REGEX_UNIMPLEMENTED);
michael@0 743 break;
michael@0 744
michael@0 745
michael@0 746 case doCloseParen:
michael@0 747 handleCloseParen();
michael@0 748 if (fParenStack.size() <= 0) {
michael@0 749 // Extra close paren, or missing open paren.
michael@0 750 error(U_REGEX_MISMATCHED_PAREN);
michael@0 751 }
michael@0 752 break;
michael@0 753
michael@0 754 case doNOP:
michael@0 755 break;
michael@0 756
michael@0 757
michael@0 758 case doBadOpenParenType:
michael@0 759 case doRuleError:
michael@0 760 error(U_REGEX_RULE_SYNTAX);
michael@0 761 break;
michael@0 762
michael@0 763
michael@0 764 case doMismatchedParenErr:
michael@0 765 error(U_REGEX_MISMATCHED_PAREN);
michael@0 766 break;
michael@0 767
michael@0 768 case doPlus:
michael@0 769 // Normal '+' compiles to
michael@0 770 // 1. stuff to be repeated (already built)
michael@0 771 // 2. jmp-sav 1
michael@0 772 // 3. ...
michael@0 773 //
michael@0 774 // Or, if the item to be repeated can match a zero length string,
michael@0 775 // 1. STO_INP_LOC data-loc
michael@0 776 // 2. body of stuff to be repeated
michael@0 777 // 3. JMP_SAV_X 2
michael@0 778 // 4. ...
michael@0 779
michael@0 780 //
michael@0 781 // Or, if the item to be repeated is simple
michael@0 782 // 1. Item to be repeated.
michael@0 783 // 2. LOOP_SR_I set number (assuming repeated item is a set ref)
michael@0 784 // 3. LOOP_C stack location
michael@0 785 {
michael@0 786 int32_t topLoc = blockTopLoc(FALSE); // location of item #1
michael@0 787 int32_t frameLoc;
michael@0 788
michael@0 789 // Check for simple constructs, which may get special optimized code.
michael@0 790 if (topLoc == fRXPat->fCompiledPat->size() - 1) {
michael@0 791 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc);
michael@0 792
michael@0 793 if (URX_TYPE(repeatedOp) == URX_SETREF) {
michael@0 794 // Emit optimized code for [char set]+
michael@0 795 int32_t loopOpI = URX_BUILD(URX_LOOP_SR_I, URX_VAL(repeatedOp));
michael@0 796 fRXPat->fCompiledPat->addElement(loopOpI, *fStatus);
michael@0 797 frameLoc = fRXPat->fFrameSize;
michael@0 798 fRXPat->fFrameSize++;
michael@0 799 int32_t loopOpC = URX_BUILD(URX_LOOP_C, frameLoc);
michael@0 800 fRXPat->fCompiledPat->addElement(loopOpC, *fStatus);
michael@0 801 break;
michael@0 802 }
michael@0 803
michael@0 804 if (URX_TYPE(repeatedOp) == URX_DOTANY ||
michael@0 805 URX_TYPE(repeatedOp) == URX_DOTANY_ALL ||
michael@0 806 URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
michael@0 807 // Emit Optimized code for .+ operations.
michael@0 808 int32_t loopOpI = URX_BUILD(URX_LOOP_DOT_I, 0);
michael@0 809 if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
michael@0 810 // URX_LOOP_DOT_I operand is a flag indicating ". matches any" mode.
michael@0 811 loopOpI |= 1;
michael@0 812 }
michael@0 813 if (fModeFlags & UREGEX_UNIX_LINES) {
michael@0 814 loopOpI |= 2;
michael@0 815 }
michael@0 816 fRXPat->fCompiledPat->addElement(loopOpI, *fStatus);
michael@0 817 frameLoc = fRXPat->fFrameSize;
michael@0 818 fRXPat->fFrameSize++;
michael@0 819 int32_t loopOpC = URX_BUILD(URX_LOOP_C, frameLoc);
michael@0 820 fRXPat->fCompiledPat->addElement(loopOpC, *fStatus);
michael@0 821 break;
michael@0 822 }
michael@0 823
michael@0 824 }
michael@0 825
michael@0 826 // General case.
michael@0 827
michael@0 828 // Check for minimum match length of zero, which requires
michael@0 829 // extra loop-breaking code.
michael@0 830 if (minMatchLength(topLoc, fRXPat->fCompiledPat->size()-1) == 0) {
michael@0 831 // Zero length match is possible.
michael@0 832 // Emit the code sequence that can handle it.
michael@0 833 insertOp(topLoc);
michael@0 834 frameLoc = fRXPat->fFrameSize;
michael@0 835 fRXPat->fFrameSize++;
michael@0 836
michael@0 837 int32_t op = URX_BUILD(URX_STO_INP_LOC, frameLoc);
michael@0 838 fRXPat->fCompiledPat->setElementAt(op, topLoc);
michael@0 839
michael@0 840 op = URX_BUILD(URX_JMP_SAV_X, topLoc+1);
michael@0 841 fRXPat->fCompiledPat->addElement(op, *fStatus);
michael@0 842 } else {
michael@0 843 // Simpler code when the repeated body must match something non-empty
michael@0 844 int32_t jmpOp = URX_BUILD(URX_JMP_SAV, topLoc);
michael@0 845 fRXPat->fCompiledPat->addElement(jmpOp, *fStatus);
michael@0 846 }
michael@0 847 }
michael@0 848 break;
michael@0 849
michael@0 850 case doNGPlus:
michael@0 851 // Non-greedy '+?' compiles to
michael@0 852 // 1. stuff to be repeated (already built)
michael@0 853 // 2. state-save 1
michael@0 854 // 3. ...
michael@0 855 {
michael@0 856 int32_t topLoc = blockTopLoc(FALSE);
michael@0 857 int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, topLoc);
michael@0 858 fRXPat->fCompiledPat->addElement(saveStateOp, *fStatus);
michael@0 859 }
michael@0 860 break;
michael@0 861
michael@0 862
michael@0 863 case doOpt:
michael@0 864 // Normal (greedy) ? quantifier.
michael@0 865 // Compiles to
michael@0 866 // 1. state save 3
michael@0 867 // 2. body of optional block
michael@0 868 // 3. ...
michael@0 869 // Insert the state save into the compiled pattern, and we're done.
michael@0 870 {
michael@0 871 int32_t saveStateLoc = blockTopLoc(TRUE);
michael@0 872 int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size());
michael@0 873 fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
michael@0 874 }
michael@0 875 break;
michael@0 876
michael@0 877 case doNGOpt:
michael@0 878 // Non-greedy ?? quantifier
michael@0 879 // compiles to
michael@0 880 // 1. jmp 4
michael@0 881 // 2. body of optional block
michael@0 882 // 3 jmp 5
michael@0 883 // 4. state save 2
michael@0 884 // 5 ...
michael@0 885 // This code is less than ideal, with two jmps instead of one, because we can only
michael@0 886 // insert one instruction at the top of the block being iterated.
michael@0 887 {
michael@0 888 int32_t jmp1_loc = blockTopLoc(TRUE);
michael@0 889 int32_t jmp2_loc = fRXPat->fCompiledPat->size();
michael@0 890
michael@0 891 int32_t jmp1_op = URX_BUILD(URX_JMP, jmp2_loc+1);
michael@0 892 fRXPat->fCompiledPat->setElementAt(jmp1_op, jmp1_loc);
michael@0 893
michael@0 894 int32_t jmp2_op = URX_BUILD(URX_JMP, jmp2_loc+2);
michael@0 895 fRXPat->fCompiledPat->addElement(jmp2_op, *fStatus);
michael@0 896
michael@0 897 int32_t save_op = URX_BUILD(URX_STATE_SAVE, jmp1_loc+1);
michael@0 898 fRXPat->fCompiledPat->addElement(save_op, *fStatus);
michael@0 899 }
michael@0 900 break;
michael@0 901
michael@0 902
michael@0 903 case doStar:
michael@0 904 // Normal (greedy) * quantifier.
michael@0 905 // Compiles to
michael@0 906 // 1. STATE_SAVE 4
michael@0 907 // 2. body of stuff being iterated over
michael@0 908 // 3. JMP_SAV 2
michael@0 909 // 4. ...
michael@0 910 //
michael@0 911 // Or, if the body is a simple [Set],
michael@0 912 // 1. LOOP_SR_I set number
michael@0 913 // 2. LOOP_C stack location
michael@0 914 // ...
michael@0 915 //
michael@0 916 // Or if this is a .*
michael@0 917 // 1. LOOP_DOT_I (. matches all mode flag)
michael@0 918 // 2. LOOP_C stack location
michael@0 919 //
michael@0 920 // Or, if the body can match a zero-length string, to inhibit infinite loops,
michael@0 921 // 1. STATE_SAVE 5
michael@0 922 // 2. STO_INP_LOC data-loc
michael@0 923 // 3. body of stuff
michael@0 924 // 4. JMP_SAV_X 2
michael@0 925 // 5. ...
michael@0 926 {
michael@0 927 // location of item #1, the STATE_SAVE
michael@0 928 int32_t topLoc = blockTopLoc(FALSE);
michael@0 929 int32_t dataLoc = -1;
michael@0 930
michael@0 931 // Check for simple *, where the construct being repeated
michael@0 932 // compiled to single opcode, and might be optimizable.
michael@0 933 if (topLoc == fRXPat->fCompiledPat->size() - 1) {
michael@0 934 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc);
michael@0 935
michael@0 936 if (URX_TYPE(repeatedOp) == URX_SETREF) {
michael@0 937 // Emit optimized code for a [char set]*
michael@0 938 int32_t loopOpI = URX_BUILD(URX_LOOP_SR_I, URX_VAL(repeatedOp));
michael@0 939 fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
michael@0 940 dataLoc = fRXPat->fFrameSize;
michael@0 941 fRXPat->fFrameSize++;
michael@0 942 int32_t loopOpC = URX_BUILD(URX_LOOP_C, dataLoc);
michael@0 943 fRXPat->fCompiledPat->addElement(loopOpC, *fStatus);
michael@0 944 break;
michael@0 945 }
michael@0 946
michael@0 947 if (URX_TYPE(repeatedOp) == URX_DOTANY ||
michael@0 948 URX_TYPE(repeatedOp) == URX_DOTANY_ALL ||
michael@0 949 URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
michael@0 950 // Emit Optimized code for .* operations.
michael@0 951 int32_t loopOpI = URX_BUILD(URX_LOOP_DOT_I, 0);
michael@0 952 if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
michael@0 953 // URX_LOOP_DOT_I operand is a flag indicating . matches any mode.
michael@0 954 loopOpI |= 1;
michael@0 955 }
michael@0 956 if ((fModeFlags & UREGEX_UNIX_LINES) != 0) {
michael@0 957 loopOpI |= 2;
michael@0 958 }
michael@0 959 fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
michael@0 960 dataLoc = fRXPat->fFrameSize;
michael@0 961 fRXPat->fFrameSize++;
michael@0 962 int32_t loopOpC = URX_BUILD(URX_LOOP_C, dataLoc);
michael@0 963 fRXPat->fCompiledPat->addElement(loopOpC, *fStatus);
michael@0 964 break;
michael@0 965 }
michael@0 966 }
michael@0 967
michael@0 968 // Emit general case code for this *
michael@0 969 // The optimizations did not apply.
michael@0 970
michael@0 971 int32_t saveStateLoc = blockTopLoc(TRUE);
michael@0 972 int32_t jmpOp = URX_BUILD(URX_JMP_SAV, saveStateLoc+1);
michael@0 973
michael@0 974 // Check for minimum match length of zero, which requires
michael@0 975 // extra loop-breaking code.
michael@0 976 if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-1) == 0) {
michael@0 977 insertOp(saveStateLoc);
michael@0 978 dataLoc = fRXPat->fFrameSize;
michael@0 979 fRXPat->fFrameSize++;
michael@0 980
michael@0 981 int32_t op = URX_BUILD(URX_STO_INP_LOC, dataLoc);
michael@0 982 fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1);
michael@0 983 jmpOp = URX_BUILD(URX_JMP_SAV_X, saveStateLoc+2);
michael@0 984 }
michael@0 985
michael@0 986 // Locate the position in the compiled pattern where the match will continue
michael@0 987 // after completing the *. (4 or 5 in the comment above)
michael@0 988 int32_t continueLoc = fRXPat->fCompiledPat->size()+1;
michael@0 989
michael@0 990 // Put together the save state op store it into the compiled code.
michael@0 991 int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, continueLoc);
michael@0 992 fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
michael@0 993
michael@0 994 // Append the URX_JMP_SAV or URX_JMPX operation to the compiled pattern.
michael@0 995 fRXPat->fCompiledPat->addElement(jmpOp, *fStatus);
michael@0 996 }
michael@0 997 break;
michael@0 998
michael@0 999 case doNGStar:
michael@0 1000 // Non-greedy *? quantifier
michael@0 1001 // compiles to
michael@0 1002 // 1. JMP 3
michael@0 1003 // 2. body of stuff being iterated over
michael@0 1004 // 3. STATE_SAVE 2
michael@0 1005 // 4 ...
michael@0 1006 {
michael@0 1007 int32_t jmpLoc = blockTopLoc(TRUE); // loc 1.
michael@0 1008 int32_t saveLoc = fRXPat->fCompiledPat->size(); // loc 3.
michael@0 1009 int32_t jmpOp = URX_BUILD(URX_JMP, saveLoc);
michael@0 1010 int32_t stateSaveOp = URX_BUILD(URX_STATE_SAVE, jmpLoc+1);
michael@0 1011 fRXPat->fCompiledPat->setElementAt(jmpOp, jmpLoc);
michael@0 1012 fRXPat->fCompiledPat->addElement(stateSaveOp, *fStatus);
michael@0 1013 }
michael@0 1014 break;
michael@0 1015
michael@0 1016
michael@0 1017 case doIntervalInit:
michael@0 1018 // The '{' opening an interval quantifier was just scanned.
michael@0 1019 // Init the counter varaiables that will accumulate the values as the digits
michael@0 1020 // are scanned.
michael@0 1021 fIntervalLow = 0;
michael@0 1022 fIntervalUpper = -1;
michael@0 1023 break;
michael@0 1024
michael@0 1025 case doIntevalLowerDigit:
michael@0 1026 // Scanned a digit from the lower value of an {lower,upper} interval
michael@0 1027 {
michael@0 1028 int32_t digitValue = u_charDigitValue(fC.fChar);
michael@0 1029 U_ASSERT(digitValue >= 0);
michael@0 1030 fIntervalLow = fIntervalLow*10 + digitValue;
michael@0 1031 if (fIntervalLow < 0) {
michael@0 1032 error(U_REGEX_NUMBER_TOO_BIG);
michael@0 1033 }
michael@0 1034 }
michael@0 1035 break;
michael@0 1036
michael@0 1037 case doIntervalUpperDigit:
michael@0 1038 // Scanned a digit from the upper value of an {lower,upper} interval
michael@0 1039 {
michael@0 1040 if (fIntervalUpper < 0) {
michael@0 1041 fIntervalUpper = 0;
michael@0 1042 }
michael@0 1043 int32_t digitValue = u_charDigitValue(fC.fChar);
michael@0 1044 U_ASSERT(digitValue >= 0);
michael@0 1045 fIntervalUpper = fIntervalUpper*10 + digitValue;
michael@0 1046 if (fIntervalUpper < 0) {
michael@0 1047 error(U_REGEX_NUMBER_TOO_BIG);
michael@0 1048 }
michael@0 1049 }
michael@0 1050 break;
michael@0 1051
michael@0 1052 case doIntervalSame:
michael@0 1053 // Scanned a single value interval like {27}. Upper = Lower.
michael@0 1054 fIntervalUpper = fIntervalLow;
michael@0 1055 break;
michael@0 1056
michael@0 1057 case doInterval:
michael@0 1058 // Finished scanning a normal {lower,upper} interval. Generate the code for it.
michael@0 1059 if (compileInlineInterval() == FALSE) {
michael@0 1060 compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
michael@0 1061 }
michael@0 1062 break;
michael@0 1063
michael@0 1064 case doPossessiveInterval:
michael@0 1065 // Finished scanning a Possessive {lower,upper}+ interval. Generate the code for it.
michael@0 1066 {
michael@0 1067 // Remember the loc for the top of the block being looped over.
michael@0 1068 // (Can not reserve a slot in the compiled pattern at this time, because
michael@0 1069 // compileInterval needs to reserve also, and blockTopLoc can only reserve
michael@0 1070 // once per block.)
michael@0 1071 int32_t topLoc = blockTopLoc(FALSE);
michael@0 1072
michael@0 1073 // Produce normal looping code.
michael@0 1074 compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
michael@0 1075
michael@0 1076 // Surround the just-emitted normal looping code with a STO_SP ... LD_SP
michael@0 1077 // just as if the loop was inclosed in atomic parentheses.
michael@0 1078
michael@0 1079 // First the STO_SP before the start of the loop
michael@0 1080 insertOp(topLoc);
michael@0 1081 int32_t varLoc = fRXPat->fDataSize; // Reserve a data location for saving the
michael@0 1082 fRXPat->fDataSize += 1; // state stack ptr.
michael@0 1083 int32_t op = URX_BUILD(URX_STO_SP, varLoc);
michael@0 1084 fRXPat->fCompiledPat->setElementAt(op, topLoc);
michael@0 1085
michael@0 1086 int32_t loopOp = (int32_t)fRXPat->fCompiledPat->popi();
michael@0 1087 U_ASSERT(URX_TYPE(loopOp) == URX_CTR_LOOP && URX_VAL(loopOp) == topLoc);
michael@0 1088 loopOp++; // point LoopOp after the just-inserted STO_SP
michael@0 1089 fRXPat->fCompiledPat->push(loopOp, *fStatus);
michael@0 1090
michael@0 1091 // Then the LD_SP after the end of the loop
michael@0 1092 op = URX_BUILD(URX_LD_SP, varLoc);
michael@0 1093 fRXPat->fCompiledPat->addElement(op, *fStatus);
michael@0 1094 }
michael@0 1095
michael@0 1096 break;
michael@0 1097
michael@0 1098 case doNGInterval:
michael@0 1099 // Finished scanning a non-greedy {lower,upper}? interval. Generate the code for it.
michael@0 1100 compileInterval(URX_CTR_INIT_NG, URX_CTR_LOOP_NG);
michael@0 1101 break;
michael@0 1102
michael@0 1103 case doIntervalError:
michael@0 1104 error(U_REGEX_BAD_INTERVAL);
michael@0 1105 break;
michael@0 1106
michael@0 1107 case doLiteralChar:
michael@0 1108 // We've just scanned a "normal" character from the pattern,
michael@0 1109 literalChar(fC.fChar);
michael@0 1110 break;
michael@0 1111
michael@0 1112
michael@0 1113 case doEscapedLiteralChar:
michael@0 1114 // We've just scanned an backslashed escaped character with no
michael@0 1115 // special meaning. It represents itself.
michael@0 1116 if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 &&
michael@0 1117 ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) || // in [A-Z]
michael@0 1118 (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) { // in [a-z]
michael@0 1119 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
michael@0 1120 }
michael@0 1121 literalChar(fC.fChar);
michael@0 1122 break;
michael@0 1123
michael@0 1124
michael@0 1125 case doDotAny:
michael@0 1126 // scanned a ".", match any single character.
michael@0 1127 {
michael@0 1128 fixLiterals(FALSE);
michael@0 1129 int32_t op;
michael@0 1130 if (fModeFlags & UREGEX_DOTALL) {
michael@0 1131 op = URX_BUILD(URX_DOTANY_ALL, 0);
michael@0 1132 } else if (fModeFlags & UREGEX_UNIX_LINES) {
michael@0 1133 op = URX_BUILD(URX_DOTANY_UNIX, 0);
michael@0 1134 } else {
michael@0 1135 op = URX_BUILD(URX_DOTANY, 0);
michael@0 1136 }
michael@0 1137 fRXPat->fCompiledPat->addElement(op, *fStatus);
michael@0 1138 }
michael@0 1139 break;
michael@0 1140
michael@0 1141 case doCaret:
michael@0 1142 {
michael@0 1143 fixLiterals(FALSE);
michael@0 1144 int32_t op = 0;
michael@0 1145 if ( (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
michael@0 1146 op = URX_CARET;
michael@0 1147 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
michael@0 1148 op = URX_CARET_M;
michael@0 1149 } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
michael@0 1150 op = URX_CARET; // Only testing true start of input.
michael@0 1151 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
michael@0 1152 op = URX_CARET_M_UNIX;
michael@0 1153 }
michael@0 1154 fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus);
michael@0 1155 }
michael@0 1156 break;
michael@0 1157
michael@0 1158 case doDollar:
michael@0 1159 {
michael@0 1160 fixLiterals(FALSE);
michael@0 1161 int32_t op = 0;
michael@0 1162 if ( (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
michael@0 1163 op = URX_DOLLAR;
michael@0 1164 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
michael@0 1165 op = URX_DOLLAR_M;
michael@0 1166 } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
michael@0 1167 op = URX_DOLLAR_D;
michael@0 1168 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
michael@0 1169 op = URX_DOLLAR_MD;
michael@0 1170 }
michael@0 1171 fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus);
michael@0 1172 }
michael@0 1173 break;
michael@0 1174
michael@0 1175 case doBackslashA:
michael@0 1176 fixLiterals(FALSE);
michael@0 1177 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_CARET, 0), *fStatus);
michael@0 1178 break;
michael@0 1179
michael@0 1180 case doBackslashB:
michael@0 1181 {
michael@0 1182 #if UCONFIG_NO_BREAK_ITERATION==1
michael@0 1183 if (fModeFlags & UREGEX_UWORD) {
michael@0 1184 error(U_UNSUPPORTED_ERROR);
michael@0 1185 }
michael@0 1186 #endif
michael@0 1187 fixLiterals(FALSE);
michael@0 1188 int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
michael@0 1189 fRXPat->fCompiledPat->addElement(URX_BUILD(op, 1), *fStatus);
michael@0 1190 }
michael@0 1191 break;
michael@0 1192
michael@0 1193 case doBackslashb:
michael@0 1194 {
michael@0 1195 #if UCONFIG_NO_BREAK_ITERATION==1
michael@0 1196 if (fModeFlags & UREGEX_UWORD) {
michael@0 1197 error(U_UNSUPPORTED_ERROR);
michael@0 1198 }
michael@0 1199 #endif
michael@0 1200 fixLiterals(FALSE);
michael@0 1201 int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
michael@0 1202 fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus);
michael@0 1203 }
michael@0 1204 break;
michael@0 1205
michael@0 1206 case doBackslashD:
michael@0 1207 fixLiterals(FALSE);
michael@0 1208 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_D, 1), *fStatus);
michael@0 1209 break;
michael@0 1210
michael@0 1211 case doBackslashd:
michael@0 1212 fixLiterals(FALSE);
michael@0 1213 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_D, 0), *fStatus);
michael@0 1214 break;
michael@0 1215
michael@0 1216 case doBackslashG:
michael@0 1217 fixLiterals(FALSE);
michael@0 1218 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_G, 0), *fStatus);
michael@0 1219 break;
michael@0 1220
michael@0 1221 case doBackslashS:
michael@0 1222 fixLiterals(FALSE);
michael@0 1223 fRXPat->fCompiledPat->addElement(
michael@0 1224 URX_BUILD(URX_STAT_SETREF_N, URX_ISSPACE_SET), *fStatus);
michael@0 1225 break;
michael@0 1226
michael@0 1227 case doBackslashs:
michael@0 1228 fixLiterals(FALSE);
michael@0 1229 fRXPat->fCompiledPat->addElement(
michael@0 1230 URX_BUILD(URX_STATIC_SETREF, URX_ISSPACE_SET), *fStatus);
michael@0 1231 break;
michael@0 1232
michael@0 1233 case doBackslashW:
michael@0 1234 fixLiterals(FALSE);
michael@0 1235 fRXPat->fCompiledPat->addElement(
michael@0 1236 URX_BUILD(URX_STAT_SETREF_N, URX_ISWORD_SET), *fStatus);
michael@0 1237 break;
michael@0 1238
michael@0 1239 case doBackslashw:
michael@0 1240 fixLiterals(FALSE);
michael@0 1241 fRXPat->fCompiledPat->addElement(
michael@0 1242 URX_BUILD(URX_STATIC_SETREF, URX_ISWORD_SET), *fStatus);
michael@0 1243 break;
michael@0 1244
michael@0 1245 case doBackslashX:
michael@0 1246 fixLiterals(FALSE);
michael@0 1247 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_X, 0), *fStatus);
michael@0 1248 break;
michael@0 1249
michael@0 1250
michael@0 1251 case doBackslashZ:
michael@0 1252 fixLiterals(FALSE);
michael@0 1253 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_DOLLAR, 0), *fStatus);
michael@0 1254 break;
michael@0 1255
michael@0 1256 case doBackslashz:
michael@0 1257 fixLiterals(FALSE);
michael@0 1258 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_Z, 0), *fStatus);
michael@0 1259 break;
michael@0 1260
michael@0 1261 case doEscapeError:
michael@0 1262 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
michael@0 1263 break;
michael@0 1264
michael@0 1265 case doExit:
michael@0 1266 fixLiterals(FALSE);
michael@0 1267 returnVal = FALSE;
michael@0 1268 break;
michael@0 1269
michael@0 1270 case doProperty:
michael@0 1271 {
michael@0 1272 fixLiterals(FALSE);
michael@0 1273 UnicodeSet *theSet = scanProp();
michael@0 1274 compileSet(theSet);
michael@0 1275 }
michael@0 1276 break;
michael@0 1277
michael@0 1278 case doNamedChar:
michael@0 1279 {
michael@0 1280 UChar32 c = scanNamedChar();
michael@0 1281 literalChar(c);
michael@0 1282 }
michael@0 1283 break;
michael@0 1284
michael@0 1285
michael@0 1286 case doBackRef:
michael@0 1287 // BackReference. Somewhat unusual in that the front-end can not completely parse
michael@0 1288 // the regular expression, because the number of digits to be consumed
michael@0 1289 // depends on the number of capture groups that have been defined. So
michael@0 1290 // we have to do it here instead.
michael@0 1291 {
michael@0 1292 int32_t numCaptureGroups = fRXPat->fGroupMap->size();
michael@0 1293 int32_t groupNum = 0;
michael@0 1294 UChar32 c = fC.fChar;
michael@0 1295
michael@0 1296 for (;;) {
michael@0 1297 // Loop once per digit, for max allowed number of digits in a back reference.
michael@0 1298 int32_t digit = u_charDigitValue(c);
michael@0 1299 groupNum = groupNum * 10 + digit;
michael@0 1300 if (groupNum >= numCaptureGroups) {
michael@0 1301 break;
michael@0 1302 }
michael@0 1303 c = peekCharLL();
michael@0 1304 if (RegexStaticSets::gStaticSets->fRuleDigitsAlias->contains(c) == FALSE) {
michael@0 1305 break;
michael@0 1306 }
michael@0 1307 nextCharLL();
michael@0 1308 }
michael@0 1309
michael@0 1310 // Scan of the back reference in the source regexp is complete. Now generate
michael@0 1311 // the compiled code for it.
michael@0 1312 // Because capture groups can be forward-referenced by back-references,
michael@0 1313 // we fill the operand with the capture group number. At the end
michael@0 1314 // of compilation, it will be changed to the variable's location.
michael@0 1315 U_ASSERT(groupNum > 0); // Shouldn't happen. '\0' begins an octal escape sequence,
michael@0 1316 // and shouldn't enter this code path at all.
michael@0 1317 fixLiterals(FALSE);
michael@0 1318 int32_t op;
michael@0 1319 if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
michael@0 1320 op = URX_BUILD(URX_BACKREF_I, groupNum);
michael@0 1321 } else {
michael@0 1322 op = URX_BUILD(URX_BACKREF, groupNum);
michael@0 1323 }
michael@0 1324 fRXPat->fCompiledPat->addElement(op, *fStatus);
michael@0 1325 }
michael@0 1326 break;
michael@0 1327
michael@0 1328
michael@0 1329 case doPossessivePlus:
michael@0 1330 // Possessive ++ quantifier.
michael@0 1331 // Compiles to
michael@0 1332 // 1. STO_SP
michael@0 1333 // 2. body of stuff being iterated over
michael@0 1334 // 3. STATE_SAVE 5
michael@0 1335 // 4. JMP 2
michael@0 1336 // 5. LD_SP
michael@0 1337 // 6. ...
michael@0 1338 //
michael@0 1339 // Note: TODO: This is pretty inefficient. A mass of saved state is built up
michael@0 1340 // then unconditionally discarded. Perhaps introduce a new opcode. Ticket 6056
michael@0 1341 //
michael@0 1342 {
michael@0 1343 // Emit the STO_SP
michael@0 1344 int32_t topLoc = blockTopLoc(TRUE);
michael@0 1345 int32_t stoLoc = fRXPat->fDataSize;
michael@0 1346 fRXPat->fDataSize++; // Reserve the data location for storing save stack ptr.
michael@0 1347 int32_t op = URX_BUILD(URX_STO_SP, stoLoc);
michael@0 1348 fRXPat->fCompiledPat->setElementAt(op, topLoc);
michael@0 1349
michael@0 1350 // Emit the STATE_SAVE
michael@0 1351 op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+2);
michael@0 1352 fRXPat->fCompiledPat->addElement(op, *fStatus);
michael@0 1353
michael@0 1354 // Emit the JMP
michael@0 1355 op = URX_BUILD(URX_JMP, topLoc+1);
michael@0 1356 fRXPat->fCompiledPat->addElement(op, *fStatus);
michael@0 1357
michael@0 1358 // Emit the LD_SP
michael@0 1359 op = URX_BUILD(URX_LD_SP, stoLoc);
michael@0 1360 fRXPat->fCompiledPat->addElement(op, *fStatus);
michael@0 1361 }
michael@0 1362 break;
michael@0 1363
michael@0 1364 case doPossessiveStar:
michael@0 1365 // Possessive *+ quantifier.
michael@0 1366 // Compiles to
michael@0 1367 // 1. STO_SP loc
michael@0 1368 // 2. STATE_SAVE 5
michael@0 1369 // 3. body of stuff being iterated over
michael@0 1370 // 4. JMP 2
michael@0 1371 // 5. LD_SP loc
michael@0 1372 // 6 ...
michael@0 1373 // TODO: do something to cut back the state stack each time through the loop.
michael@0 1374 {
michael@0 1375 // Reserve two slots at the top of the block.
michael@0 1376 int32_t topLoc = blockTopLoc(TRUE);
michael@0 1377 insertOp(topLoc);
michael@0 1378
michael@0 1379 // emit STO_SP loc
michael@0 1380 int32_t stoLoc = fRXPat->fDataSize;
michael@0 1381 fRXPat->fDataSize++; // Reserve the data location for storing save stack ptr.
michael@0 1382 int32_t op = URX_BUILD(URX_STO_SP, stoLoc);
michael@0 1383 fRXPat->fCompiledPat->setElementAt(op, topLoc);
michael@0 1384
michael@0 1385 // Emit the SAVE_STATE 5
michael@0 1386 int32_t L7 = fRXPat->fCompiledPat->size()+1;
michael@0 1387 op = URX_BUILD(URX_STATE_SAVE, L7);
michael@0 1388 fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
michael@0 1389
michael@0 1390 // Append the JMP operation.
michael@0 1391 op = URX_BUILD(URX_JMP, topLoc+1);
michael@0 1392 fRXPat->fCompiledPat->addElement(op, *fStatus);
michael@0 1393
michael@0 1394 // Emit the LD_SP loc
michael@0 1395 op = URX_BUILD(URX_LD_SP, stoLoc);
michael@0 1396 fRXPat->fCompiledPat->addElement(op, *fStatus);
michael@0 1397 }
michael@0 1398 break;
michael@0 1399
michael@0 1400 case doPossessiveOpt:
michael@0 1401 // Possessive ?+ quantifier.
michael@0 1402 // Compiles to
michael@0 1403 // 1. STO_SP loc
michael@0 1404 // 2. SAVE_STATE 5
michael@0 1405 // 3. body of optional block
michael@0 1406 // 4. LD_SP loc
michael@0 1407 // 5. ...
michael@0 1408 //
michael@0 1409 {
michael@0 1410 // Reserve two slots at the top of the block.
michael@0 1411 int32_t topLoc = blockTopLoc(TRUE);
michael@0 1412 insertOp(topLoc);
michael@0 1413
michael@0 1414 // Emit the STO_SP
michael@0 1415 int32_t stoLoc = fRXPat->fDataSize;
michael@0 1416 fRXPat->fDataSize++; // Reserve the data location for storing save stack ptr.
michael@0 1417 int32_t op = URX_BUILD(URX_STO_SP, stoLoc);
michael@0 1418 fRXPat->fCompiledPat->setElementAt(op, topLoc);
michael@0 1419
michael@0 1420 // Emit the SAVE_STATE
michael@0 1421 int32_t continueLoc = fRXPat->fCompiledPat->size()+1;
michael@0 1422 op = URX_BUILD(URX_STATE_SAVE, continueLoc);
michael@0 1423 fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
michael@0 1424
michael@0 1425 // Emit the LD_SP
michael@0 1426 op = URX_BUILD(URX_LD_SP, stoLoc);
michael@0 1427 fRXPat->fCompiledPat->addElement(op, *fStatus);
michael@0 1428 }
michael@0 1429 break;
michael@0 1430
michael@0 1431
michael@0 1432 case doBeginMatchMode:
michael@0 1433 fNewModeFlags = fModeFlags;
michael@0 1434 fSetModeFlag = TRUE;
michael@0 1435 break;
michael@0 1436
michael@0 1437 case doMatchMode: // (?i) and similar
michael@0 1438 {
michael@0 1439 int32_t bit = 0;
michael@0 1440 switch (fC.fChar) {
michael@0 1441 case 0x69: /* 'i' */ bit = UREGEX_CASE_INSENSITIVE; break;
michael@0 1442 case 0x64: /* 'd' */ bit = UREGEX_UNIX_LINES; break;
michael@0 1443 case 0x6d: /* 'm' */ bit = UREGEX_MULTILINE; break;
michael@0 1444 case 0x73: /* 's' */ bit = UREGEX_DOTALL; break;
michael@0 1445 case 0x75: /* 'u' */ bit = 0; /* Unicode casing */ break;
michael@0 1446 case 0x77: /* 'w' */ bit = UREGEX_UWORD; break;
michael@0 1447 case 0x78: /* 'x' */ bit = UREGEX_COMMENTS; break;
michael@0 1448 case 0x2d: /* '-' */ fSetModeFlag = FALSE; break;
michael@0 1449 default:
michael@0 1450 U_ASSERT(FALSE); // Should never happen. Other chars are filtered out
michael@0 1451 // by the scanner.
michael@0 1452 }
michael@0 1453 if (fSetModeFlag) {
michael@0 1454 fNewModeFlags |= bit;
michael@0 1455 } else {
michael@0 1456 fNewModeFlags &= ~bit;
michael@0 1457 }
michael@0 1458 }
michael@0 1459 break;
michael@0 1460
michael@0 1461 case doSetMatchMode:
michael@0 1462 // Emit code to match any pending literals, using the not-yet changed match mode.
michael@0 1463 fixLiterals();
michael@0 1464
michael@0 1465 // We've got a (?i) or similar. The match mode is being changed, but
michael@0 1466 // the change is not scoped to a parenthesized block.
michael@0 1467 U_ASSERT(fNewModeFlags < 0);
michael@0 1468 fModeFlags = fNewModeFlags;
michael@0 1469
michael@0 1470 break;
michael@0 1471
michael@0 1472
michael@0 1473 case doMatchModeParen:
michael@0 1474 // We've got a (?i: or similar. Begin a parenthesized block, save old
michael@0 1475 // mode flags so they can be restored at the close of the block.
michael@0 1476 //
michael@0 1477 // Compile to a
michael@0 1478 // - NOP, which later may be replaced by a save-state if the
michael@0 1479 // parenthesized group gets a * quantifier, followed by
michael@0 1480 // - NOP, which may later be replaced by a save-state if there
michael@0 1481 // is an '|' alternation within the parens.
michael@0 1482 {
michael@0 1483 fixLiterals(FALSE);
michael@0 1484 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
michael@0 1485 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
michael@0 1486
michael@0 1487 // On the Parentheses stack, start a new frame and add the postions
michael@0 1488 // of the two NOPs (a normal non-capturing () frame, except for the
michael@0 1489 // saving of the orignal mode flags.)
michael@0 1490 fParenStack.push(fModeFlags, *fStatus);
michael@0 1491 fParenStack.push(flags, *fStatus); // Frame Marker
michael@0 1492 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP
michael@0 1493 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP
michael@0 1494
michael@0 1495 // Set the current mode flags to the new values.
michael@0 1496 U_ASSERT(fNewModeFlags < 0);
michael@0 1497 fModeFlags = fNewModeFlags;
michael@0 1498 }
michael@0 1499 break;
michael@0 1500
michael@0 1501 case doBadModeFlag:
michael@0 1502 error(U_REGEX_INVALID_FLAG);
michael@0 1503 break;
michael@0 1504
michael@0 1505 case doSuppressComments:
michael@0 1506 // We have just scanned a '(?'. We now need to prevent the character scanner from
michael@0 1507 // treating a '#' as a to-the-end-of-line comment.
michael@0 1508 // (This Perl compatibility just gets uglier and uglier to do...)
michael@0 1509 fEOLComments = FALSE;
michael@0 1510 break;
michael@0 1511
michael@0 1512
michael@0 1513 case doSetAddAmp:
michael@0 1514 {
michael@0 1515 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
michael@0 1516 set->add(chAmp);
michael@0 1517 }
michael@0 1518 break;
michael@0 1519
michael@0 1520 case doSetAddDash:
michael@0 1521 {
michael@0 1522 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
michael@0 1523 set->add(chDash);
michael@0 1524 }
michael@0 1525 break;
michael@0 1526
michael@0 1527 case doSetBackslash_s:
michael@0 1528 {
michael@0 1529 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
michael@0 1530 set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]);
michael@0 1531 break;
michael@0 1532 }
michael@0 1533
michael@0 1534 case doSetBackslash_S:
michael@0 1535 {
michael@0 1536 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
michael@0 1537 UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]);
michael@0 1538 SSet.complement();
michael@0 1539 set->addAll(SSet);
michael@0 1540 break;
michael@0 1541 }
michael@0 1542
michael@0 1543 case doSetBackslash_d:
michael@0 1544 {
michael@0 1545 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
michael@0 1546 // TODO - make a static set, ticket 6058.
michael@0 1547 addCategory(set, U_GC_ND_MASK, *fStatus);
michael@0 1548 break;
michael@0 1549 }
michael@0 1550
michael@0 1551 case doSetBackslash_D:
michael@0 1552 {
michael@0 1553 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
michael@0 1554 UnicodeSet digits;
michael@0 1555 // TODO - make a static set, ticket 6058.
michael@0 1556 digits.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
michael@0 1557 digits.complement();
michael@0 1558 set->addAll(digits);
michael@0 1559 break;
michael@0 1560 }
michael@0 1561
michael@0 1562 case doSetBackslash_w:
michael@0 1563 {
michael@0 1564 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
michael@0 1565 set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]);
michael@0 1566 break;
michael@0 1567 }
michael@0 1568
michael@0 1569 case doSetBackslash_W:
michael@0 1570 {
michael@0 1571 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
michael@0 1572 UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]);
michael@0 1573 SSet.complement();
michael@0 1574 set->addAll(SSet);
michael@0 1575 break;
michael@0 1576 }
michael@0 1577
michael@0 1578 case doSetBegin:
michael@0 1579 fixLiterals(FALSE);
michael@0 1580 fSetStack.push(new UnicodeSet(), *fStatus);
michael@0 1581 fSetOpStack.push(setStart, *fStatus);
michael@0 1582 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
michael@0 1583 fSetOpStack.push(setCaseClose, *fStatus);
michael@0 1584 }
michael@0 1585 break;
michael@0 1586
michael@0 1587 case doSetBeginDifference1:
michael@0 1588 // We have scanned something like [[abc]-[
michael@0 1589 // Set up a new UnicodeSet for the set beginning with the just-scanned '['
michael@0 1590 // Push a Difference operator, which will cause the new set to be subtracted from what
michael@0 1591 // went before once it is created.
michael@0 1592 setPushOp(setDifference1);
michael@0 1593 fSetOpStack.push(setStart, *fStatus);
michael@0 1594 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
michael@0 1595 fSetOpStack.push(setCaseClose, *fStatus);
michael@0 1596 }
michael@0 1597 break;
michael@0 1598
michael@0 1599 case doSetBeginIntersection1:
michael@0 1600 // We have scanned something like [[abc]&[
michael@0 1601 // Need both the '&' operator and the open '[' operator.
michael@0 1602 setPushOp(setIntersection1);
michael@0 1603 fSetOpStack.push(setStart, *fStatus);
michael@0 1604 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
michael@0 1605 fSetOpStack.push(setCaseClose, *fStatus);
michael@0 1606 }
michael@0 1607 break;
michael@0 1608
michael@0 1609 case doSetBeginUnion:
michael@0 1610 // We have scanned something like [[abc][
michael@0 1611 // Need to handle the union operation explicitly [[abc] | [
michael@0 1612 setPushOp(setUnion);
michael@0 1613 fSetOpStack.push(setStart, *fStatus);
michael@0 1614 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
michael@0 1615 fSetOpStack.push(setCaseClose, *fStatus);
michael@0 1616 }
michael@0 1617 break;
michael@0 1618
michael@0 1619 case doSetDifference2:
michael@0 1620 // We have scanned something like [abc--
michael@0 1621 // Consider this to unambiguously be a set difference operator.
michael@0 1622 setPushOp(setDifference2);
michael@0 1623 break;
michael@0 1624
michael@0 1625 case doSetEnd:
michael@0 1626 // Have encountered the ']' that closes a set.
michael@0 1627 // Force the evaluation of any pending operations within this set,
michael@0 1628 // leave the completed set on the top of the set stack.
michael@0 1629 setEval(setEnd);
michael@0 1630 U_ASSERT(fSetOpStack.peeki()==setStart);
michael@0 1631 fSetOpStack.popi();
michael@0 1632 break;
michael@0 1633
michael@0 1634 case doSetFinish:
michael@0 1635 {
michael@0 1636 // Finished a complete set expression, including all nested sets.
michael@0 1637 // The close bracket has already triggered clearing out pending set operators,
michael@0 1638 // the operator stack should be empty and the operand stack should have just
michael@0 1639 // one entry, the result set.
michael@0 1640 U_ASSERT(fSetOpStack.empty());
michael@0 1641 UnicodeSet *theSet = (UnicodeSet *)fSetStack.pop();
michael@0 1642 U_ASSERT(fSetStack.empty());
michael@0 1643 compileSet(theSet);
michael@0 1644 break;
michael@0 1645 }
michael@0 1646
michael@0 1647 case doSetIntersection2:
michael@0 1648 // Have scanned something like [abc&&
michael@0 1649 setPushOp(setIntersection2);
michael@0 1650 break;
michael@0 1651
michael@0 1652 case doSetLiteral:
michael@0 1653 // Union the just-scanned literal character into the set being built.
michael@0 1654 // This operation is the highest precedence set operation, so we can always do
michael@0 1655 // it immediately, without waiting to see what follows. It is necessary to perform
michael@0 1656 // any pending '-' or '&' operation first, because these have the same precedence
michael@0 1657 // as union-ing in a literal'
michael@0 1658 {
michael@0 1659 setEval(setUnion);
michael@0 1660 UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
michael@0 1661 s->add(fC.fChar);
michael@0 1662 fLastSetLiteral = fC.fChar;
michael@0 1663 break;
michael@0 1664 }
michael@0 1665
michael@0 1666 case doSetLiteralEscaped:
michael@0 1667 // A back-slash escaped literal character was encountered.
michael@0 1668 // Processing is the same as with setLiteral, above, with the addition of
michael@0 1669 // the optional check for errors on escaped ASCII letters.
michael@0 1670 {
michael@0 1671 if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 &&
michael@0 1672 ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) || // in [A-Z]
michael@0 1673 (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) { // in [a-z]
michael@0 1674 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
michael@0 1675 }
michael@0 1676 setEval(setUnion);
michael@0 1677 UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
michael@0 1678 s->add(fC.fChar);
michael@0 1679 fLastSetLiteral = fC.fChar;
michael@0 1680 break;
michael@0 1681 }
michael@0 1682
michael@0 1683 case doSetNamedChar:
michael@0 1684 // Scanning a \N{UNICODE CHARACTER NAME}
michael@0 1685 // Aside from the source of the character, the processing is identical to doSetLiteral,
michael@0 1686 // above.
michael@0 1687 {
michael@0 1688 UChar32 c = scanNamedChar();
michael@0 1689 setEval(setUnion);
michael@0 1690 UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
michael@0 1691 s->add(c);
michael@0 1692 fLastSetLiteral = c;
michael@0 1693 break;
michael@0 1694 }
michael@0 1695
michael@0 1696 case doSetNamedRange:
michael@0 1697 // We have scanned literal-\N{CHAR NAME}. Add the range to the set.
michael@0 1698 // The left character is already in the set, and is saved in fLastSetLiteral.
michael@0 1699 // The right side needs to be picked up, the scan is at the 'N'.
michael@0 1700 // Lower Limit > Upper limit being an error matches both Java
michael@0 1701 // and ICU UnicodeSet behavior.
michael@0 1702 {
michael@0 1703 UChar32 c = scanNamedChar();
michael@0 1704 if (U_SUCCESS(*fStatus) && fLastSetLiteral > c) {
michael@0 1705 error(U_REGEX_INVALID_RANGE);
michael@0 1706 }
michael@0 1707 UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
michael@0 1708 s->add(fLastSetLiteral, c);
michael@0 1709 fLastSetLiteral = c;
michael@0 1710 break;
michael@0 1711 }
michael@0 1712
michael@0 1713
michael@0 1714 case doSetNegate:
michael@0 1715 // Scanned a '^' at the start of a set.
michael@0 1716 // Push the negation operator onto the set op stack.
michael@0 1717 // A twist for case-insensitive matching:
michael@0 1718 // the case closure operation must happen _before_ negation.
michael@0 1719 // But the case closure operation will already be on the stack if it's required.
michael@0 1720 // This requires checking for case closure, and swapping the stack order
michael@0 1721 // if it is present.
michael@0 1722 {
michael@0 1723 int32_t tosOp = fSetOpStack.peeki();
michael@0 1724 if (tosOp == setCaseClose) {
michael@0 1725 fSetOpStack.popi();
michael@0 1726 fSetOpStack.push(setNegation, *fStatus);
michael@0 1727 fSetOpStack.push(setCaseClose, *fStatus);
michael@0 1728 } else {
michael@0 1729 fSetOpStack.push(setNegation, *fStatus);
michael@0 1730 }
michael@0 1731 }
michael@0 1732 break;
michael@0 1733
michael@0 1734 case doSetNoCloseError:
michael@0 1735 error(U_REGEX_MISSING_CLOSE_BRACKET);
michael@0 1736 break;
michael@0 1737
michael@0 1738 case doSetOpError:
michael@0 1739 error(U_REGEX_RULE_SYNTAX); // -- or && at the end of a set. Illegal.
michael@0 1740 break;
michael@0 1741
michael@0 1742 case doSetPosixProp:
michael@0 1743 {
michael@0 1744 UnicodeSet *s = scanPosixProp();
michael@0 1745 if (s != NULL) {
michael@0 1746 UnicodeSet *tos = (UnicodeSet *)fSetStack.peek();
michael@0 1747 tos->addAll(*s);
michael@0 1748 delete s;
michael@0 1749 } // else error. scanProp() reported the error status already.
michael@0 1750 }
michael@0 1751 break;
michael@0 1752
michael@0 1753 case doSetProp:
michael@0 1754 // Scanned a \p \P within [brackets].
michael@0 1755 {
michael@0 1756 UnicodeSet *s = scanProp();
michael@0 1757 if (s != NULL) {
michael@0 1758 UnicodeSet *tos = (UnicodeSet *)fSetStack.peek();
michael@0 1759 tos->addAll(*s);
michael@0 1760 delete s;
michael@0 1761 } // else error. scanProp() reported the error status already.
michael@0 1762 }
michael@0 1763 break;
michael@0 1764
michael@0 1765
michael@0 1766 case doSetRange:
michael@0 1767 // We have scanned literal-literal. Add the range to the set.
michael@0 1768 // The left character is already in the set, and is saved in fLastSetLiteral.
michael@0 1769 // The right side is the current character.
michael@0 1770 // Lower Limit > Upper limit being an error matches both Java
michael@0 1771 // and ICU UnicodeSet behavior.
michael@0 1772 {
michael@0 1773 if (fLastSetLiteral > fC.fChar) {
michael@0 1774 error(U_REGEX_INVALID_RANGE);
michael@0 1775 }
michael@0 1776 UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
michael@0 1777 s->add(fLastSetLiteral, fC.fChar);
michael@0 1778 break;
michael@0 1779 }
michael@0 1780
michael@0 1781 default:
michael@0 1782 U_ASSERT(FALSE);
michael@0 1783 error(U_REGEX_INTERNAL_ERROR);
michael@0 1784 break;
michael@0 1785 }
michael@0 1786
michael@0 1787 if (U_FAILURE(*fStatus)) {
michael@0 1788 returnVal = FALSE;
michael@0 1789 }
michael@0 1790
michael@0 1791 return returnVal;
michael@0 1792 }
michael@0 1793
michael@0 1794
michael@0 1795
michael@0 1796 //------------------------------------------------------------------------------
michael@0 1797 //
michael@0 1798 // literalChar We've encountered a literal character from the pattern,
michael@0 1799 // or an escape sequence that reduces to a character.
michael@0 1800 // Add it to the string containing all literal chars/strings from
michael@0 1801 // the pattern.
michael@0 1802 //
michael@0 1803 //------------------------------------------------------------------------------
michael@0 1804 void RegexCompile::literalChar(UChar32 c) {
michael@0 1805 fLiteralChars.append(c);
michael@0 1806 }
michael@0 1807
michael@0 1808
michael@0 1809 //------------------------------------------------------------------------------
michael@0 1810 //
michael@0 1811 // fixLiterals When compiling something that can follow a literal
michael@0 1812 // string in a pattern, emit the code to match the
michael@0 1813 // accumulated literal string.
michael@0 1814 //
michael@0 1815 // Optionally, split the last char of the string off into
michael@0 1816 // a single "ONE_CHAR" operation, so that quantifiers can
michael@0 1817 // apply to that char alone. Example: abc*
michael@0 1818 // The * must apply to the 'c' only.
michael@0 1819 //
michael@0 1820 //------------------------------------------------------------------------------
michael@0 1821 void RegexCompile::fixLiterals(UBool split) {
michael@0 1822 int32_t op = 0; // An op from/for the compiled pattern.
michael@0 1823
michael@0 1824 // If no literal characters have been scanned but not yet had code generated
michael@0 1825 // for them, nothing needs to be done.
michael@0 1826 if (fLiteralChars.length() == 0) {
michael@0 1827 return;
michael@0 1828 }
michael@0 1829
michael@0 1830 int32_t indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1);
michael@0 1831 UChar32 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
michael@0 1832
michael@0 1833 // Split: We need to ensure that the last item in the compiled pattern
michael@0 1834 // refers only to the last literal scanned in the pattern, so that
michael@0 1835 // quantifiers (*, +, etc.) affect only it, and not a longer string.
michael@0 1836 // Split before case folding for case insensitive matches.
michael@0 1837
michael@0 1838 if (split) {
michael@0 1839 fLiteralChars.truncate(indexOfLastCodePoint);
michael@0 1840 fixLiterals(FALSE); // Recursive call, emit code to match the first part of the string.
michael@0 1841 // Note that the truncated literal string may be empty, in which case
michael@0 1842 // nothing will be emitted.
michael@0 1843
michael@0 1844 literalChar(lastCodePoint); // Re-add the last code point as if it were a new literal.
michael@0 1845 fixLiterals(FALSE); // Second recursive call, code for the final code point.
michael@0 1846 return;
michael@0 1847 }
michael@0 1848
michael@0 1849 // If we are doing case-insensitive matching, case fold the string. This may expand
michael@0 1850 // the string, e.g. the German sharp-s turns into "ss"
michael@0 1851 if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
michael@0 1852 fLiteralChars.foldCase();
michael@0 1853 indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1);
michael@0 1854 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
michael@0 1855 }
michael@0 1856
michael@0 1857 if (indexOfLastCodePoint == 0) {
michael@0 1858 // Single character, emit a URX_ONECHAR op to match it.
michael@0 1859 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) &&
michael@0 1860 u_hasBinaryProperty(lastCodePoint, UCHAR_CASE_SENSITIVE)) {
michael@0 1861 op = URX_BUILD(URX_ONECHAR_I, lastCodePoint);
michael@0 1862 } else {
michael@0 1863 op = URX_BUILD(URX_ONECHAR, lastCodePoint);
michael@0 1864 }
michael@0 1865 fRXPat->fCompiledPat->addElement(op, *fStatus);
michael@0 1866 } else {
michael@0 1867 // Two or more chars, emit a URX_STRING to match them.
michael@0 1868 if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
michael@0 1869 op = URX_BUILD(URX_STRING_I, fRXPat->fLiteralText.length());
michael@0 1870 } else {
michael@0 1871 // TODO here: add optimization to split case sensitive strings of length two
michael@0 1872 // into two single char ops, for efficiency.
michael@0 1873 op = URX_BUILD(URX_STRING, fRXPat->fLiteralText.length());
michael@0 1874 }
michael@0 1875 fRXPat->fCompiledPat->addElement(op, *fStatus);
michael@0 1876 op = URX_BUILD(URX_STRING_LEN, fLiteralChars.length());
michael@0 1877 fRXPat->fCompiledPat->addElement(op, *fStatus);
michael@0 1878
michael@0 1879 // Add this string into the accumulated strings of the compiled pattern.
michael@0 1880 fRXPat->fLiteralText.append(fLiteralChars);
michael@0 1881 }
michael@0 1882
michael@0 1883 fLiteralChars.remove();
michael@0 1884 }
michael@0 1885
michael@0 1886
michael@0 1887
michael@0 1888
michael@0 1889
michael@0 1890
michael@0 1891 //------------------------------------------------------------------------------
michael@0 1892 //
michael@0 1893 // insertOp() Insert a slot for a new opcode into the already
michael@0 1894 // compiled pattern code.
michael@0 1895 //
michael@0 1896 // Fill the slot with a NOP. Our caller will replace it
michael@0 1897 // with what they really wanted.
michael@0 1898 //
michael@0 1899 //------------------------------------------------------------------------------
michael@0 1900 void RegexCompile::insertOp(int32_t where) {
michael@0 1901 UVector64 *code = fRXPat->fCompiledPat;
michael@0 1902 U_ASSERT(where>0 && where < code->size());
michael@0 1903
michael@0 1904 int32_t nop = URX_BUILD(URX_NOP, 0);
michael@0 1905 code->insertElementAt(nop, where, *fStatus);
michael@0 1906
michael@0 1907 // Walk through the pattern, looking for any ops with targets that
michael@0 1908 // were moved down by the insert. Fix them.
michael@0 1909 int32_t loc;
michael@0 1910 for (loc=0; loc<code->size(); loc++) {
michael@0 1911 int32_t op = (int32_t)code->elementAti(loc);
michael@0 1912 int32_t opType = URX_TYPE(op);
michael@0 1913 int32_t opValue = URX_VAL(op);
michael@0 1914 if ((opType == URX_JMP ||
michael@0 1915 opType == URX_JMPX ||
michael@0 1916 opType == URX_STATE_SAVE ||
michael@0 1917 opType == URX_CTR_LOOP ||
michael@0 1918 opType == URX_CTR_LOOP_NG ||
michael@0 1919 opType == URX_JMP_SAV ||
michael@0 1920 opType == URX_JMP_SAV_X ||
michael@0 1921 opType == URX_RELOC_OPRND) && opValue > where) {
michael@0 1922 // Target location for this opcode is after the insertion point and
michael@0 1923 // needs to be incremented to adjust for the insertion.
michael@0 1924 opValue++;
michael@0 1925 op = URX_BUILD(opType, opValue);
michael@0 1926 code->setElementAt(op, loc);
michael@0 1927 }
michael@0 1928 }
michael@0 1929
michael@0 1930 // Now fix up the parentheses stack. All positive values in it are locations in
michael@0 1931 // the compiled pattern. (Negative values are frame boundaries, and don't need fixing.)
michael@0 1932 for (loc=0; loc<fParenStack.size(); loc++) {
michael@0 1933 int32_t x = fParenStack.elementAti(loc);
michael@0 1934 U_ASSERT(x < code->size());
michael@0 1935 if (x>where) {
michael@0 1936 x++;
michael@0 1937 fParenStack.setElementAt(x, loc);
michael@0 1938 }
michael@0 1939 }
michael@0 1940
michael@0 1941 if (fMatchCloseParen > where) {
michael@0 1942 fMatchCloseParen++;
michael@0 1943 }
michael@0 1944 if (fMatchOpenParen > where) {
michael@0 1945 fMatchOpenParen++;
michael@0 1946 }
michael@0 1947 }
michael@0 1948
michael@0 1949
michael@0 1950
michael@0 1951 //------------------------------------------------------------------------------
michael@0 1952 //
michael@0 1953 // blockTopLoc() Find or create a location in the compiled pattern
michael@0 1954 // at the start of the operation or block that has
michael@0 1955 // just been compiled. Needed when a quantifier (* or
michael@0 1956 // whatever) appears, and we need to add an operation
michael@0 1957 // at the start of the thing being quantified.
michael@0 1958 //
michael@0 1959 // (Parenthesized Blocks) have a slot with a NOP that
michael@0 1960 // is reserved for this purpose. .* or similar don't
michael@0 1961 // and a slot needs to be added.
michael@0 1962 //
michael@0 1963 // parameter reserveLoc : TRUE - ensure that there is space to add an opcode
michael@0 1964 // at the returned location.
michael@0 1965 // FALSE - just return the address,
michael@0 1966 // do not reserve a location there.
michael@0 1967 //
michael@0 1968 //------------------------------------------------------------------------------
michael@0 1969 int32_t RegexCompile::blockTopLoc(UBool reserveLoc) {
michael@0 1970 int32_t theLoc;
michael@0 1971 fixLiterals(TRUE); // Emit code for any pending literals.
michael@0 1972 // If last item was a string, emit separate op for the its last char.
michael@0 1973 if (fRXPat->fCompiledPat->size() == fMatchCloseParen)
michael@0 1974 {
michael@0 1975 // The item just processed is a parenthesized block.
michael@0 1976 theLoc = fMatchOpenParen; // A slot is already reserved for us.
michael@0 1977 U_ASSERT(theLoc > 0);
michael@0 1978 U_ASSERT(URX_TYPE(((uint32_t)fRXPat->fCompiledPat->elementAti(theLoc))) == URX_NOP);
michael@0 1979 }
michael@0 1980 else {
michael@0 1981 // Item just compiled is a single thing, a ".", or a single char, a string or a set reference.
michael@0 1982 // No slot for STATE_SAVE was pre-reserved in the compiled code.
michael@0 1983 // We need to make space now.
michael@0 1984 theLoc = fRXPat->fCompiledPat->size()-1;
michael@0 1985 int32_t opAtTheLoc = (int32_t)fRXPat->fCompiledPat->elementAti(theLoc);
michael@0 1986 if (URX_TYPE(opAtTheLoc) == URX_STRING_LEN) {
michael@0 1987 // Strings take two opcode, we want the position of the first one.
michael@0 1988 // We can have a string at this point if a single character case-folded to two.
michael@0 1989 theLoc--;
michael@0 1990 }
michael@0 1991 if (reserveLoc) {
michael@0 1992 int32_t nop = URX_BUILD(URX_NOP, 0);
michael@0 1993 fRXPat->fCompiledPat->insertElementAt(nop, theLoc, *fStatus);
michael@0 1994 }
michael@0 1995 }
michael@0 1996 return theLoc;
michael@0 1997 }
michael@0 1998
michael@0 1999
michael@0 2000
michael@0 2001 //------------------------------------------------------------------------------
michael@0 2002 //
michael@0 2003 // handleCloseParen When compiling a close paren, we need to go back
michael@0 2004 // and fix up any JMP or SAVE operations within the
michael@0 2005 // parenthesized block that need to target the end
michael@0 2006 // of the block. The locations of these are kept on
michael@0 2007 // the paretheses stack.
michael@0 2008 //
michael@0 2009 // This function is called both when encountering a
michael@0 2010 // real ) and at the end of the pattern.
michael@0 2011 //
michael@0 2012 //------------------------------------------------------------------------------
michael@0 2013 void RegexCompile::handleCloseParen() {
michael@0 2014 int32_t patIdx;
michael@0 2015 int32_t patOp;
michael@0 2016 if (fParenStack.size() <= 0) {
michael@0 2017 error(U_REGEX_MISMATCHED_PAREN);
michael@0 2018 return;
michael@0 2019 }
michael@0 2020
michael@0 2021 // Emit code for any pending literals.
michael@0 2022 fixLiterals(FALSE);
michael@0 2023
michael@0 2024 // Fixup any operations within the just-closed parenthesized group
michael@0 2025 // that need to reference the end of the (block).
michael@0 2026 // (The first one popped from the stack is an unused slot for
michael@0 2027 // alternation (OR) state save, but applying the fixup to it does no harm.)
michael@0 2028 for (;;) {
michael@0 2029 patIdx = fParenStack.popi();
michael@0 2030 if (patIdx < 0) {
michael@0 2031 // value < 0 flags the start of the frame on the paren stack.
michael@0 2032 break;
michael@0 2033 }
michael@0 2034 U_ASSERT(patIdx>0 && patIdx <= fRXPat->fCompiledPat->size());
michael@0 2035 patOp = (int32_t)fRXPat->fCompiledPat->elementAti(patIdx);
michael@0 2036 U_ASSERT(URX_VAL(patOp) == 0); // Branch target for JMP should not be set.
michael@0 2037 patOp |= fRXPat->fCompiledPat->size(); // Set it now.
michael@0 2038 fRXPat->fCompiledPat->setElementAt(patOp, patIdx);
michael@0 2039 fMatchOpenParen = patIdx;
michael@0 2040 }
michael@0 2041
michael@0 2042 // At the close of any parenthesized block, restore the match mode flags to
michael@0 2043 // the value they had at the open paren. Saved value is
michael@0 2044 // at the top of the paren stack.
michael@0 2045 fModeFlags = fParenStack.popi();
michael@0 2046 U_ASSERT(fModeFlags < 0);
michael@0 2047
michael@0 2048 // DO any additional fixups, depending on the specific kind of
michael@0 2049 // parentesized grouping this is
michael@0 2050
michael@0 2051 switch (patIdx) {
michael@0 2052 case plain:
michael@0 2053 case flags:
michael@0 2054 // No additional fixups required.
michael@0 2055 // (Grouping-only parentheses)
michael@0 2056 break;
michael@0 2057 case capturing:
michael@0 2058 // Capturing Parentheses.
michael@0 2059 // Insert a End Capture op into the pattern.
michael@0 2060 // The frame offset of the variables for this cg is obtained from the
michael@0 2061 // start capture op and put it into the end-capture op.
michael@0 2062 {
michael@0 2063 int32_t captureOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
michael@0 2064 U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE);
michael@0 2065
michael@0 2066 int32_t frameVarLocation = URX_VAL(captureOp);
michael@0 2067 int32_t endCaptureOp = URX_BUILD(URX_END_CAPTURE, frameVarLocation);
michael@0 2068 fRXPat->fCompiledPat->addElement(endCaptureOp, *fStatus);
michael@0 2069 }
michael@0 2070 break;
michael@0 2071 case atomic:
michael@0 2072 // Atomic Parenthesis.
michael@0 2073 // Insert a LD_SP operation to restore the state stack to the position
michael@0 2074 // it was when the atomic parens were entered.
michael@0 2075 {
michael@0 2076 int32_t stoOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
michael@0 2077 U_ASSERT(URX_TYPE(stoOp) == URX_STO_SP);
michael@0 2078 int32_t stoLoc = URX_VAL(stoOp);
michael@0 2079 int32_t ldOp = URX_BUILD(URX_LD_SP, stoLoc);
michael@0 2080 fRXPat->fCompiledPat->addElement(ldOp, *fStatus);
michael@0 2081 }
michael@0 2082 break;
michael@0 2083
michael@0 2084 case lookAhead:
michael@0 2085 {
michael@0 2086 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5);
michael@0 2087 U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
michael@0 2088 int32_t dataLoc = URX_VAL(startOp);
michael@0 2089 int32_t op = URX_BUILD(URX_LA_END, dataLoc);
michael@0 2090 fRXPat->fCompiledPat->addElement(op, *fStatus);
michael@0 2091 }
michael@0 2092 break;
michael@0 2093
michael@0 2094 case negLookAhead:
michael@0 2095 {
michael@0 2096 // See comment at doOpenLookAheadNeg
michael@0 2097 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-1);
michael@0 2098 U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
michael@0 2099 int32_t dataLoc = URX_VAL(startOp);
michael@0 2100 int32_t op = URX_BUILD(URX_LA_END, dataLoc);
michael@0 2101 fRXPat->fCompiledPat->addElement(op, *fStatus);
michael@0 2102 op = URX_BUILD(URX_BACKTRACK, 0);
michael@0 2103 fRXPat->fCompiledPat->addElement(op, *fStatus);
michael@0 2104 op = URX_BUILD(URX_LA_END, dataLoc);
michael@0 2105 fRXPat->fCompiledPat->addElement(op, *fStatus);
michael@0 2106
michael@0 2107 // Patch the URX_SAVE near the top of the block.
michael@0 2108 // The destination of the SAVE is the final LA_END that was just added.
michael@0 2109 int32_t saveOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen);
michael@0 2110 U_ASSERT(URX_TYPE(saveOp) == URX_STATE_SAVE);
michael@0 2111 int32_t dest = fRXPat->fCompiledPat->size()-1;
michael@0 2112 saveOp = URX_BUILD(URX_STATE_SAVE, dest);
michael@0 2113 fRXPat->fCompiledPat->setElementAt(saveOp, fMatchOpenParen);
michael@0 2114 }
michael@0 2115 break;
michael@0 2116
michael@0 2117 case lookBehind:
michael@0 2118 {
michael@0 2119 // See comment at doOpenLookBehind.
michael@0 2120
michael@0 2121 // Append the URX_LB_END and URX_LA_END to the compiled pattern.
michael@0 2122 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-4);
michael@0 2123 U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
michael@0 2124 int32_t dataLoc = URX_VAL(startOp);
michael@0 2125 int32_t op = URX_BUILD(URX_LB_END, dataLoc);
michael@0 2126 fRXPat->fCompiledPat->addElement(op, *fStatus);
michael@0 2127 op = URX_BUILD(URX_LA_END, dataLoc);
michael@0 2128 fRXPat->fCompiledPat->addElement(op, *fStatus);
michael@0 2129
michael@0 2130 // Determine the min and max bounds for the length of the
michael@0 2131 // string that the pattern can match.
michael@0 2132 // An unbounded upper limit is an error.
michael@0 2133 int32_t patEnd = fRXPat->fCompiledPat->size() - 1;
michael@0 2134 int32_t minML = minMatchLength(fMatchOpenParen, patEnd);
michael@0 2135 int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd);
michael@0 2136 if (maxML == INT32_MAX) {
michael@0 2137 error(U_REGEX_LOOK_BEHIND_LIMIT);
michael@0 2138 break;
michael@0 2139 }
michael@0 2140 U_ASSERT(minML <= maxML);
michael@0 2141
michael@0 2142 // Insert the min and max match len bounds into the URX_LB_CONT op that
michael@0 2143 // appears at the top of the look-behind block, at location fMatchOpenParen+1
michael@0 2144 fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-2);
michael@0 2145 fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-1);
michael@0 2146
michael@0 2147 }
michael@0 2148 break;
michael@0 2149
michael@0 2150
michael@0 2151
michael@0 2152 case lookBehindN:
michael@0 2153 {
michael@0 2154 // See comment at doOpenLookBehindNeg.
michael@0 2155
michael@0 2156 // Append the URX_LBN_END to the compiled pattern.
michael@0 2157 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5);
michael@0 2158 U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
michael@0 2159 int32_t dataLoc = URX_VAL(startOp);
michael@0 2160 int32_t op = URX_BUILD(URX_LBN_END, dataLoc);
michael@0 2161 fRXPat->fCompiledPat->addElement(op, *fStatus);
michael@0 2162
michael@0 2163 // Determine the min and max bounds for the length of the
michael@0 2164 // string that the pattern can match.
michael@0 2165 // An unbounded upper limit is an error.
michael@0 2166 int32_t patEnd = fRXPat->fCompiledPat->size() - 1;
michael@0 2167 int32_t minML = minMatchLength(fMatchOpenParen, patEnd);
michael@0 2168 int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd);
michael@0 2169 if (maxML == INT32_MAX) {
michael@0 2170 error(U_REGEX_LOOK_BEHIND_LIMIT);
michael@0 2171 break;
michael@0 2172 }
michael@0 2173 U_ASSERT(minML <= maxML);
michael@0 2174
michael@0 2175 // Insert the min and max match len bounds into the URX_LB_CONT op that
michael@0 2176 // appears at the top of the look-behind block, at location fMatchOpenParen+1
michael@0 2177 fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-3);
michael@0 2178 fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-2);
michael@0 2179
michael@0 2180 // Insert the pattern location to continue at after a successful match
michael@0 2181 // as the last operand of the URX_LBN_CONT
michael@0 2182 op = URX_BUILD(URX_RELOC_OPRND, fRXPat->fCompiledPat->size());
michael@0 2183 fRXPat->fCompiledPat->setElementAt(op, fMatchOpenParen-1);
michael@0 2184 }
michael@0 2185 break;
michael@0 2186
michael@0 2187
michael@0 2188
michael@0 2189 default:
michael@0 2190 U_ASSERT(FALSE);
michael@0 2191 }
michael@0 2192
michael@0 2193 // remember the next location in the compiled pattern.
michael@0 2194 // The compilation of Quantifiers will look at this to see whether its looping
michael@0 2195 // over a parenthesized block or a single item
michael@0 2196 fMatchCloseParen = fRXPat->fCompiledPat->size();
michael@0 2197 }
michael@0 2198
michael@0 2199
michael@0 2200
michael@0 2201 //------------------------------------------------------------------------------
michael@0 2202 //
michael@0 2203 // compileSet Compile the pattern operations for a reference to a
michael@0 2204 // UnicodeSet.
michael@0 2205 //
michael@0 2206 //------------------------------------------------------------------------------
michael@0 2207 void RegexCompile::compileSet(UnicodeSet *theSet)
michael@0 2208 {
michael@0 2209 if (theSet == NULL) {
michael@0 2210 return;
michael@0 2211 }
michael@0 2212 // Remove any strings from the set.
michael@0 2213 // There shoudn't be any, but just in case.
michael@0 2214 // (Case Closure can add them; if we had a simple case closure avaialble that
michael@0 2215 // ignored strings, that would be better.)
michael@0 2216 theSet->removeAllStrings();
michael@0 2217 int32_t setSize = theSet->size();
michael@0 2218
michael@0 2219 switch (setSize) {
michael@0 2220 case 0:
michael@0 2221 {
michael@0 2222 // Set of no elements. Always fails to match.
michael@0 2223 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKTRACK, 0), *fStatus);
michael@0 2224 delete theSet;
michael@0 2225 }
michael@0 2226 break;
michael@0 2227
michael@0 2228 case 1:
michael@0 2229 {
michael@0 2230 // The set contains only a single code point. Put it into
michael@0 2231 // the compiled pattern as a single char operation rather
michael@0 2232 // than a set, and discard the set itself.
michael@0 2233 literalChar(theSet->charAt(0));
michael@0 2234 delete theSet;
michael@0 2235 }
michael@0 2236 break;
michael@0 2237
michael@0 2238 default:
michael@0 2239 {
michael@0 2240 // The set contains two or more chars. (the normal case)
michael@0 2241 // Put it into the compiled pattern as a set.
michael@0 2242 int32_t setNumber = fRXPat->fSets->size();
michael@0 2243 fRXPat->fSets->addElement(theSet, *fStatus);
michael@0 2244 int32_t setOp = URX_BUILD(URX_SETREF, setNumber);
michael@0 2245 fRXPat->fCompiledPat->addElement(setOp, *fStatus);
michael@0 2246 }
michael@0 2247 }
michael@0 2248 }
michael@0 2249
michael@0 2250
michael@0 2251 //------------------------------------------------------------------------------
michael@0 2252 //
michael@0 2253 // compileInterval Generate the code for a {min, max} style interval quantifier.
michael@0 2254 // Except for the specific opcodes used, the code is the same
michael@0 2255 // for all three types (greedy, non-greedy, possessive) of
michael@0 2256 // intervals. The opcodes are supplied as parameters.
michael@0 2257 // (There are two sets of opcodes - greedy & possessive use the
michael@0 2258 // same ones, while non-greedy has it's own.)
michael@0 2259 //
michael@0 2260 // The code for interval loops has this form:
michael@0 2261 // 0 CTR_INIT counter loc (in stack frame)
michael@0 2262 // 1 5 patt address of CTR_LOOP at bottom of block
michael@0 2263 // 2 min count
michael@0 2264 // 3 max count (-1 for unbounded)
michael@0 2265 // 4 ... block to be iterated over
michael@0 2266 // 5 CTR_LOOP
michael@0 2267 //
michael@0 2268 // In
michael@0 2269 //------------------------------------------------------------------------------
michael@0 2270 void RegexCompile::compileInterval(int32_t InitOp, int32_t LoopOp)
michael@0 2271 {
michael@0 2272 // The CTR_INIT op at the top of the block with the {n,m} quantifier takes
michael@0 2273 // four slots in the compiled code. Reserve them.
michael@0 2274 int32_t topOfBlock = blockTopLoc(TRUE);
michael@0 2275 insertOp(topOfBlock);
michael@0 2276 insertOp(topOfBlock);
michael@0 2277 insertOp(topOfBlock);
michael@0 2278
michael@0 2279 // The operands for the CTR_INIT opcode include the index in the matcher data
michael@0 2280 // of the counter. Allocate it now. There are two data items
michael@0 2281 // counterLoc --> Loop counter
michael@0 2282 // +1 --> Input index (for breaking non-progressing loops)
michael@0 2283 // (Only present if unbounded upper limit on loop)
michael@0 2284 int32_t counterLoc = fRXPat->fFrameSize;
michael@0 2285 fRXPat->fFrameSize++;
michael@0 2286 if (fIntervalUpper < 0) {
michael@0 2287 fRXPat->fFrameSize++;
michael@0 2288 }
michael@0 2289
michael@0 2290 int32_t op = URX_BUILD(InitOp, counterLoc);
michael@0 2291 fRXPat->fCompiledPat->setElementAt(op, topOfBlock);
michael@0 2292
michael@0 2293 // The second operand of CTR_INIT is the location following the end of the loop.
michael@0 2294 // Must put in as a URX_RELOC_OPRND so that the value will be adjusted if the
michael@0 2295 // compilation of something later on causes the code to grow and the target
michael@0 2296 // position to move.
michael@0 2297 int32_t loopEnd = fRXPat->fCompiledPat->size();
michael@0 2298 op = URX_BUILD(URX_RELOC_OPRND, loopEnd);
michael@0 2299 fRXPat->fCompiledPat->setElementAt(op, topOfBlock+1);
michael@0 2300
michael@0 2301 // Followed by the min and max counts.
michael@0 2302 fRXPat->fCompiledPat->setElementAt(fIntervalLow, topOfBlock+2);
michael@0 2303 fRXPat->fCompiledPat->setElementAt(fIntervalUpper, topOfBlock+3);
michael@0 2304
michael@0 2305 // Apend the CTR_LOOP op. The operand is the location of the CTR_INIT op.
michael@0 2306 // Goes at end of the block being looped over, so just append to the code so far.
michael@0 2307 op = URX_BUILD(LoopOp, topOfBlock);
michael@0 2308 fRXPat->fCompiledPat->addElement(op, *fStatus);
michael@0 2309
michael@0 2310 if ((fIntervalLow & 0xff000000) != 0 ||
michael@0 2311 (fIntervalUpper > 0 && (fIntervalUpper & 0xff000000) != 0)) {
michael@0 2312 error(U_REGEX_NUMBER_TOO_BIG);
michael@0 2313 }
michael@0 2314
michael@0 2315 if (fIntervalLow > fIntervalUpper && fIntervalUpper != -1) {
michael@0 2316 error(U_REGEX_MAX_LT_MIN);
michael@0 2317 }
michael@0 2318 }
michael@0 2319
michael@0 2320
michael@0 2321
michael@0 2322 UBool RegexCompile::compileInlineInterval() {
michael@0 2323 if (fIntervalUpper > 10 || fIntervalUpper < fIntervalLow) {
michael@0 2324 // Too big to inline. Fail, which will cause looping code to be generated.
michael@0 2325 // (Upper < Lower picks up unbounded upper and errors, both.)
michael@0 2326 return FALSE;
michael@0 2327 }
michael@0 2328
michael@0 2329 int32_t topOfBlock = blockTopLoc(FALSE);
michael@0 2330 if (fIntervalUpper == 0) {
michael@0 2331 // Pathological case. Attempt no matches, as if the block doesn't exist.
michael@0 2332 fRXPat->fCompiledPat->setSize(topOfBlock);
michael@0 2333 return TRUE;
michael@0 2334 }
michael@0 2335
michael@0 2336 if (topOfBlock != fRXPat->fCompiledPat->size()-1 && fIntervalUpper != 1) {
michael@0 2337 // The thing being repeated is not a single op, but some
michael@0 2338 // more complex block. Do it as a loop, not inlines.
michael@0 2339 // Note that things "repeated" a max of once are handled as inline, because
michael@0 2340 // the one copy of the code already generated is just fine.
michael@0 2341 return FALSE;
michael@0 2342 }
michael@0 2343
michael@0 2344 // Pick up the opcode that is to be repeated
michael@0 2345 //
michael@0 2346 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(topOfBlock);
michael@0 2347
michael@0 2348 // Compute the pattern location where the inline sequence
michael@0 2349 // will end, and set up the state save op that will be needed.
michael@0 2350 //
michael@0 2351 int32_t endOfSequenceLoc = fRXPat->fCompiledPat->size()-1
michael@0 2352 + fIntervalUpper + (fIntervalUpper-fIntervalLow);
michael@0 2353 int32_t saveOp = URX_BUILD(URX_STATE_SAVE, endOfSequenceLoc);
michael@0 2354 if (fIntervalLow == 0) {
michael@0 2355 insertOp(topOfBlock);
michael@0 2356 fRXPat->fCompiledPat->setElementAt(saveOp, topOfBlock);
michael@0 2357 }
michael@0 2358
michael@0 2359
michael@0 2360
michael@0 2361 // Loop, emitting the op for the thing being repeated each time.
michael@0 2362 // Loop starts at 1 because one instance of the op already exists in the pattern,
michael@0 2363 // it was put there when it was originally encountered.
michael@0 2364 int32_t i;
michael@0 2365 for (i=1; i<fIntervalUpper; i++ ) {
michael@0 2366 if (i == fIntervalLow) {
michael@0 2367 fRXPat->fCompiledPat->addElement(saveOp, *fStatus);
michael@0 2368 }
michael@0 2369 if (i > fIntervalLow) {
michael@0 2370 fRXPat->fCompiledPat->addElement(saveOp, *fStatus);
michael@0 2371 }
michael@0 2372 fRXPat->fCompiledPat->addElement(op, *fStatus);
michael@0 2373 }
michael@0 2374 return TRUE;
michael@0 2375 }
michael@0 2376
michael@0 2377
michael@0 2378
michael@0 2379 //------------------------------------------------------------------------------
michael@0 2380 //
michael@0 2381 // matchStartType Determine how a match can start.
michael@0 2382 // Used to optimize find() operations.
michael@0 2383 //
michael@0 2384 // Operation is very similar to minMatchLength(). Walk the compiled
michael@0 2385 // pattern, keeping an on-going minimum-match-length. For any
michael@0 2386 // op where the min match coming in is zero, add that ops possible
michael@0 2387 // starting matches to the possible starts for the overall pattern.
michael@0 2388 //
michael@0 2389 //------------------------------------------------------------------------------
michael@0 2390 void RegexCompile::matchStartType() {
michael@0 2391 if (U_FAILURE(*fStatus)) {
michael@0 2392 return;
michael@0 2393 }
michael@0 2394
michael@0 2395
michael@0 2396 int32_t loc; // Location in the pattern of the current op being processed.
michael@0 2397 int32_t op; // The op being processed
michael@0 2398 int32_t opType; // The opcode type of the op
michael@0 2399 int32_t currentLen = 0; // Minimum length of a match to this point (loc) in the pattern
michael@0 2400 int32_t numInitialStrings = 0; // Number of strings encountered that could match at start.
michael@0 2401
michael@0 2402 UBool atStart = TRUE; // True if no part of the pattern yet encountered
michael@0 2403 // could have advanced the position in a match.
michael@0 2404 // (Maximum match length so far == 0)
michael@0 2405
michael@0 2406 // forwardedLength is a vector holding minimum-match-length values that
michael@0 2407 // are propagated forward in the pattern by JMP or STATE_SAVE operations.
michael@0 2408 // It must be one longer than the pattern being checked because some ops
michael@0 2409 // will jmp to a end-of-block+1 location from within a block, and we must
michael@0 2410 // count those when checking the block.
michael@0 2411 int32_t end = fRXPat->fCompiledPat->size();
michael@0 2412 UVector32 forwardedLength(end+1, *fStatus);
michael@0 2413 forwardedLength.setSize(end+1);
michael@0 2414 for (loc=3; loc<end; loc++) {
michael@0 2415 forwardedLength.setElementAt(INT32_MAX, loc);
michael@0 2416 }
michael@0 2417
michael@0 2418 for (loc = 3; loc<end; loc++) {
michael@0 2419 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
michael@0 2420 opType = URX_TYPE(op);
michael@0 2421
michael@0 2422 // The loop is advancing linearly through the pattern.
michael@0 2423 // If the op we are now at was the destination of a branch in the pattern,
michael@0 2424 // and that path has a shorter minimum length than the current accumulated value,
michael@0 2425 // replace the current accumulated value.
michael@0 2426 if (forwardedLength.elementAti(loc) < currentLen) {
michael@0 2427 currentLen = forwardedLength.elementAti(loc);
michael@0 2428 U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
michael@0 2429 }
michael@0 2430
michael@0 2431 switch (opType) {
michael@0 2432 // Ops that don't change the total length matched
michael@0 2433 case URX_RESERVED_OP:
michael@0 2434 case URX_END:
michael@0 2435 case URX_FAIL:
michael@0 2436 case URX_STRING_LEN:
michael@0 2437 case URX_NOP:
michael@0 2438 case URX_START_CAPTURE:
michael@0 2439 case URX_END_CAPTURE:
michael@0 2440 case URX_BACKSLASH_B:
michael@0 2441 case URX_BACKSLASH_BU:
michael@0 2442 case URX_BACKSLASH_G:
michael@0 2443 case URX_BACKSLASH_Z:
michael@0 2444 case URX_DOLLAR:
michael@0 2445 case URX_DOLLAR_M:
michael@0 2446 case URX_DOLLAR_D:
michael@0 2447 case URX_DOLLAR_MD:
michael@0 2448 case URX_RELOC_OPRND:
michael@0 2449 case URX_STO_INP_LOC:
michael@0 2450 case URX_BACKREF: // BackRef. Must assume that it might be a zero length match
michael@0 2451 case URX_BACKREF_I:
michael@0 2452
michael@0 2453 case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match.
michael@0 2454 case URX_LD_SP:
michael@0 2455 break;
michael@0 2456
michael@0 2457 case URX_CARET:
michael@0 2458 if (atStart) {
michael@0 2459 fRXPat->fStartType = START_START;
michael@0 2460 }
michael@0 2461 break;
michael@0 2462
michael@0 2463 case URX_CARET_M:
michael@0 2464 case URX_CARET_M_UNIX:
michael@0 2465 if (atStart) {
michael@0 2466 fRXPat->fStartType = START_LINE;
michael@0 2467 }
michael@0 2468 break;
michael@0 2469
michael@0 2470 case URX_ONECHAR:
michael@0 2471 if (currentLen == 0) {
michael@0 2472 // This character could appear at the start of a match.
michael@0 2473 // Add it to the set of possible starting characters.
michael@0 2474 fRXPat->fInitialChars->add(URX_VAL(op));
michael@0 2475 numInitialStrings += 2;
michael@0 2476 }
michael@0 2477 currentLen++;
michael@0 2478 atStart = FALSE;
michael@0 2479 break;
michael@0 2480
michael@0 2481
michael@0 2482 case URX_SETREF:
michael@0 2483 if (currentLen == 0) {
michael@0 2484 int32_t sn = URX_VAL(op);
michael@0 2485 U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
michael@0 2486 const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn);
michael@0 2487 fRXPat->fInitialChars->addAll(*s);
michael@0 2488 numInitialStrings += 2;
michael@0 2489 }
michael@0 2490 currentLen++;
michael@0 2491 atStart = FALSE;
michael@0 2492 break;
michael@0 2493
michael@0 2494 case URX_LOOP_SR_I:
michael@0 2495 // [Set]*, like a SETREF, above, in what it can match,
michael@0 2496 // but may not match at all, so currentLen is not incremented.
michael@0 2497 if (currentLen == 0) {
michael@0 2498 int32_t sn = URX_VAL(op);
michael@0 2499 U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
michael@0 2500 const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn);
michael@0 2501 fRXPat->fInitialChars->addAll(*s);
michael@0 2502 numInitialStrings += 2;
michael@0 2503 }
michael@0 2504 atStart = FALSE;
michael@0 2505 break;
michael@0 2506
michael@0 2507 case URX_LOOP_DOT_I:
michael@0 2508 if (currentLen == 0) {
michael@0 2509 // .* at the start of a pattern.
michael@0 2510 // Any character can begin the match.
michael@0 2511 fRXPat->fInitialChars->clear();
michael@0 2512 fRXPat->fInitialChars->complement();
michael@0 2513 numInitialStrings += 2;
michael@0 2514 }
michael@0 2515 atStart = FALSE;
michael@0 2516 break;
michael@0 2517
michael@0 2518
michael@0 2519 case URX_STATIC_SETREF:
michael@0 2520 if (currentLen == 0) {
michael@0 2521 int32_t sn = URX_VAL(op);
michael@0 2522 U_ASSERT(sn>0 && sn<URX_LAST_SET);
michael@0 2523 const UnicodeSet *s = fRXPat->fStaticSets[sn];
michael@0 2524 fRXPat->fInitialChars->addAll(*s);
michael@0 2525 numInitialStrings += 2;
michael@0 2526 }
michael@0 2527 currentLen++;
michael@0 2528 atStart = FALSE;
michael@0 2529 break;
michael@0 2530
michael@0 2531
michael@0 2532
michael@0 2533 case URX_STAT_SETREF_N:
michael@0 2534 if (currentLen == 0) {
michael@0 2535 int32_t sn = URX_VAL(op);
michael@0 2536 const UnicodeSet *s = fRXPat->fStaticSets[sn];
michael@0 2537 UnicodeSet sc(*s);
michael@0 2538 sc.complement();
michael@0 2539 fRXPat->fInitialChars->addAll(sc);
michael@0 2540 numInitialStrings += 2;
michael@0 2541 }
michael@0 2542 currentLen++;
michael@0 2543 atStart = FALSE;
michael@0 2544 break;
michael@0 2545
michael@0 2546
michael@0 2547
michael@0 2548 case URX_BACKSLASH_D:
michael@0 2549 // Digit Char
michael@0 2550 if (currentLen == 0) {
michael@0 2551 UnicodeSet s;
michael@0 2552 s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
michael@0 2553 if (URX_VAL(op) != 0) {
michael@0 2554 s.complement();
michael@0 2555 }
michael@0 2556 fRXPat->fInitialChars->addAll(s);
michael@0 2557 numInitialStrings += 2;
michael@0 2558 }
michael@0 2559 currentLen++;
michael@0 2560 atStart = FALSE;
michael@0 2561 break;
michael@0 2562
michael@0 2563
michael@0 2564 case URX_ONECHAR_I:
michael@0 2565 // Case Insensitive Single Character.
michael@0 2566 if (currentLen == 0) {
michael@0 2567 UChar32 c = URX_VAL(op);
michael@0 2568 if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
michael@0 2569
michael@0 2570 // Disable optimizations on first char of match.
michael@0 2571 // TODO: Compute the set of chars that case fold to this char, or to
michael@0 2572 // a string that begins with this char.
michael@0 2573 // For simple case folding, this code worked:
michael@0 2574 // UnicodeSet s(c, c);
michael@0 2575 // s.closeOver(USET_CASE_INSENSITIVE);
michael@0 2576 // fRXPat->fInitialChars->addAll(s);
michael@0 2577
michael@0 2578 fRXPat->fInitialChars->clear();
michael@0 2579 fRXPat->fInitialChars->complement();
michael@0 2580 } else {
michael@0 2581 // Char has no case variants. Just add it as-is to the
michael@0 2582 // set of possible starting chars.
michael@0 2583 fRXPat->fInitialChars->add(c);
michael@0 2584 }
michael@0 2585 numInitialStrings += 2;
michael@0 2586 }
michael@0 2587 currentLen++;
michael@0 2588 atStart = FALSE;
michael@0 2589 break;
michael@0 2590
michael@0 2591
michael@0 2592 case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded.
michael@0 2593 case URX_DOTANY_ALL: // . matches one or two.
michael@0 2594 case URX_DOTANY:
michael@0 2595 case URX_DOTANY_UNIX:
michael@0 2596 if (currentLen == 0) {
michael@0 2597 // These constructs are all bad news when they appear at the start
michael@0 2598 // of a match. Any character can begin the match.
michael@0 2599 fRXPat->fInitialChars->clear();
michael@0 2600 fRXPat->fInitialChars->complement();
michael@0 2601 numInitialStrings += 2;
michael@0 2602 }
michael@0 2603 currentLen++;
michael@0 2604 atStart = FALSE;
michael@0 2605 break;
michael@0 2606
michael@0 2607
michael@0 2608 case URX_JMPX:
michael@0 2609 loc++; // Except for extra operand on URX_JMPX, same as URX_JMP.
michael@0 2610 case URX_JMP:
michael@0 2611 {
michael@0 2612 int32_t jmpDest = URX_VAL(op);
michael@0 2613 if (jmpDest < loc) {
michael@0 2614 // Loop of some kind. Can safely ignore, the worst that will happen
michael@0 2615 // is that we understate the true minimum length
michael@0 2616 currentLen = forwardedLength.elementAti(loc+1);
michael@0 2617
michael@0 2618 } else {
michael@0 2619 // Forward jump. Propagate the current min length to the target loc of the jump.
michael@0 2620 U_ASSERT(jmpDest <= end+1);
michael@0 2621 if (forwardedLength.elementAti(jmpDest) > currentLen) {
michael@0 2622 forwardedLength.setElementAt(currentLen, jmpDest);
michael@0 2623 }
michael@0 2624 }
michael@0 2625 }
michael@0 2626 atStart = FALSE;
michael@0 2627 break;
michael@0 2628
michael@0 2629 case URX_JMP_SAV:
michael@0 2630 case URX_JMP_SAV_X:
michael@0 2631 // Combo of state save to the next loc, + jmp backwards.
michael@0 2632 // Net effect on min. length computation is nothing.
michael@0 2633 atStart = FALSE;
michael@0 2634 break;
michael@0 2635
michael@0 2636 case URX_BACKTRACK:
michael@0 2637 // Fails are kind of like a branch, except that the min length was
michael@0 2638 // propagated already, by the state save.
michael@0 2639 currentLen = forwardedLength.elementAti(loc+1);
michael@0 2640 atStart = FALSE;
michael@0 2641 break;
michael@0 2642
michael@0 2643
michael@0 2644 case URX_STATE_SAVE:
michael@0 2645 {
michael@0 2646 // State Save, for forward jumps, propagate the current minimum.
michael@0 2647 // of the state save.
michael@0 2648 int32_t jmpDest = URX_VAL(op);
michael@0 2649 if (jmpDest > loc) {
michael@0 2650 if (currentLen < forwardedLength.elementAti(jmpDest)) {
michael@0 2651 forwardedLength.setElementAt(currentLen, jmpDest);
michael@0 2652 }
michael@0 2653 }
michael@0 2654 }
michael@0 2655 atStart = FALSE;
michael@0 2656 break;
michael@0 2657
michael@0 2658
michael@0 2659
michael@0 2660
michael@0 2661 case URX_STRING:
michael@0 2662 {
michael@0 2663 loc++;
michael@0 2664 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
michael@0 2665 int32_t stringLen = URX_VAL(stringLenOp);
michael@0 2666 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
michael@0 2667 U_ASSERT(stringLenOp >= 2);
michael@0 2668 if (currentLen == 0) {
michael@0 2669 // Add the starting character of this string to the set of possible starting
michael@0 2670 // characters for this pattern.
michael@0 2671 int32_t stringStartIdx = URX_VAL(op);
michael@0 2672 UChar32 c = fRXPat->fLiteralText.char32At(stringStartIdx);
michael@0 2673 fRXPat->fInitialChars->add(c);
michael@0 2674
michael@0 2675 // Remember this string. After the entire pattern has been checked,
michael@0 2676 // if nothing else is identified that can start a match, we'll use it.
michael@0 2677 numInitialStrings++;
michael@0 2678 fRXPat->fInitialStringIdx = stringStartIdx;
michael@0 2679 fRXPat->fInitialStringLen = stringLen;
michael@0 2680 }
michael@0 2681
michael@0 2682 currentLen += stringLen;
michael@0 2683 atStart = FALSE;
michael@0 2684 }
michael@0 2685 break;
michael@0 2686
michael@0 2687 case URX_STRING_I:
michael@0 2688 {
michael@0 2689 // Case-insensitive string. Unlike exact-match strings, we won't
michael@0 2690 // attempt a string search for possible match positions. But we
michael@0 2691 // do update the set of possible starting characters.
michael@0 2692 loc++;
michael@0 2693 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
michael@0 2694 int32_t stringLen = URX_VAL(stringLenOp);
michael@0 2695 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
michael@0 2696 U_ASSERT(stringLenOp >= 2);
michael@0 2697 if (currentLen == 0) {
michael@0 2698 // Add the starting character of this string to the set of possible starting
michael@0 2699 // characters for this pattern.
michael@0 2700 int32_t stringStartIdx = URX_VAL(op);
michael@0 2701 UChar32 c = fRXPat->fLiteralText.char32At(stringStartIdx);
michael@0 2702 UnicodeSet s(c, c);
michael@0 2703
michael@0 2704 // TODO: compute correct set of starting chars for full case folding.
michael@0 2705 // For the moment, say any char can start.
michael@0 2706 // s.closeOver(USET_CASE_INSENSITIVE);
michael@0 2707 s.clear();
michael@0 2708 s.complement();
michael@0 2709
michael@0 2710 fRXPat->fInitialChars->addAll(s);
michael@0 2711 numInitialStrings += 2; // Matching on an initial string not possible.
michael@0 2712 }
michael@0 2713 currentLen += stringLen;
michael@0 2714 atStart = FALSE;
michael@0 2715 }
michael@0 2716 break;
michael@0 2717
michael@0 2718 case URX_CTR_INIT:
michael@0 2719 case URX_CTR_INIT_NG:
michael@0 2720 {
michael@0 2721 // Loop Init Ops. These don't change the min length, but they are 4 word ops
michael@0 2722 // so location must be updated accordingly.
michael@0 2723 // Loop Init Ops.
michael@0 2724 // If the min loop count == 0
michael@0 2725 // move loc forwards to the end of the loop, skipping over the body.
michael@0 2726 // If the min count is > 0,
michael@0 2727 // continue normal processing of the body of the loop.
michael@0 2728 int32_t loopEndLoc = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1);
michael@0 2729 loopEndLoc = URX_VAL(loopEndLoc);
michael@0 2730 int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2);
michael@0 2731 if (minLoopCount == 0) {
michael@0 2732 // Min Loop Count of 0, treat like a forward branch and
michael@0 2733 // move the current minimum length up to the target
michael@0 2734 // (end of loop) location.
michael@0 2735 U_ASSERT(loopEndLoc <= end+1);
michael@0 2736 if (forwardedLength.elementAti(loopEndLoc) > currentLen) {
michael@0 2737 forwardedLength.setElementAt(currentLen, loopEndLoc);
michael@0 2738 }
michael@0 2739 }
michael@0 2740 loc+=3; // Skips over operands of CTR_INIT
michael@0 2741 }
michael@0 2742 atStart = FALSE;
michael@0 2743 break;
michael@0 2744
michael@0 2745
michael@0 2746 case URX_CTR_LOOP:
michael@0 2747 case URX_CTR_LOOP_NG:
michael@0 2748 // Loop ops.
michael@0 2749 // The jump is conditional, backwards only.
michael@0 2750 atStart = FALSE;
michael@0 2751 break;
michael@0 2752
michael@0 2753 case URX_LOOP_C:
michael@0 2754 // More loop ops. These state-save to themselves.
michael@0 2755 // don't change the minimum match
michael@0 2756 atStart = FALSE;
michael@0 2757 break;
michael@0 2758
michael@0 2759
michael@0 2760 case URX_LA_START:
michael@0 2761 case URX_LB_START:
michael@0 2762 {
michael@0 2763 // Look-around. Scan forward until the matching look-ahead end,
michael@0 2764 // without processing the look-around block. This is overly pessimistic.
michael@0 2765
michael@0 2766 // Keep track of the nesting depth of look-around blocks. Boilerplate code for
michael@0 2767 // lookahead contains two LA_END instructions, so count goes up by two
michael@0 2768 // for each LA_START.
michael@0 2769 int32_t depth = (opType == URX_LA_START? 2: 1);
michael@0 2770 for (;;) {
michael@0 2771 loc++;
michael@0 2772 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
michael@0 2773 if (URX_TYPE(op) == URX_LA_START) {
michael@0 2774 depth+=2;
michael@0 2775 }
michael@0 2776 if (URX_TYPE(op) == URX_LB_START) {
michael@0 2777 depth++;
michael@0 2778 }
michael@0 2779 if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) {
michael@0 2780 depth--;
michael@0 2781 if (depth == 0) {
michael@0 2782 break;
michael@0 2783 }
michael@0 2784 }
michael@0 2785 if (URX_TYPE(op) == URX_STATE_SAVE) {
michael@0 2786 // Need this because neg lookahead blocks will FAIL to outside
michael@0 2787 // of the block.
michael@0 2788 int32_t jmpDest = URX_VAL(op);
michael@0 2789 if (jmpDest > loc) {
michael@0 2790 if (currentLen < forwardedLength.elementAti(jmpDest)) {
michael@0 2791 forwardedLength.setElementAt(currentLen, jmpDest);
michael@0 2792 }
michael@0 2793 }
michael@0 2794 }
michael@0 2795 U_ASSERT(loc <= end);
michael@0 2796 }
michael@0 2797 }
michael@0 2798 break;
michael@0 2799
michael@0 2800 case URX_LA_END:
michael@0 2801 case URX_LB_CONT:
michael@0 2802 case URX_LB_END:
michael@0 2803 case URX_LBN_CONT:
michael@0 2804 case URX_LBN_END:
michael@0 2805 U_ASSERT(FALSE); // Shouldn't get here. These ops should be
michael@0 2806 // consumed by the scan in URX_LA_START and LB_START
michael@0 2807
michael@0 2808 break;
michael@0 2809
michael@0 2810 default:
michael@0 2811 U_ASSERT(FALSE);
michael@0 2812 }
michael@0 2813
michael@0 2814 }
michael@0 2815
michael@0 2816
michael@0 2817 // We have finished walking through the ops. Check whether some forward jump
michael@0 2818 // propagated a shorter length to location end+1.
michael@0 2819 if (forwardedLength.elementAti(end+1) < currentLen) {
michael@0 2820 currentLen = forwardedLength.elementAti(end+1);
michael@0 2821 }
michael@0 2822
michael@0 2823
michael@0 2824 fRXPat->fInitialChars8->init(fRXPat->fInitialChars);
michael@0 2825
michael@0 2826
michael@0 2827 // Sort out what we should check for when looking for candidate match start positions.
michael@0 2828 // In order of preference,
michael@0 2829 // 1. Start of input text buffer.
michael@0 2830 // 2. A literal string.
michael@0 2831 // 3. Start of line in multi-line mode.
michael@0 2832 // 4. A single literal character.
michael@0 2833 // 5. A character from a set of characters.
michael@0 2834 //
michael@0 2835 if (fRXPat->fStartType == START_START) {
michael@0 2836 // Match only at the start of an input text string.
michael@0 2837 // start type is already set. We're done.
michael@0 2838 } else if (numInitialStrings == 1 && fRXPat->fMinMatchLen > 0) {
michael@0 2839 // Match beginning only with a literal string.
michael@0 2840 UChar32 c = fRXPat->fLiteralText.char32At(fRXPat->fInitialStringIdx);
michael@0 2841 U_ASSERT(fRXPat->fInitialChars->contains(c));
michael@0 2842 fRXPat->fStartType = START_STRING;
michael@0 2843 fRXPat->fInitialChar = c;
michael@0 2844 } else if (fRXPat->fStartType == START_LINE) {
michael@0 2845 // Match at start of line in Multi-Line mode.
michael@0 2846 // Nothing to do here; everything is already set.
michael@0 2847 } else if (fRXPat->fMinMatchLen == 0) {
michael@0 2848 // Zero length match possible. We could start anywhere.
michael@0 2849 fRXPat->fStartType = START_NO_INFO;
michael@0 2850 } else if (fRXPat->fInitialChars->size() == 1) {
michael@0 2851 // All matches begin with the same char.
michael@0 2852 fRXPat->fStartType = START_CHAR;
michael@0 2853 fRXPat->fInitialChar = fRXPat->fInitialChars->charAt(0);
michael@0 2854 U_ASSERT(fRXPat->fInitialChar != (UChar32)-1);
michael@0 2855 } else if (fRXPat->fInitialChars->contains((UChar32)0, (UChar32)0x10ffff) == FALSE &&
michael@0 2856 fRXPat->fMinMatchLen > 0) {
michael@0 2857 // Matches start with a set of character smaller than the set of all chars.
michael@0 2858 fRXPat->fStartType = START_SET;
michael@0 2859 } else {
michael@0 2860 // Matches can start with anything
michael@0 2861 fRXPat->fStartType = START_NO_INFO;
michael@0 2862 }
michael@0 2863
michael@0 2864 return;
michael@0 2865 }
michael@0 2866
michael@0 2867
michael@0 2868
michael@0 2869 //------------------------------------------------------------------------------
michael@0 2870 //
michael@0 2871 // minMatchLength Calculate the length of the shortest string that could
michael@0 2872 // match the specified pattern.
michael@0 2873 // Length is in 16 bit code units, not code points.
michael@0 2874 //
michael@0 2875 // The calculated length may not be exact. The returned
michael@0 2876 // value may be shorter than the actual minimum; it must
michael@0 2877 // never be longer.
michael@0 2878 //
michael@0 2879 // start and end are the range of p-code operations to be
michael@0 2880 // examined. The endpoints are included in the range.
michael@0 2881 //
michael@0 2882 //------------------------------------------------------------------------------
michael@0 2883 int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) {
michael@0 2884 if (U_FAILURE(*fStatus)) {
michael@0 2885 return 0;
michael@0 2886 }
michael@0 2887
michael@0 2888 U_ASSERT(start <= end);
michael@0 2889 U_ASSERT(end < fRXPat->fCompiledPat->size());
michael@0 2890
michael@0 2891
michael@0 2892 int32_t loc;
michael@0 2893 int32_t op;
michael@0 2894 int32_t opType;
michael@0 2895 int32_t currentLen = 0;
michael@0 2896
michael@0 2897
michael@0 2898 // forwardedLength is a vector holding minimum-match-length values that
michael@0 2899 // are propagated forward in the pattern by JMP or STATE_SAVE operations.
michael@0 2900 // It must be one longer than the pattern being checked because some ops
michael@0 2901 // will jmp to a end-of-block+1 location from within a block, and we must
michael@0 2902 // count those when checking the block.
michael@0 2903 UVector32 forwardedLength(end+2, *fStatus);
michael@0 2904 forwardedLength.setSize(end+2);
michael@0 2905 for (loc=start; loc<=end+1; loc++) {
michael@0 2906 forwardedLength.setElementAt(INT32_MAX, loc);
michael@0 2907 }
michael@0 2908
michael@0 2909 for (loc = start; loc<=end; loc++) {
michael@0 2910 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
michael@0 2911 opType = URX_TYPE(op);
michael@0 2912
michael@0 2913 // The loop is advancing linearly through the pattern.
michael@0 2914 // If the op we are now at was the destination of a branch in the pattern,
michael@0 2915 // and that path has a shorter minimum length than the current accumulated value,
michael@0 2916 // replace the current accumulated value.
michael@0 2917 // U_ASSERT(currentLen>=0 && currentLen < INT32_MAX); // MinLength == INT32_MAX for some
michael@0 2918 // no-match-possible cases.
michael@0 2919 if (forwardedLength.elementAti(loc) < currentLen) {
michael@0 2920 currentLen = forwardedLength.elementAti(loc);
michael@0 2921 U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
michael@0 2922 }
michael@0 2923
michael@0 2924 switch (opType) {
michael@0 2925 // Ops that don't change the total length matched
michael@0 2926 case URX_RESERVED_OP:
michael@0 2927 case URX_END:
michael@0 2928 case URX_STRING_LEN:
michael@0 2929 case URX_NOP:
michael@0 2930 case URX_START_CAPTURE:
michael@0 2931 case URX_END_CAPTURE:
michael@0 2932 case URX_BACKSLASH_B:
michael@0 2933 case URX_BACKSLASH_BU:
michael@0 2934 case URX_BACKSLASH_G:
michael@0 2935 case URX_BACKSLASH_Z:
michael@0 2936 case URX_CARET:
michael@0 2937 case URX_DOLLAR:
michael@0 2938 case URX_DOLLAR_M:
michael@0 2939 case URX_DOLLAR_D:
michael@0 2940 case URX_DOLLAR_MD:
michael@0 2941 case URX_RELOC_OPRND:
michael@0 2942 case URX_STO_INP_LOC:
michael@0 2943 case URX_CARET_M:
michael@0 2944 case URX_CARET_M_UNIX:
michael@0 2945 case URX_BACKREF: // BackRef. Must assume that it might be a zero length match
michael@0 2946 case URX_BACKREF_I:
michael@0 2947
michael@0 2948 case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match.
michael@0 2949 case URX_LD_SP:
michael@0 2950
michael@0 2951 case URX_JMP_SAV:
michael@0 2952 case URX_JMP_SAV_X:
michael@0 2953 break;
michael@0 2954
michael@0 2955
michael@0 2956 // Ops that match a minimum of one character (one or two 16 bit code units.)
michael@0 2957 //
michael@0 2958 case URX_ONECHAR:
michael@0 2959 case URX_STATIC_SETREF:
michael@0 2960 case URX_STAT_SETREF_N:
michael@0 2961 case URX_SETREF:
michael@0 2962 case URX_BACKSLASH_D:
michael@0 2963 case URX_ONECHAR_I:
michael@0 2964 case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded.
michael@0 2965 case URX_DOTANY_ALL: // . matches one or two.
michael@0 2966 case URX_DOTANY:
michael@0 2967 case URX_DOTANY_UNIX:
michael@0 2968 currentLen++;
michael@0 2969 break;
michael@0 2970
michael@0 2971
michael@0 2972 case URX_JMPX:
michael@0 2973 loc++; // URX_JMPX has an extra operand, ignored here,
michael@0 2974 // otherwise processed identically to URX_JMP.
michael@0 2975 case URX_JMP:
michael@0 2976 {
michael@0 2977 int32_t jmpDest = URX_VAL(op);
michael@0 2978 if (jmpDest < loc) {
michael@0 2979 // Loop of some kind. Can safely ignore, the worst that will happen
michael@0 2980 // is that we understate the true minimum length
michael@0 2981 currentLen = forwardedLength.elementAti(loc+1);
michael@0 2982 } else {
michael@0 2983 // Forward jump. Propagate the current min length to the target loc of the jump.
michael@0 2984 U_ASSERT(jmpDest <= end+1);
michael@0 2985 if (forwardedLength.elementAti(jmpDest) > currentLen) {
michael@0 2986 forwardedLength.setElementAt(currentLen, jmpDest);
michael@0 2987 }
michael@0 2988 }
michael@0 2989 }
michael@0 2990 break;
michael@0 2991
michael@0 2992 case URX_BACKTRACK:
michael@0 2993 {
michael@0 2994 // Back-tracks are kind of like a branch, except that the min length was
michael@0 2995 // propagated already, by the state save.
michael@0 2996 currentLen = forwardedLength.elementAti(loc+1);
michael@0 2997 }
michael@0 2998 break;
michael@0 2999
michael@0 3000
michael@0 3001 case URX_STATE_SAVE:
michael@0 3002 {
michael@0 3003 // State Save, for forward jumps, propagate the current minimum.
michael@0 3004 // of the state save.
michael@0 3005 int32_t jmpDest = URX_VAL(op);
michael@0 3006 if (jmpDest > loc) {
michael@0 3007 if (currentLen < forwardedLength.elementAti(jmpDest)) {
michael@0 3008 forwardedLength.setElementAt(currentLen, jmpDest);
michael@0 3009 }
michael@0 3010 }
michael@0 3011 }
michael@0 3012 break;
michael@0 3013
michael@0 3014
michael@0 3015 case URX_STRING:
michael@0 3016 {
michael@0 3017 loc++;
michael@0 3018 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
michael@0 3019 currentLen += URX_VAL(stringLenOp);
michael@0 3020 }
michael@0 3021 break;
michael@0 3022
michael@0 3023
michael@0 3024 case URX_STRING_I:
michael@0 3025 {
michael@0 3026 loc++;
michael@0 3027 // TODO: with full case folding, matching input text may be shorter than
michael@0 3028 // the string we have here. More smarts could put some bounds on it.
michael@0 3029 // Assume a min length of one for now. A min length of zero causes
michael@0 3030 // optimization failures for a pattern like "string"+
michael@0 3031 // currentLen += URX_VAL(stringLenOp);
michael@0 3032 currentLen += 1;
michael@0 3033 }
michael@0 3034 break;
michael@0 3035
michael@0 3036 case URX_CTR_INIT:
michael@0 3037 case URX_CTR_INIT_NG:
michael@0 3038 {
michael@0 3039 // Loop Init Ops.
michael@0 3040 // If the min loop count == 0
michael@0 3041 // move loc forwards to the end of the loop, skipping over the body.
michael@0 3042 // If the min count is > 0,
michael@0 3043 // continue normal processing of the body of the loop.
michael@0 3044 int32_t loopEndLoc = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1);
michael@0 3045 loopEndLoc = URX_VAL(loopEndLoc);
michael@0 3046 int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2);
michael@0 3047 if (minLoopCount == 0) {
michael@0 3048 loc = loopEndLoc;
michael@0 3049 } else {
michael@0 3050 loc+=3; // Skips over operands of CTR_INIT
michael@0 3051 }
michael@0 3052 }
michael@0 3053 break;
michael@0 3054
michael@0 3055
michael@0 3056 case URX_CTR_LOOP:
michael@0 3057 case URX_CTR_LOOP_NG:
michael@0 3058 // Loop ops.
michael@0 3059 // The jump is conditional, backwards only.
michael@0 3060 break;
michael@0 3061
michael@0 3062 case URX_LOOP_SR_I:
michael@0 3063 case URX_LOOP_DOT_I:
michael@0 3064 case URX_LOOP_C:
michael@0 3065 // More loop ops. These state-save to themselves.
michael@0 3066 // don't change the minimum match - could match nothing at all.
michael@0 3067 break;
michael@0 3068
michael@0 3069
michael@0 3070 case URX_LA_START:
michael@0 3071 case URX_LB_START:
michael@0 3072 {
michael@0 3073 // Look-around. Scan forward until the matching look-ahead end,
michael@0 3074 // without processing the look-around block. This is overly pessimistic for look-ahead,
michael@0 3075 // it assumes that the look-ahead match might be zero-length.
michael@0 3076 // TODO: Positive lookahead could recursively do the block, then continue
michael@0 3077 // with the longer of the block or the value coming in. Ticket 6060
michael@0 3078 int32_t depth = (opType == URX_LA_START? 2: 1);;
michael@0 3079 for (;;) {
michael@0 3080 loc++;
michael@0 3081 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
michael@0 3082 if (URX_TYPE(op) == URX_LA_START) {
michael@0 3083 // The boilerplate for look-ahead includes two LA_END insturctions,
michael@0 3084 // Depth will be decremented by each one when it is seen.
michael@0 3085 depth += 2;
michael@0 3086 }
michael@0 3087 if (URX_TYPE(op) == URX_LB_START) {
michael@0 3088 depth++;
michael@0 3089 }
michael@0 3090 if (URX_TYPE(op) == URX_LA_END) {
michael@0 3091 depth--;
michael@0 3092 if (depth == 0) {
michael@0 3093 break;
michael@0 3094 }
michael@0 3095 }
michael@0 3096 if (URX_TYPE(op)==URX_LBN_END) {
michael@0 3097 depth--;
michael@0 3098 if (depth == 0) {
michael@0 3099 break;
michael@0 3100 }
michael@0 3101 }
michael@0 3102 if (URX_TYPE(op) == URX_STATE_SAVE) {
michael@0 3103 // Need this because neg lookahead blocks will FAIL to outside
michael@0 3104 // of the block.
michael@0 3105 int32_t jmpDest = URX_VAL(op);
michael@0 3106 if (jmpDest > loc) {
michael@0 3107 if (currentLen < forwardedLength.elementAti(jmpDest)) {
michael@0 3108 forwardedLength.setElementAt(currentLen, jmpDest);
michael@0 3109 }
michael@0 3110 }
michael@0 3111 }
michael@0 3112 U_ASSERT(loc <= end);
michael@0 3113 }
michael@0 3114 }
michael@0 3115 break;
michael@0 3116
michael@0 3117 case URX_LA_END:
michael@0 3118 case URX_LB_CONT:
michael@0 3119 case URX_LB_END:
michael@0 3120 case URX_LBN_CONT:
michael@0 3121 case URX_LBN_END:
michael@0 3122 // Only come here if the matching URX_LA_START or URX_LB_START was not in the
michael@0 3123 // range being sized, which happens when measuring size of look-behind blocks.
michael@0 3124 break;
michael@0 3125
michael@0 3126 default:
michael@0 3127 U_ASSERT(FALSE);
michael@0 3128 }
michael@0 3129
michael@0 3130 }
michael@0 3131
michael@0 3132 // We have finished walking through the ops. Check whether some forward jump
michael@0 3133 // propagated a shorter length to location end+1.
michael@0 3134 if (forwardedLength.elementAti(end+1) < currentLen) {
michael@0 3135 currentLen = forwardedLength.elementAti(end+1);
michael@0 3136 U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
michael@0 3137 }
michael@0 3138
michael@0 3139 return currentLen;
michael@0 3140 }
michael@0 3141
michael@0 3142 // Increment with overflow check.
michael@0 3143 // val and delta will both be positive.
michael@0 3144
michael@0 3145 static int32_t safeIncrement(int32_t val, int32_t delta) {
michael@0 3146 if (INT32_MAX - val > delta) {
michael@0 3147 return val + delta;
michael@0 3148 } else {
michael@0 3149 return INT32_MAX;
michael@0 3150 }
michael@0 3151 }
michael@0 3152
michael@0 3153
michael@0 3154 //------------------------------------------------------------------------------
michael@0 3155 //
michael@0 3156 // maxMatchLength Calculate the length of the longest string that could
michael@0 3157 // match the specified pattern.
michael@0 3158 // Length is in 16 bit code units, not code points.
michael@0 3159 //
michael@0 3160 // The calculated length may not be exact. The returned
michael@0 3161 // value may be longer than the actual maximum; it must
michael@0 3162 // never be shorter.
michael@0 3163 //
michael@0 3164 //------------------------------------------------------------------------------
michael@0 3165 int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) {
michael@0 3166 if (U_FAILURE(*fStatus)) {
michael@0 3167 return 0;
michael@0 3168 }
michael@0 3169 U_ASSERT(start <= end);
michael@0 3170 U_ASSERT(end < fRXPat->fCompiledPat->size());
michael@0 3171
michael@0 3172
michael@0 3173 int32_t loc;
michael@0 3174 int32_t op;
michael@0 3175 int32_t opType;
michael@0 3176 int32_t currentLen = 0;
michael@0 3177 UVector32 forwardedLength(end+1, *fStatus);
michael@0 3178 forwardedLength.setSize(end+1);
michael@0 3179
michael@0 3180 for (loc=start; loc<=end; loc++) {
michael@0 3181 forwardedLength.setElementAt(0, loc);
michael@0 3182 }
michael@0 3183
michael@0 3184 for (loc = start; loc<=end; loc++) {
michael@0 3185 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
michael@0 3186 opType = URX_TYPE(op);
michael@0 3187
michael@0 3188 // The loop is advancing linearly through the pattern.
michael@0 3189 // If the op we are now at was the destination of a branch in the pattern,
michael@0 3190 // and that path has a longer maximum length than the current accumulated value,
michael@0 3191 // replace the current accumulated value.
michael@0 3192 if (forwardedLength.elementAti(loc) > currentLen) {
michael@0 3193 currentLen = forwardedLength.elementAti(loc);
michael@0 3194 }
michael@0 3195
michael@0 3196 switch (opType) {
michael@0 3197 // Ops that don't change the total length matched
michael@0 3198 case URX_RESERVED_OP:
michael@0 3199 case URX_END:
michael@0 3200 case URX_STRING_LEN:
michael@0 3201 case URX_NOP:
michael@0 3202 case URX_START_CAPTURE:
michael@0 3203 case URX_END_CAPTURE:
michael@0 3204 case URX_BACKSLASH_B:
michael@0 3205 case URX_BACKSLASH_BU:
michael@0 3206 case URX_BACKSLASH_G:
michael@0 3207 case URX_BACKSLASH_Z:
michael@0 3208 case URX_CARET:
michael@0 3209 case URX_DOLLAR:
michael@0 3210 case URX_DOLLAR_M:
michael@0 3211 case URX_DOLLAR_D:
michael@0 3212 case URX_DOLLAR_MD:
michael@0 3213 case URX_RELOC_OPRND:
michael@0 3214 case URX_STO_INP_LOC:
michael@0 3215 case URX_CARET_M:
michael@0 3216 case URX_CARET_M_UNIX:
michael@0 3217
michael@0 3218 case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match.
michael@0 3219 case URX_LD_SP:
michael@0 3220
michael@0 3221 case URX_LB_END:
michael@0 3222 case URX_LB_CONT:
michael@0 3223 case URX_LBN_CONT:
michael@0 3224 case URX_LBN_END:
michael@0 3225 break;
michael@0 3226
michael@0 3227
michael@0 3228 // Ops that increase that cause an unbounded increase in the length
michael@0 3229 // of a matched string, or that increase it a hard to characterize way.
michael@0 3230 // Call the max length unbounded, and stop further checking.
michael@0 3231 case URX_BACKREF: // BackRef. Must assume that it might be a zero length match
michael@0 3232 case URX_BACKREF_I:
michael@0 3233 case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded.
michael@0 3234 currentLen = INT32_MAX;
michael@0 3235 break;
michael@0 3236
michael@0 3237
michael@0 3238 // Ops that match a max of one character (possibly two 16 bit code units.)
michael@0 3239 //
michael@0 3240 case URX_STATIC_SETREF:
michael@0 3241 case URX_STAT_SETREF_N:
michael@0 3242 case URX_SETREF:
michael@0 3243 case URX_BACKSLASH_D:
michael@0 3244 case URX_ONECHAR_I:
michael@0 3245 case URX_DOTANY_ALL:
michael@0 3246 case URX_DOTANY:
michael@0 3247 case URX_DOTANY_UNIX:
michael@0 3248 currentLen = safeIncrement(currentLen, 2);
michael@0 3249 break;
michael@0 3250
michael@0 3251 // Single literal character. Increase current max length by one or two,
michael@0 3252 // depending on whether the char is in the supplementary range.
michael@0 3253 case URX_ONECHAR:
michael@0 3254 currentLen = safeIncrement(currentLen, 1);
michael@0 3255 if (URX_VAL(op) > 0x10000) {
michael@0 3256 currentLen = safeIncrement(currentLen, 1);
michael@0 3257 }
michael@0 3258 break;
michael@0 3259
michael@0 3260 // Jumps.
michael@0 3261 //
michael@0 3262 case URX_JMP:
michael@0 3263 case URX_JMPX:
michael@0 3264 case URX_JMP_SAV:
michael@0 3265 case URX_JMP_SAV_X:
michael@0 3266 {
michael@0 3267 int32_t jmpDest = URX_VAL(op);
michael@0 3268 if (jmpDest < loc) {
michael@0 3269 // Loop of some kind. Max match length is unbounded.
michael@0 3270 currentLen = INT32_MAX;
michael@0 3271 } else {
michael@0 3272 // Forward jump. Propagate the current min length to the target loc of the jump.
michael@0 3273 if (forwardedLength.elementAti(jmpDest) < currentLen) {
michael@0 3274 forwardedLength.setElementAt(currentLen, jmpDest);
michael@0 3275 }
michael@0 3276 currentLen = 0;
michael@0 3277 }
michael@0 3278 }
michael@0 3279 break;
michael@0 3280
michael@0 3281 case URX_BACKTRACK:
michael@0 3282 // back-tracks are kind of like a branch, except that the max length was
michael@0 3283 // propagated already, by the state save.
michael@0 3284 currentLen = forwardedLength.elementAti(loc+1);
michael@0 3285 break;
michael@0 3286
michael@0 3287
michael@0 3288 case URX_STATE_SAVE:
michael@0 3289 {
michael@0 3290 // State Save, for forward jumps, propagate the current minimum.
michael@0 3291 // of the state save.
michael@0 3292 // For backwards jumps, they create a loop, maximum
michael@0 3293 // match length is unbounded.
michael@0 3294 int32_t jmpDest = URX_VAL(op);
michael@0 3295 if (jmpDest > loc) {
michael@0 3296 if (currentLen > forwardedLength.elementAti(jmpDest)) {
michael@0 3297 forwardedLength.setElementAt(currentLen, jmpDest);
michael@0 3298 }
michael@0 3299 } else {
michael@0 3300 currentLen = INT32_MAX;
michael@0 3301 }
michael@0 3302 }
michael@0 3303 break;
michael@0 3304
michael@0 3305
michael@0 3306
michael@0 3307
michael@0 3308 case URX_STRING:
michael@0 3309 {
michael@0 3310 loc++;
michael@0 3311 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
michael@0 3312 currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
michael@0 3313 break;
michael@0 3314 }
michael@0 3315
michael@0 3316 case URX_STRING_I:
michael@0 3317 // TODO: This code assumes that any user string that matches will be no longer
michael@0 3318 // than our compiled string, with case insensitive matching.
michael@0 3319 // Our compiled string has been case-folded already.
michael@0 3320 //
michael@0 3321 // Any matching user string will have no more code points than our
michael@0 3322 // compiled (folded) string. Folding may add code points, but
michael@0 3323 // not remove them.
michael@0 3324 //
michael@0 3325 // There is a potential problem if a supplemental code point
michael@0 3326 // case-folds to a BMP code point. In this case our compiled string
michael@0 3327 // could be shorter (in code units) than a matching user string.
michael@0 3328 //
michael@0 3329 // At this time (Unicode 6.1) there are no such characters, and this case
michael@0 3330 // is not being handled. A test, intltest regex/Bug9283, will fail if
michael@0 3331 // any problematic characters are added to Unicode.
michael@0 3332 //
michael@0 3333 // If this happens, we can make a set of the BMP chars that the
michael@0 3334 // troublesome supplementals fold to, scan our string, and bump the
michael@0 3335 // currentLen one extra for each that is found.
michael@0 3336 //
michael@0 3337 {
michael@0 3338 loc++;
michael@0 3339 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
michael@0 3340 currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
michael@0 3341 }
michael@0 3342 break;
michael@0 3343
michael@0 3344 case URX_CTR_INIT:
michael@0 3345 case URX_CTR_INIT_NG:
michael@0 3346 // For Loops, recursively call this function on the pattern for the loop body,
michael@0 3347 // then multiply the result by the maximum loop count.
michael@0 3348 {
michael@0 3349 int32_t loopEndLoc = URX_VAL(fRXPat->fCompiledPat->elementAti(loc+1));
michael@0 3350 if (loopEndLoc == loc+4) {
michael@0 3351 // Loop has an empty body. No affect on max match length.
michael@0 3352 // Continue processing with code after the loop end.
michael@0 3353 loc = loopEndLoc;
michael@0 3354 break;
michael@0 3355 }
michael@0 3356
michael@0 3357 int32_t maxLoopCount = fRXPat->fCompiledPat->elementAti(loc+3);
michael@0 3358 if (maxLoopCount == -1) {
michael@0 3359 // Unbounded Loop. No upper bound on match length.
michael@0 3360 currentLen = INT32_MAX;
michael@0 3361 break;
michael@0 3362 }
michael@0 3363
michael@0 3364 U_ASSERT(loopEndLoc >= loc+4);
michael@0 3365 int32_t blockLen = maxMatchLength(loc+4, loopEndLoc-1); // Recursive call.
michael@0 3366 if (blockLen == INT32_MAX) {
michael@0 3367 currentLen = blockLen;
michael@0 3368 break;
michael@0 3369 }
michael@0 3370 currentLen += blockLen * maxLoopCount;
michael@0 3371 loc = loopEndLoc;
michael@0 3372 break;
michael@0 3373 }
michael@0 3374
michael@0 3375 case URX_CTR_LOOP:
michael@0 3376 case URX_CTR_LOOP_NG:
michael@0 3377 // These opcodes will be skipped over by code for URX_CRT_INIT.
michael@0 3378 // We shouldn't encounter them here.
michael@0 3379 U_ASSERT(FALSE);
michael@0 3380 break;
michael@0 3381
michael@0 3382 case URX_LOOP_SR_I:
michael@0 3383 case URX_LOOP_DOT_I:
michael@0 3384 case URX_LOOP_C:
michael@0 3385 // For anything to do with loops, make the match length unbounded.
michael@0 3386 currentLen = INT32_MAX;
michael@0 3387 break;
michael@0 3388
michael@0 3389
michael@0 3390
michael@0 3391 case URX_LA_START:
michael@0 3392 case URX_LA_END:
michael@0 3393 // Look-ahead. Just ignore, treat the look-ahead block as if
michael@0 3394 // it were normal pattern. Gives a too-long match length,
michael@0 3395 // but good enough for now.
michael@0 3396 break;
michael@0 3397
michael@0 3398 // End of look-ahead ops should always be consumed by the processing at
michael@0 3399 // the URX_LA_START op.
michael@0 3400 // U_ASSERT(FALSE);
michael@0 3401 // break;
michael@0 3402
michael@0 3403 case URX_LB_START:
michael@0 3404 {
michael@0 3405 // Look-behind. Scan forward until the matching look-around end,
michael@0 3406 // without processing the look-behind block.
michael@0 3407 int32_t depth = 0;
michael@0 3408 for (;;) {
michael@0 3409 loc++;
michael@0 3410 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
michael@0 3411 if (URX_TYPE(op) == URX_LA_START || URX_TYPE(op) == URX_LB_START) {
michael@0 3412 depth++;
michael@0 3413 }
michael@0 3414 if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) {
michael@0 3415 if (depth == 0) {
michael@0 3416 break;
michael@0 3417 }
michael@0 3418 depth--;
michael@0 3419 }
michael@0 3420 U_ASSERT(loc < end);
michael@0 3421 }
michael@0 3422 }
michael@0 3423 break;
michael@0 3424
michael@0 3425 default:
michael@0 3426 U_ASSERT(FALSE);
michael@0 3427 }
michael@0 3428
michael@0 3429
michael@0 3430 if (currentLen == INT32_MAX) {
michael@0 3431 // The maximum length is unbounded.
michael@0 3432 // Stop further processing of the pattern.
michael@0 3433 break;
michael@0 3434 }
michael@0 3435
michael@0 3436 }
michael@0 3437 return currentLen;
michael@0 3438
michael@0 3439 }
michael@0 3440
michael@0 3441
michael@0 3442 //------------------------------------------------------------------------------
michael@0 3443 //
michael@0 3444 // stripNOPs Remove any NOP operations from the compiled pattern code.
michael@0 3445 // Extra NOPs are inserted for some constructs during the initial
michael@0 3446 // code generation to provide locations that may be patched later.
michael@0 3447 // Many end up unneeded, and are removed by this function.
michael@0 3448 //
michael@0 3449 // In order to minimize the number of passes through the pattern,
michael@0 3450 // back-reference fixup is also performed here (adjusting
michael@0 3451 // back-reference operands to point to the correct frame offsets).
michael@0 3452 //
michael@0 3453 //------------------------------------------------------------------------------
michael@0 3454 void RegexCompile::stripNOPs() {
michael@0 3455
michael@0 3456 if (U_FAILURE(*fStatus)) {
michael@0 3457 return;
michael@0 3458 }
michael@0 3459
michael@0 3460 int32_t end = fRXPat->fCompiledPat->size();
michael@0 3461 UVector32 deltas(end, *fStatus);
michael@0 3462
michael@0 3463 // Make a first pass over the code, computing the amount that things
michael@0 3464 // will be offset at each location in the original code.
michael@0 3465 int32_t loc;
michael@0 3466 int32_t d = 0;
michael@0 3467 for (loc=0; loc<end; loc++) {
michael@0 3468 deltas.addElement(d, *fStatus);
michael@0 3469 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
michael@0 3470 if (URX_TYPE(op) == URX_NOP) {
michael@0 3471 d++;
michael@0 3472 }
michael@0 3473 }
michael@0 3474
michael@0 3475 UnicodeString caseStringBuffer;
michael@0 3476
michael@0 3477 // Make a second pass over the code, removing the NOPs by moving following
michael@0 3478 // code up, and patching operands that refer to code locations that
michael@0 3479 // are being moved. The array of offsets from the first step is used
michael@0 3480 // to compute the new operand values.
michael@0 3481 int32_t src;
michael@0 3482 int32_t dst = 0;
michael@0 3483 for (src=0; src<end; src++) {
michael@0 3484 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(src);
michael@0 3485 int32_t opType = URX_TYPE(op);
michael@0 3486 switch (opType) {
michael@0 3487 case URX_NOP:
michael@0 3488 break;
michael@0 3489
michael@0 3490 case URX_STATE_SAVE:
michael@0 3491 case URX_JMP:
michael@0 3492 case URX_CTR_LOOP:
michael@0 3493 case URX_CTR_LOOP_NG:
michael@0 3494 case URX_RELOC_OPRND:
michael@0 3495 case URX_JMPX:
michael@0 3496 case URX_JMP_SAV:
michael@0 3497 case URX_JMP_SAV_X:
michael@0 3498 // These are instructions with operands that refer to code locations.
michael@0 3499 {
michael@0 3500 int32_t operandAddress = URX_VAL(op);
michael@0 3501 U_ASSERT(operandAddress>=0 && operandAddress<deltas.size());
michael@0 3502 int32_t fixedOperandAddress = operandAddress - deltas.elementAti(operandAddress);
michael@0 3503 op = URX_BUILD(opType, fixedOperandAddress);
michael@0 3504 fRXPat->fCompiledPat->setElementAt(op, dst);
michael@0 3505 dst++;
michael@0 3506 break;
michael@0 3507 }
michael@0 3508
michael@0 3509 case URX_BACKREF:
michael@0 3510 case URX_BACKREF_I:
michael@0 3511 {
michael@0 3512 int32_t where = URX_VAL(op);
michael@0 3513 if (where > fRXPat->fGroupMap->size()) {
michael@0 3514 error(U_REGEX_INVALID_BACK_REF);
michael@0 3515 break;
michael@0 3516 }
michael@0 3517 where = fRXPat->fGroupMap->elementAti(where-1);
michael@0 3518 op = URX_BUILD(opType, where);
michael@0 3519 fRXPat->fCompiledPat->setElementAt(op, dst);
michael@0 3520 dst++;
michael@0 3521
michael@0 3522 fRXPat->fNeedsAltInput = TRUE;
michael@0 3523 break;
michael@0 3524 }
michael@0 3525 case URX_RESERVED_OP:
michael@0 3526 case URX_RESERVED_OP_N:
michael@0 3527 case URX_BACKTRACK:
michael@0 3528 case URX_END:
michael@0 3529 case URX_ONECHAR:
michael@0 3530 case URX_STRING:
michael@0 3531 case URX_STRING_LEN:
michael@0 3532 case URX_START_CAPTURE:
michael@0 3533 case URX_END_CAPTURE:
michael@0 3534 case URX_STATIC_SETREF:
michael@0 3535 case URX_STAT_SETREF_N:
michael@0 3536 case URX_SETREF:
michael@0 3537 case URX_DOTANY:
michael@0 3538 case URX_FAIL:
michael@0 3539 case URX_BACKSLASH_B:
michael@0 3540 case URX_BACKSLASH_BU:
michael@0 3541 case URX_BACKSLASH_G:
michael@0 3542 case URX_BACKSLASH_X:
michael@0 3543 case URX_BACKSLASH_Z:
michael@0 3544 case URX_DOTANY_ALL:
michael@0 3545 case URX_BACKSLASH_D:
michael@0 3546 case URX_CARET:
michael@0 3547 case URX_DOLLAR:
michael@0 3548 case URX_CTR_INIT:
michael@0 3549 case URX_CTR_INIT_NG:
michael@0 3550 case URX_DOTANY_UNIX:
michael@0 3551 case URX_STO_SP:
michael@0 3552 case URX_LD_SP:
michael@0 3553 case URX_STO_INP_LOC:
michael@0 3554 case URX_LA_START:
michael@0 3555 case URX_LA_END:
michael@0 3556 case URX_ONECHAR_I:
michael@0 3557 case URX_STRING_I:
michael@0 3558 case URX_DOLLAR_M:
michael@0 3559 case URX_CARET_M:
michael@0 3560 case URX_CARET_M_UNIX:
michael@0 3561 case URX_LB_START:
michael@0 3562 case URX_LB_CONT:
michael@0 3563 case URX_LB_END:
michael@0 3564 case URX_LBN_CONT:
michael@0 3565 case URX_LBN_END:
michael@0 3566 case URX_LOOP_SR_I:
michael@0 3567 case URX_LOOP_DOT_I:
michael@0 3568 case URX_LOOP_C:
michael@0 3569 case URX_DOLLAR_D:
michael@0 3570 case URX_DOLLAR_MD:
michael@0 3571 // These instructions are unaltered by the relocation.
michael@0 3572 fRXPat->fCompiledPat->setElementAt(op, dst);
michael@0 3573 dst++;
michael@0 3574 break;
michael@0 3575
michael@0 3576 default:
michael@0 3577 // Some op is unaccounted for.
michael@0 3578 U_ASSERT(FALSE);
michael@0 3579 error(U_REGEX_INTERNAL_ERROR);
michael@0 3580 }
michael@0 3581 }
michael@0 3582
michael@0 3583 fRXPat->fCompiledPat->setSize(dst);
michael@0 3584 }
michael@0 3585
michael@0 3586
michael@0 3587
michael@0 3588
michael@0 3589 //------------------------------------------------------------------------------
michael@0 3590 //
michael@0 3591 // Error Report a rule parse error.
michael@0 3592 // Only report it if no previous error has been recorded.
michael@0 3593 //
michael@0 3594 //------------------------------------------------------------------------------
michael@0 3595 void RegexCompile::error(UErrorCode e) {
michael@0 3596 if (U_SUCCESS(*fStatus)) {
michael@0 3597 *fStatus = e;
michael@0 3598 // Hmm. fParseErr (UParseError) line & offset fields are int32_t in public
michael@0 3599 // API (see common/unicode/parseerr.h), while fLineNum and fCharNum are
michael@0 3600 // int64_t. If the values of the latter are out of range for the former,
michael@0 3601 // set them to the appropriate "field not supported" values.
michael@0 3602 if (fLineNum > 0x7FFFFFFF) {
michael@0 3603 fParseErr->line = 0;
michael@0 3604 fParseErr->offset = -1;
michael@0 3605 } else if (fCharNum > 0x7FFFFFFF) {
michael@0 3606 fParseErr->line = (int32_t)fLineNum;
michael@0 3607 fParseErr->offset = -1;
michael@0 3608 } else {
michael@0 3609 fParseErr->line = (int32_t)fLineNum;
michael@0 3610 fParseErr->offset = (int32_t)fCharNum;
michael@0 3611 }
michael@0 3612
michael@0 3613 UErrorCode status = U_ZERO_ERROR; // throwaway status for extracting context
michael@0 3614
michael@0 3615 // Fill in the context.
michael@0 3616 // Note: extractBetween() pins supplied indicies to the string bounds.
michael@0 3617 uprv_memset(fParseErr->preContext, 0, sizeof(fParseErr->preContext));
michael@0 3618 uprv_memset(fParseErr->postContext, 0, sizeof(fParseErr->postContext));
michael@0 3619 utext_extract(fRXPat->fPattern, fScanIndex-U_PARSE_CONTEXT_LEN+1, fScanIndex, fParseErr->preContext, U_PARSE_CONTEXT_LEN, &status);
michael@0 3620 utext_extract(fRXPat->fPattern, fScanIndex, fScanIndex+U_PARSE_CONTEXT_LEN-1, fParseErr->postContext, U_PARSE_CONTEXT_LEN, &status);
michael@0 3621 }
michael@0 3622 }
michael@0 3623
michael@0 3624
michael@0 3625 //
michael@0 3626 // Assorted Unicode character constants.
michael@0 3627 // Numeric because there is no portable way to enter them as literals.
michael@0 3628 // (Think EBCDIC).
michael@0 3629 //
michael@0 3630 static const UChar chCR = 0x0d; // New lines, for terminating comments.
michael@0 3631 static const UChar chLF = 0x0a; // Line Feed
michael@0 3632 static const UChar chPound = 0x23; // '#', introduces a comment.
michael@0 3633 static const UChar chDigit0 = 0x30; // '0'
michael@0 3634 static const UChar chDigit7 = 0x37; // '9'
michael@0 3635 static const UChar chColon = 0x3A; // ':'
michael@0 3636 static const UChar chE = 0x45; // 'E'
michael@0 3637 static const UChar chQ = 0x51; // 'Q'
michael@0 3638 //static const UChar chN = 0x4E; // 'N'
michael@0 3639 static const UChar chP = 0x50; // 'P'
michael@0 3640 static const UChar chBackSlash = 0x5c; // '\' introduces a char escape
michael@0 3641 //static const UChar chLBracket = 0x5b; // '['
michael@0 3642 static const UChar chRBracket = 0x5d; // ']'
michael@0 3643 static const UChar chUp = 0x5e; // '^'
michael@0 3644 static const UChar chLowerP = 0x70;
michael@0 3645 static const UChar chLBrace = 0x7b; // '{'
michael@0 3646 static const UChar chRBrace = 0x7d; // '}'
michael@0 3647 static const UChar chNEL = 0x85; // NEL newline variant
michael@0 3648 static const UChar chLS = 0x2028; // Unicode Line Separator
michael@0 3649
michael@0 3650
michael@0 3651 //------------------------------------------------------------------------------
michael@0 3652 //
michael@0 3653 // nextCharLL Low Level Next Char from the regex pattern.
michael@0 3654 // Get a char from the string, keep track of input position
michael@0 3655 // for error reporting.
michael@0 3656 //
michael@0 3657 //------------------------------------------------------------------------------
michael@0 3658 UChar32 RegexCompile::nextCharLL() {
michael@0 3659 UChar32 ch;
michael@0 3660
michael@0 3661 if (fPeekChar != -1) {
michael@0 3662 ch = fPeekChar;
michael@0 3663 fPeekChar = -1;
michael@0 3664 return ch;
michael@0 3665 }
michael@0 3666
michael@0 3667 // assume we're already in the right place
michael@0 3668 ch = UTEXT_NEXT32(fRXPat->fPattern);
michael@0 3669 if (ch == U_SENTINEL) {
michael@0 3670 return ch;
michael@0 3671 }
michael@0 3672
michael@0 3673 if (ch == chCR ||
michael@0 3674 ch == chNEL ||
michael@0 3675 ch == chLS ||
michael@0 3676 (ch == chLF && fLastChar != chCR)) {
michael@0 3677 // Character is starting a new line. Bump up the line number, and
michael@0 3678 // reset the column to 0.
michael@0 3679 fLineNum++;
michael@0 3680 fCharNum=0;
michael@0 3681 }
michael@0 3682 else {
michael@0 3683 // Character is not starting a new line. Except in the case of a
michael@0 3684 // LF following a CR, increment the column position.
michael@0 3685 if (ch != chLF) {
michael@0 3686 fCharNum++;
michael@0 3687 }
michael@0 3688 }
michael@0 3689 fLastChar = ch;
michael@0 3690 return ch;
michael@0 3691 }
michael@0 3692
michael@0 3693 //------------------------------------------------------------------------------
michael@0 3694 //
michael@0 3695 // peekCharLL Low Level Character Scanning, sneak a peek at the next
michael@0 3696 // character without actually getting it.
michael@0 3697 //
michael@0 3698 //------------------------------------------------------------------------------
michael@0 3699 UChar32 RegexCompile::peekCharLL() {
michael@0 3700 if (fPeekChar == -1) {
michael@0 3701 fPeekChar = nextCharLL();
michael@0 3702 }
michael@0 3703 return fPeekChar;
michael@0 3704 }
michael@0 3705
michael@0 3706
michael@0 3707 //------------------------------------------------------------------------------
michael@0 3708 //
michael@0 3709 // nextChar for pattern scanning. At this level, we handle stripping
michael@0 3710 // out comments and processing some backslash character escapes.
michael@0 3711 // The rest of the pattern grammar is handled at the next level up.
michael@0 3712 //
michael@0 3713 //------------------------------------------------------------------------------
michael@0 3714 void RegexCompile::nextChar(RegexPatternChar &c) {
michael@0 3715
michael@0 3716 fScanIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
michael@0 3717 c.fChar = nextCharLL();
michael@0 3718 c.fQuoted = FALSE;
michael@0 3719
michael@0 3720 if (fQuoteMode) {
michael@0 3721 c.fQuoted = TRUE;
michael@0 3722 if ((c.fChar==chBackSlash && peekCharLL()==chE && ((fModeFlags & UREGEX_LITERAL) == 0)) ||
michael@0 3723 c.fChar == (UChar32)-1) {
michael@0 3724 fQuoteMode = FALSE; // Exit quote mode,
michael@0 3725 nextCharLL(); // discard the E
michael@0 3726 nextChar(c); // recurse to get the real next char
michael@0 3727 }
michael@0 3728 }
michael@0 3729 else if (fInBackslashQuote) {
michael@0 3730 // The current character immediately follows a '\'
michael@0 3731 // Don't check for any further escapes, just return it as-is.
michael@0 3732 // Don't set c.fQuoted, because that would prevent the state machine from
michael@0 3733 // dispatching on the character.
michael@0 3734 fInBackslashQuote = FALSE;
michael@0 3735 }
michael@0 3736 else
michael@0 3737 {
michael@0 3738 // We are not in a \Q quoted region \E of the source.
michael@0 3739 //
michael@0 3740 if (fModeFlags & UREGEX_COMMENTS) {
michael@0 3741 //
michael@0 3742 // We are in free-spacing and comments mode.
michael@0 3743 // Scan through any white space and comments, until we
michael@0 3744 // reach a significant character or the end of inut.
michael@0 3745 for (;;) {
michael@0 3746 if (c.fChar == (UChar32)-1) {
michael@0 3747 break; // End of Input
michael@0 3748 }
michael@0 3749 if (c.fChar == chPound && fEOLComments == TRUE) {
michael@0 3750 // Start of a comment. Consume the rest of it, until EOF or a new line
michael@0 3751 for (;;) {
michael@0 3752 c.fChar = nextCharLL();
michael@0 3753 if (c.fChar == (UChar32)-1 || // EOF
michael@0 3754 c.fChar == chCR ||
michael@0 3755 c.fChar == chLF ||
michael@0 3756 c.fChar == chNEL ||
michael@0 3757 c.fChar == chLS) {
michael@0 3758 break;
michael@0 3759 }
michael@0 3760 }
michael@0 3761 }
michael@0 3762 // TODO: check what Java & Perl do with non-ASCII white spaces. Ticket 6061.
michael@0 3763 if (PatternProps::isWhiteSpace(c.fChar) == FALSE) {
michael@0 3764 break;
michael@0 3765 }
michael@0 3766 c.fChar = nextCharLL();
michael@0 3767 }
michael@0 3768 }
michael@0 3769
michael@0 3770 //
michael@0 3771 // check for backslash escaped characters.
michael@0 3772 //
michael@0 3773 if (c.fChar == chBackSlash) {
michael@0 3774 int64_t pos = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
michael@0 3775 if (RegexStaticSets::gStaticSets->fUnescapeCharSet.contains(peekCharLL())) {
michael@0 3776 //
michael@0 3777 // A '\' sequence that is handled by ICU's standard unescapeAt function.
michael@0 3778 // Includes \uxxxx, \n, \r, many others.
michael@0 3779 // Return the single equivalent character.
michael@0 3780 //
michael@0 3781 nextCharLL(); // get & discard the peeked char.
michael@0 3782 c.fQuoted = TRUE;
michael@0 3783
michael@0 3784 if (UTEXT_FULL_TEXT_IN_CHUNK(fRXPat->fPattern, fPatternLength)) {
michael@0 3785 int32_t endIndex = (int32_t)pos;
michael@0 3786 c.fChar = u_unescapeAt(uregex_ucstr_unescape_charAt, &endIndex, (int32_t)fPatternLength, (void *)fRXPat->fPattern->chunkContents);
michael@0 3787
michael@0 3788 if (endIndex == pos) {
michael@0 3789 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
michael@0 3790 }
michael@0 3791 fCharNum += endIndex - pos;
michael@0 3792 UTEXT_SETNATIVEINDEX(fRXPat->fPattern, endIndex);
michael@0 3793 } else {
michael@0 3794 int32_t offset = 0;
michael@0 3795 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(fRXPat->fPattern);
michael@0 3796
michael@0 3797 UTEXT_SETNATIVEINDEX(fRXPat->fPattern, pos);
michael@0 3798 c.fChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
michael@0 3799
michael@0 3800 if (offset == 0) {
michael@0 3801 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
michael@0 3802 } else if (context.lastOffset == offset) {
michael@0 3803 UTEXT_PREVIOUS32(fRXPat->fPattern);
michael@0 3804 } else if (context.lastOffset != offset-1) {
michael@0 3805 utext_moveIndex32(fRXPat->fPattern, offset - context.lastOffset - 1);
michael@0 3806 }
michael@0 3807 fCharNum += offset;
michael@0 3808 }
michael@0 3809 }
michael@0 3810 else if (peekCharLL() == chDigit0) {
michael@0 3811 // Octal Escape, using Java Regexp Conventions
michael@0 3812 // which are \0 followed by 1-3 octal digits.
michael@0 3813 // Different from ICU Unescape handling of Octal, which does not
michael@0 3814 // require the leading 0.
michael@0 3815 // Java also has the convention of only consuming 2 octal digits if
michael@0 3816 // the three digit number would be > 0xff
michael@0 3817 //
michael@0 3818 c.fChar = 0;
michael@0 3819 nextCharLL(); // Consume the initial 0.
michael@0 3820 int index;
michael@0 3821 for (index=0; index<3; index++) {
michael@0 3822 int32_t ch = peekCharLL();
michael@0 3823 if (ch<chDigit0 || ch>chDigit7) {
michael@0 3824 if (index==0) {
michael@0 3825 // \0 is not followed by any octal digits.
michael@0 3826 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
michael@0 3827 }
michael@0 3828 break;
michael@0 3829 }
michael@0 3830 c.fChar <<= 3;
michael@0 3831 c.fChar += ch&7;
michael@0 3832 if (c.fChar <= 255) {
michael@0 3833 nextCharLL();
michael@0 3834 } else {
michael@0 3835 // The last digit made the number too big. Forget we saw it.
michael@0 3836 c.fChar >>= 3;
michael@0 3837 }
michael@0 3838 }
michael@0 3839 c.fQuoted = TRUE;
michael@0 3840 }
michael@0 3841 else if (peekCharLL() == chQ) {
michael@0 3842 // "\Q" enter quote mode, which will continue until "\E"
michael@0 3843 fQuoteMode = TRUE;
michael@0 3844 nextCharLL(); // discard the 'Q'.
michael@0 3845 nextChar(c); // recurse to get the real next char.
michael@0 3846 }
michael@0 3847 else
michael@0 3848 {
michael@0 3849 // We are in a '\' escape that will be handled by the state table scanner.
michael@0 3850 // Just return the backslash, but remember that the following char is to
michael@0 3851 // be taken literally.
michael@0 3852 fInBackslashQuote = TRUE;
michael@0 3853 }
michael@0 3854 }
michael@0 3855 }
michael@0 3856
michael@0 3857 // re-enable # to end-of-line comments, in case they were disabled.
michael@0 3858 // They are disabled by the parser upon seeing '(?', but this lasts for
michael@0 3859 // the fetching of the next character only.
michael@0 3860 fEOLComments = TRUE;
michael@0 3861
michael@0 3862 // putc(c.fChar, stdout);
michael@0 3863 }
michael@0 3864
michael@0 3865
michael@0 3866
michael@0 3867 //------------------------------------------------------------------------------
michael@0 3868 //
michael@0 3869 // scanNamedChar
michael@0 3870 // Get a UChar32 from a \N{UNICODE CHARACTER NAME} in the pattern.
michael@0 3871 //
michael@0 3872 // The scan position will be at the 'N'. On return
michael@0 3873 // the scan position should be just after the '}'
michael@0 3874 //
michael@0 3875 // Return the UChar32
michael@0 3876 //
michael@0 3877 //------------------------------------------------------------------------------
michael@0 3878 UChar32 RegexCompile::scanNamedChar() {
michael@0 3879 if (U_FAILURE(*fStatus)) {
michael@0 3880 return 0;
michael@0 3881 }
michael@0 3882
michael@0 3883 nextChar(fC);
michael@0 3884 if (fC.fChar != chLBrace) {
michael@0 3885 error(U_REGEX_PROPERTY_SYNTAX);
michael@0 3886 return 0;
michael@0 3887 }
michael@0 3888
michael@0 3889 UnicodeString charName;
michael@0 3890 for (;;) {
michael@0 3891 nextChar(fC);
michael@0 3892 if (fC.fChar == chRBrace) {
michael@0 3893 break;
michael@0 3894 }
michael@0 3895 if (fC.fChar == -1) {
michael@0 3896 error(U_REGEX_PROPERTY_SYNTAX);
michael@0 3897 return 0;
michael@0 3898 }
michael@0 3899 charName.append(fC.fChar);
michael@0 3900 }
michael@0 3901
michael@0 3902 char name[100];
michael@0 3903 if (!uprv_isInvariantUString(charName.getBuffer(), charName.length()) ||
michael@0 3904 (uint32_t)charName.length()>=sizeof(name)) {
michael@0 3905 // All Unicode character names have only invariant characters.
michael@0 3906 // The API to get a character, given a name, accepts only char *, forcing us to convert,
michael@0 3907 // which requires this error check
michael@0 3908 error(U_REGEX_PROPERTY_SYNTAX);
michael@0 3909 return 0;
michael@0 3910 }
michael@0 3911 charName.extract(0, charName.length(), name, sizeof(name), US_INV);
michael@0 3912
michael@0 3913 UChar32 theChar = u_charFromName(U_UNICODE_CHAR_NAME, name, fStatus);
michael@0 3914 if (U_FAILURE(*fStatus)) {
michael@0 3915 error(U_REGEX_PROPERTY_SYNTAX);
michael@0 3916 }
michael@0 3917
michael@0 3918 nextChar(fC); // Continue overall regex pattern processing with char after the '}'
michael@0 3919 return theChar;
michael@0 3920 }
michael@0 3921
michael@0 3922 //------------------------------------------------------------------------------
michael@0 3923 //
michael@0 3924 // scanProp Construct a UnicodeSet from the text at the current scan
michael@0 3925 // position, which will be of the form \p{whaterver}
michael@0 3926 //
michael@0 3927 // The scan position will be at the 'p' or 'P'. On return
michael@0 3928 // the scan position should be just after the '}'
michael@0 3929 //
michael@0 3930 // Return a UnicodeSet, constructed from the \P pattern,
michael@0 3931 // or NULL if the pattern is invalid.
michael@0 3932 //
michael@0 3933 //------------------------------------------------------------------------------
michael@0 3934 UnicodeSet *RegexCompile::scanProp() {
michael@0 3935 UnicodeSet *uset = NULL;
michael@0 3936
michael@0 3937 if (U_FAILURE(*fStatus)) {
michael@0 3938 return NULL;
michael@0 3939 }
michael@0 3940 U_ASSERT(fC.fChar == chLowerP || fC.fChar == chP);
michael@0 3941 UBool negated = (fC.fChar == chP);
michael@0 3942
michael@0 3943 UnicodeString propertyName;
michael@0 3944 nextChar(fC);
michael@0 3945 if (fC.fChar != chLBrace) {
michael@0 3946 error(U_REGEX_PROPERTY_SYNTAX);
michael@0 3947 return NULL;
michael@0 3948 }
michael@0 3949 for (;;) {
michael@0 3950 nextChar(fC);
michael@0 3951 if (fC.fChar == chRBrace) {
michael@0 3952 break;
michael@0 3953 }
michael@0 3954 if (fC.fChar == -1) {
michael@0 3955 // Hit the end of the input string without finding the closing '}'
michael@0 3956 error(U_REGEX_PROPERTY_SYNTAX);
michael@0 3957 return NULL;
michael@0 3958 }
michael@0 3959 propertyName.append(fC.fChar);
michael@0 3960 }
michael@0 3961 uset = createSetForProperty(propertyName, negated);
michael@0 3962 nextChar(fC); // Move input scan to position following the closing '}'
michael@0 3963 return uset;
michael@0 3964 }
michael@0 3965
michael@0 3966 //------------------------------------------------------------------------------
michael@0 3967 //
michael@0 3968 // scanPosixProp Construct a UnicodeSet from the text at the current scan
michael@0 3969 // position, which is expected be of the form [:property expression:]
michael@0 3970 //
michael@0 3971 // The scan position will be at the opening ':'. On return
michael@0 3972 // the scan position must be on the closing ']'
michael@0 3973 //
michael@0 3974 // Return a UnicodeSet constructed from the pattern,
michael@0 3975 // or NULL if this is not a valid POSIX-style set expression.
michael@0 3976 // If not a property expression, restore the initial scan position
michael@0 3977 // (to the opening ':')
michael@0 3978 //
michael@0 3979 // Note: the opening '[:' is not sufficient to guarantee that
michael@0 3980 // this is a [:property:] expression.
michael@0 3981 // [:'+=,] is a perfectly good ordinary set expression that
michael@0 3982 // happens to include ':' as one of its characters.
michael@0 3983 //
michael@0 3984 //------------------------------------------------------------------------------
michael@0 3985 UnicodeSet *RegexCompile::scanPosixProp() {
michael@0 3986 UnicodeSet *uset = NULL;
michael@0 3987
michael@0 3988 if (U_FAILURE(*fStatus)) {
michael@0 3989 return NULL;
michael@0 3990 }
michael@0 3991
michael@0 3992 U_ASSERT(fC.fChar == chColon);
michael@0 3993
michael@0 3994 // Save the scanner state.
michael@0 3995 // TODO: move this into the scanner, with the state encapsulated in some way. Ticket 6062
michael@0 3996 int64_t savedScanIndex = fScanIndex;
michael@0 3997 int64_t savedNextIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
michael@0 3998 UBool savedQuoteMode = fQuoteMode;
michael@0 3999 UBool savedInBackslashQuote = fInBackslashQuote;
michael@0 4000 UBool savedEOLComments = fEOLComments;
michael@0 4001 int64_t savedLineNum = fLineNum;
michael@0 4002 int64_t savedCharNum = fCharNum;
michael@0 4003 UChar32 savedLastChar = fLastChar;
michael@0 4004 UChar32 savedPeekChar = fPeekChar;
michael@0 4005 RegexPatternChar savedfC = fC;
michael@0 4006
michael@0 4007 // Scan for a closing ]. A little tricky because there are some perverse
michael@0 4008 // edge cases possible. "[:abc\Qdef:] \E]" is a valid non-property expression,
michael@0 4009 // ending on the second closing ].
michael@0 4010
michael@0 4011 UnicodeString propName;
michael@0 4012 UBool negated = FALSE;
michael@0 4013
michael@0 4014 // Check for and consume the '^' in a negated POSIX property, e.g. [:^Letter:]
michael@0 4015 nextChar(fC);
michael@0 4016 if (fC.fChar == chUp) {
michael@0 4017 negated = TRUE;
michael@0 4018 nextChar(fC);
michael@0 4019 }
michael@0 4020
michael@0 4021 // Scan for the closing ":]", collecting the property name along the way.
michael@0 4022 UBool sawPropSetTerminator = FALSE;
michael@0 4023 for (;;) {
michael@0 4024 propName.append(fC.fChar);
michael@0 4025 nextChar(fC);
michael@0 4026 if (fC.fQuoted || fC.fChar == -1) {
michael@0 4027 // Escaped characters or end of input - either says this isn't a [:Property:]
michael@0 4028 break;
michael@0 4029 }
michael@0 4030 if (fC.fChar == chColon) {
michael@0 4031 nextChar(fC);
michael@0 4032 if (fC.fChar == chRBracket) {
michael@0 4033 sawPropSetTerminator = TRUE;
michael@0 4034 }
michael@0 4035 break;
michael@0 4036 }
michael@0 4037 }
michael@0 4038
michael@0 4039 if (sawPropSetTerminator) {
michael@0 4040 uset = createSetForProperty(propName, negated);
michael@0 4041 }
michael@0 4042 else
michael@0 4043 {
michael@0 4044 // No closing ":]".
michael@0 4045 // Restore the original scan position.
michael@0 4046 // The main scanner will retry the input as a normal set expression,
michael@0 4047 // not a [:Property:] expression.
michael@0 4048 fScanIndex = savedScanIndex;
michael@0 4049 fQuoteMode = savedQuoteMode;
michael@0 4050 fInBackslashQuote = savedInBackslashQuote;
michael@0 4051 fEOLComments = savedEOLComments;
michael@0 4052 fLineNum = savedLineNum;
michael@0 4053 fCharNum = savedCharNum;
michael@0 4054 fLastChar = savedLastChar;
michael@0 4055 fPeekChar = savedPeekChar;
michael@0 4056 fC = savedfC;
michael@0 4057 UTEXT_SETNATIVEINDEX(fRXPat->fPattern, savedNextIndex);
michael@0 4058 }
michael@0 4059 return uset;
michael@0 4060 }
michael@0 4061
michael@0 4062 static inline void addIdentifierIgnorable(UnicodeSet *set, UErrorCode& ec) {
michael@0 4063 set->add(0, 8).add(0x0e, 0x1b).add(0x7f, 0x9f);
michael@0 4064 addCategory(set, U_GC_CF_MASK, ec);
michael@0 4065 }
michael@0 4066
michael@0 4067 //
michael@0 4068 // Create a Unicode Set from a Unicode Property expression.
michael@0 4069 // This is common code underlying both \p{...} ane [:...:] expressions.
michael@0 4070 // Includes trying the Java "properties" that aren't supported as
michael@0 4071 // normal ICU UnicodeSet properties
michael@0 4072 //
michael@0 4073 static const UChar posSetPrefix[] = {0x5b, 0x5c, 0x70, 0x7b, 0}; // "[\p{"
michael@0 4074 static const UChar negSetPrefix[] = {0x5b, 0x5c, 0x50, 0x7b, 0}; // "[\P{"
michael@0 4075 UnicodeSet *RegexCompile::createSetForProperty(const UnicodeString &propName, UBool negated) {
michael@0 4076 UnicodeString setExpr;
michael@0 4077 UnicodeSet *set;
michael@0 4078 uint32_t usetFlags = 0;
michael@0 4079
michael@0 4080 if (U_FAILURE(*fStatus)) {
michael@0 4081 return NULL;
michael@0 4082 }
michael@0 4083
michael@0 4084 //
michael@0 4085 // First try the property as we received it
michael@0 4086 //
michael@0 4087 if (negated) {
michael@0 4088 setExpr.append(negSetPrefix, -1);
michael@0 4089 } else {
michael@0 4090 setExpr.append(posSetPrefix, -1);
michael@0 4091 }
michael@0 4092 setExpr.append(propName);
michael@0 4093 setExpr.append(chRBrace);
michael@0 4094 setExpr.append(chRBracket);
michael@0 4095 if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
michael@0 4096 usetFlags |= USET_CASE_INSENSITIVE;
michael@0 4097 }
michael@0 4098 set = new UnicodeSet(setExpr, usetFlags, NULL, *fStatus);
michael@0 4099 if (U_SUCCESS(*fStatus)) {
michael@0 4100 return set;
michael@0 4101 }
michael@0 4102 delete set;
michael@0 4103 set = NULL;
michael@0 4104
michael@0 4105 //
michael@0 4106 // The property as it was didn't work.
michael@0 4107
michael@0 4108 // Do [:word:]. It is not recognized as a property by UnicodeSet. "word" not standard POSIX
michael@0 4109 // or standard Java, but many other regular expression packages do recognize it.
michael@0 4110
michael@0 4111 if (propName.caseCompare(UNICODE_STRING_SIMPLE("word"), 0) == 0) {
michael@0 4112 *fStatus = U_ZERO_ERROR;
michael@0 4113 set = new UnicodeSet(*(fRXPat->fStaticSets[URX_ISWORD_SET]));
michael@0 4114 if (set == NULL) {
michael@0 4115 *fStatus = U_MEMORY_ALLOCATION_ERROR;
michael@0 4116 return set;
michael@0 4117 }
michael@0 4118 if (negated) {
michael@0 4119 set->complement();
michael@0 4120 }
michael@0 4121 return set;
michael@0 4122 }
michael@0 4123
michael@0 4124
michael@0 4125 // Do Java fixes -
michael@0 4126 // InGreek -> InGreek or Coptic, that being the official Unicode name for that block.
michael@0 4127 // InCombiningMarksforSymbols -> InCombiningDiacriticalMarksforSymbols.
michael@0 4128 //
michael@0 4129 // Note on Spaces: either "InCombiningMarksForSymbols" or "InCombining Marks for Symbols"
michael@0 4130 // is accepted by Java. The property part of the name is compared
michael@0 4131 // case-insenstively. The spaces must be exactly as shown, either
michael@0 4132 // all there, or all omitted, with exactly one at each position
michael@0 4133 // if they are present. From checking against JDK 1.6
michael@0 4134 //
michael@0 4135 // This code should be removed when ICU properties support the Java compatibility names
michael@0 4136 // (ICU 4.0?)
michael@0 4137 //
michael@0 4138 UnicodeString mPropName = propName;
michael@0 4139 if (mPropName.caseCompare(UNICODE_STRING_SIMPLE("InGreek"), 0) == 0) {
michael@0 4140 mPropName = UNICODE_STRING_SIMPLE("InGreek and Coptic");
michael@0 4141 }
michael@0 4142 if (mPropName.caseCompare(UNICODE_STRING_SIMPLE("InCombining Marks for Symbols"), 0) == 0 ||
michael@0 4143 mPropName.caseCompare(UNICODE_STRING_SIMPLE("InCombiningMarksforSymbols"), 0) == 0) {
michael@0 4144 mPropName = UNICODE_STRING_SIMPLE("InCombining Diacritical Marks for Symbols");
michael@0 4145 }
michael@0 4146 else if (mPropName.compare(UNICODE_STRING_SIMPLE("all")) == 0) {
michael@0 4147 mPropName = UNICODE_STRING_SIMPLE("javaValidCodePoint");
michael@0 4148 }
michael@0 4149
michael@0 4150 // See if the property looks like a Java "InBlockName", which
michael@0 4151 // we will recast as "Block=BlockName"
michael@0 4152 //
michael@0 4153 static const UChar IN[] = {0x49, 0x6E, 0}; // "In"
michael@0 4154 static const UChar BLOCK[] = {0x42, 0x6C, 0x6f, 0x63, 0x6b, 0x3d, 00}; // "Block="
michael@0 4155 if (mPropName.startsWith(IN, 2) && propName.length()>=3) {
michael@0 4156 setExpr.truncate(4); // Leaves "[\p{", or "[\P{"
michael@0 4157 setExpr.append(BLOCK, -1);
michael@0 4158 setExpr.append(UnicodeString(mPropName, 2)); // Property with the leading "In" removed.
michael@0 4159 setExpr.append(chRBrace);
michael@0 4160 setExpr.append(chRBracket);
michael@0 4161 *fStatus = U_ZERO_ERROR;
michael@0 4162 set = new UnicodeSet(setExpr, usetFlags, NULL, *fStatus);
michael@0 4163 if (U_SUCCESS(*fStatus)) {
michael@0 4164 return set;
michael@0 4165 }
michael@0 4166 delete set;
michael@0 4167 set = NULL;
michael@0 4168 }
michael@0 4169
michael@0 4170 if (propName.startsWith(UNICODE_STRING_SIMPLE("java")) ||
michael@0 4171 propName.compare(UNICODE_STRING_SIMPLE("all")) == 0)
michael@0 4172 {
michael@0 4173 UErrorCode localStatus = U_ZERO_ERROR;
michael@0 4174 //setExpr.remove();
michael@0 4175 set = new UnicodeSet();
michael@0 4176 //
michael@0 4177 // Try the various Java specific properties.
michael@0 4178 // These all begin with "java"
michael@0 4179 //
michael@0 4180 if (mPropName.compare(UNICODE_STRING_SIMPLE("javaDefined")) == 0) {
michael@0 4181 addCategory(set, U_GC_CN_MASK, localStatus);
michael@0 4182 set->complement();
michael@0 4183 }
michael@0 4184 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaDigit")) == 0) {
michael@0 4185 addCategory(set, U_GC_ND_MASK, localStatus);
michael@0 4186 }
michael@0 4187 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaIdentifierIgnorable")) == 0) {
michael@0 4188 addIdentifierIgnorable(set, localStatus);
michael@0 4189 }
michael@0 4190 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaISOControl")) == 0) {
michael@0 4191 set->add(0, 0x1F).add(0x7F, 0x9F);
michael@0 4192 }
michael@0 4193 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaJavaIdentifierPart")) == 0) {
michael@0 4194 addCategory(set, U_GC_L_MASK, localStatus);
michael@0 4195 addCategory(set, U_GC_SC_MASK, localStatus);
michael@0 4196 addCategory(set, U_GC_PC_MASK, localStatus);
michael@0 4197 addCategory(set, U_GC_ND_MASK, localStatus);
michael@0 4198 addCategory(set, U_GC_NL_MASK, localStatus);
michael@0 4199 addCategory(set, U_GC_MC_MASK, localStatus);
michael@0 4200 addCategory(set, U_GC_MN_MASK, localStatus);
michael@0 4201 addIdentifierIgnorable(set, localStatus);
michael@0 4202 }
michael@0 4203 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaJavaIdentifierStart")) == 0) {
michael@0 4204 addCategory(set, U_GC_L_MASK, localStatus);
michael@0 4205 addCategory(set, U_GC_NL_MASK, localStatus);
michael@0 4206 addCategory(set, U_GC_SC_MASK, localStatus);
michael@0 4207 addCategory(set, U_GC_PC_MASK, localStatus);
michael@0 4208 }
michael@0 4209 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLetter")) == 0) {
michael@0 4210 addCategory(set, U_GC_L_MASK, localStatus);
michael@0 4211 }
michael@0 4212 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLetterOrDigit")) == 0) {
michael@0 4213 addCategory(set, U_GC_L_MASK, localStatus);
michael@0 4214 addCategory(set, U_GC_ND_MASK, localStatus);
michael@0 4215 }
michael@0 4216 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLowerCase")) == 0) {
michael@0 4217 addCategory(set, U_GC_LL_MASK, localStatus);
michael@0 4218 }
michael@0 4219 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaMirrored")) == 0) {
michael@0 4220 set->applyIntPropertyValue(UCHAR_BIDI_MIRRORED, 1, localStatus);
michael@0 4221 }
michael@0 4222 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaSpaceChar")) == 0) {
michael@0 4223 addCategory(set, U_GC_Z_MASK, localStatus);
michael@0 4224 }
michael@0 4225 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaSupplementaryCodePoint")) == 0) {
michael@0 4226 set->add(0x10000, UnicodeSet::MAX_VALUE);
michael@0 4227 }
michael@0 4228 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaTitleCase")) == 0) {
michael@0 4229 addCategory(set, U_GC_LT_MASK, localStatus);
michael@0 4230 }
michael@0 4231 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUnicodeIdentifierStart")) == 0) {
michael@0 4232 addCategory(set, U_GC_L_MASK, localStatus);
michael@0 4233 addCategory(set, U_GC_NL_MASK, localStatus);
michael@0 4234 }
michael@0 4235 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUnicodeIdentifierPart")) == 0) {
michael@0 4236 addCategory(set, U_GC_L_MASK, localStatus);
michael@0 4237 addCategory(set, U_GC_PC_MASK, localStatus);
michael@0 4238 addCategory(set, U_GC_ND_MASK, localStatus);
michael@0 4239 addCategory(set, U_GC_NL_MASK, localStatus);
michael@0 4240 addCategory(set, U_GC_MC_MASK, localStatus);
michael@0 4241 addCategory(set, U_GC_MN_MASK, localStatus);
michael@0 4242 addIdentifierIgnorable(set, localStatus);
michael@0 4243 }
michael@0 4244 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUpperCase")) == 0) {
michael@0 4245 addCategory(set, U_GC_LU_MASK, localStatus);
michael@0 4246 }
michael@0 4247 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaValidCodePoint")) == 0) {
michael@0 4248 set->add(0, UnicodeSet::MAX_VALUE);
michael@0 4249 }
michael@0 4250 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaWhitespace")) == 0) {
michael@0 4251 addCategory(set, U_GC_Z_MASK, localStatus);
michael@0 4252 set->removeAll(UnicodeSet().add(0xa0).add(0x2007).add(0x202f));
michael@0 4253 set->add(9, 0x0d).add(0x1c, 0x1f);
michael@0 4254 }
michael@0 4255 else if (mPropName.compare(UNICODE_STRING_SIMPLE("all")) == 0) {
michael@0 4256 set->add(0, UnicodeSet::MAX_VALUE);
michael@0 4257 }
michael@0 4258
michael@0 4259 if (U_SUCCESS(localStatus) && !set->isEmpty()) {
michael@0 4260 *fStatus = U_ZERO_ERROR;
michael@0 4261 if (usetFlags & USET_CASE_INSENSITIVE) {
michael@0 4262 set->closeOver(USET_CASE_INSENSITIVE);
michael@0 4263 }
michael@0 4264 if (negated) {
michael@0 4265 set->complement();
michael@0 4266 }
michael@0 4267 return set;
michael@0 4268 }
michael@0 4269 delete set;
michael@0 4270 set = NULL;
michael@0 4271 }
michael@0 4272 error(*fStatus);
michael@0 4273 return NULL;
michael@0 4274 }
michael@0 4275
michael@0 4276
michael@0 4277
michael@0 4278 //
michael@0 4279 // SetEval Part of the evaluation of [set expressions].
michael@0 4280 // Perform any pending (stacked) operations with precedence
michael@0 4281 // equal or greater to that of the next operator encountered
michael@0 4282 // in the expression.
michael@0 4283 //
michael@0 4284 void RegexCompile::setEval(int32_t nextOp) {
michael@0 4285 UnicodeSet *rightOperand = NULL;
michael@0 4286 UnicodeSet *leftOperand = NULL;
michael@0 4287 for (;;) {
michael@0 4288 U_ASSERT(fSetOpStack.empty()==FALSE);
michael@0 4289 int32_t pendingSetOperation = fSetOpStack.peeki();
michael@0 4290 if ((pendingSetOperation&0xffff0000) < (nextOp&0xffff0000)) {
michael@0 4291 break;
michael@0 4292 }
michael@0 4293 fSetOpStack.popi();
michael@0 4294 U_ASSERT(fSetStack.empty() == FALSE);
michael@0 4295 rightOperand = (UnicodeSet *)fSetStack.peek();
michael@0 4296 switch (pendingSetOperation) {
michael@0 4297 case setNegation:
michael@0 4298 rightOperand->complement();
michael@0 4299 break;
michael@0 4300 case setCaseClose:
michael@0 4301 // TODO: need a simple close function. Ticket 6065
michael@0 4302 rightOperand->closeOver(USET_CASE_INSENSITIVE);
michael@0 4303 rightOperand->removeAllStrings();
michael@0 4304 break;
michael@0 4305 case setDifference1:
michael@0 4306 case setDifference2:
michael@0 4307 fSetStack.pop();
michael@0 4308 leftOperand = (UnicodeSet *)fSetStack.peek();
michael@0 4309 leftOperand->removeAll(*rightOperand);
michael@0 4310 delete rightOperand;
michael@0 4311 break;
michael@0 4312 case setIntersection1:
michael@0 4313 case setIntersection2:
michael@0 4314 fSetStack.pop();
michael@0 4315 leftOperand = (UnicodeSet *)fSetStack.peek();
michael@0 4316 leftOperand->retainAll(*rightOperand);
michael@0 4317 delete rightOperand;
michael@0 4318 break;
michael@0 4319 case setUnion:
michael@0 4320 fSetStack.pop();
michael@0 4321 leftOperand = (UnicodeSet *)fSetStack.peek();
michael@0 4322 leftOperand->addAll(*rightOperand);
michael@0 4323 delete rightOperand;
michael@0 4324 break;
michael@0 4325 default:
michael@0 4326 U_ASSERT(FALSE);
michael@0 4327 break;
michael@0 4328 }
michael@0 4329 }
michael@0 4330 }
michael@0 4331
michael@0 4332 void RegexCompile::setPushOp(int32_t op) {
michael@0 4333 setEval(op);
michael@0 4334 fSetOpStack.push(op, *fStatus);
michael@0 4335 fSetStack.push(new UnicodeSet(), *fStatus);
michael@0 4336 }
michael@0 4337
michael@0 4338 U_NAMESPACE_END
michael@0 4339 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
michael@0 4340

mercurial