1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/i18n/rematch.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,5690 @@ 1.4 +/* 1.5 +************************************************************************** 1.6 +* Copyright (C) 2002-2013 International Business Machines Corporation * 1.7 +* and others. All rights reserved. * 1.8 +************************************************************************** 1.9 +*/ 1.10 +// 1.11 +// file: rematch.cpp 1.12 +// 1.13 +// Contains the implementation of class RegexMatcher, 1.14 +// which is one of the main API classes for the ICU regular expression package. 1.15 +// 1.16 + 1.17 +#include "unicode/utypes.h" 1.18 +#if !UCONFIG_NO_REGULAR_EXPRESSIONS 1.19 + 1.20 +#include "unicode/regex.h" 1.21 +#include "unicode/uniset.h" 1.22 +#include "unicode/uchar.h" 1.23 +#include "unicode/ustring.h" 1.24 +#include "unicode/rbbi.h" 1.25 +#include "unicode/utf.h" 1.26 +#include "unicode/utf16.h" 1.27 +#include "uassert.h" 1.28 +#include "cmemory.h" 1.29 +#include "uvector.h" 1.30 +#include "uvectr32.h" 1.31 +#include "uvectr64.h" 1.32 +#include "regeximp.h" 1.33 +#include "regexst.h" 1.34 +#include "regextxt.h" 1.35 +#include "ucase.h" 1.36 + 1.37 +// #include <malloc.h> // Needed for heapcheck testing 1.38 + 1.39 + 1.40 +// Find progress callback 1.41 +// ---------------------- 1.42 +// Macro to inline test & call to ReportFindProgress(). Eliminates unnecessary function call. 
1.43 +// 1.44 +#define REGEXFINDPROGRESS_INTERRUPT(pos, status) \ 1.45 + (fFindProgressCallbackFn != NULL) && (ReportFindProgress(pos, status) == FALSE) 1.46 + 1.47 + 1.48 +// Smart Backtracking 1.49 +// ------------------ 1.50 +// When a failure would go back to a LOOP_C instruction, 1.51 +// strings, characters, and setrefs scan backwards for a valid start 1.52 +// character themselves, pop the stack, and save state, emulating the 1.53 +// LOOP_C's effect but assured that the next character of input is a 1.54 +// possible matching character. 1.55 +// 1.56 +// Good idea in theory; unfortunately it only helps out a few specific 1.57 +// cases and slows the engine down a little in the rest. 1.58 + 1.59 +U_NAMESPACE_BEGIN 1.60 + 1.61 +// Default limit for the size of the back track stack, to avoid system 1.62 +// failures caused by heap exhaustion. Units are in 32 bit words, not bytes. 1.63 +// This value puts ICU's limits higher than most other regexp implementations, 1.64 +// which use recursion rather than the heap, and take more storage per 1.65 +// backtrack point. 1.66 +// 1.67 +static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000; 1.68 + 1.69 +// Time limit counter constant. 1.70 +// Time limits for expression evaluation are in terms of quanta of work by 1.71 +// the engine, each of which is 10,000 state saves. 1.72 +// This constant determines the number of state saves per tick. 
1.73 +static const int32_t TIMER_INITIAL_VALUE = 10000; 1.74 + 1.75 +//----------------------------------------------------------------------------- 1.76 +// 1.77 +// Constructor and Destructor 1.78 +// 1.79 +//----------------------------------------------------------------------------- 1.80 +RegexMatcher::RegexMatcher(const RegexPattern *pat) { 1.81 + fDeferredStatus = U_ZERO_ERROR; 1.82 + init(fDeferredStatus); 1.83 + if (U_FAILURE(fDeferredStatus)) { 1.84 + return; 1.85 + } 1.86 + if (pat==NULL) { 1.87 + fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR; 1.88 + return; 1.89 + } 1.90 + fPattern = pat; 1.91 + init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus); 1.92 +} 1.93 + 1.94 + 1.95 + 1.96 +RegexMatcher::RegexMatcher(const UnicodeString ®exp, const UnicodeString &input, 1.97 + uint32_t flags, UErrorCode &status) { 1.98 + init(status); 1.99 + if (U_FAILURE(status)) { 1.100 + return; 1.101 + } 1.102 + UParseError pe; 1.103 + fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); 1.104 + fPattern = fPatternOwned; 1.105 + 1.106 + UText inputText = UTEXT_INITIALIZER; 1.107 + utext_openConstUnicodeString(&inputText, &input, &status); 1.108 + init2(&inputText, status); 1.109 + utext_close(&inputText); 1.110 + 1.111 + fInputUniStrMaybeMutable = TRUE; 1.112 +} 1.113 + 1.114 + 1.115 +RegexMatcher::RegexMatcher(UText *regexp, UText *input, 1.116 + uint32_t flags, UErrorCode &status) { 1.117 + init(status); 1.118 + if (U_FAILURE(status)) { 1.119 + return; 1.120 + } 1.121 + UParseError pe; 1.122 + fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); 1.123 + if (U_FAILURE(status)) { 1.124 + return; 1.125 + } 1.126 + 1.127 + fPattern = fPatternOwned; 1.128 + init2(input, status); 1.129 +} 1.130 + 1.131 + 1.132 +RegexMatcher::RegexMatcher(const UnicodeString ®exp, 1.133 + uint32_t flags, UErrorCode &status) { 1.134 + init(status); 1.135 + if (U_FAILURE(status)) { 1.136 + return; 1.137 + } 1.138 + UParseError pe; 1.139 + fPatternOwned = 
RegexPattern::compile(regexp, flags, pe, status); 1.140 + if (U_FAILURE(status)) { 1.141 + return; 1.142 + } 1.143 + fPattern = fPatternOwned; 1.144 + init2(RegexStaticSets::gStaticSets->fEmptyText, status); 1.145 +} 1.146 + 1.147 +RegexMatcher::RegexMatcher(UText *regexp, 1.148 + uint32_t flags, UErrorCode &status) { 1.149 + init(status); 1.150 + if (U_FAILURE(status)) { 1.151 + return; 1.152 + } 1.153 + UParseError pe; 1.154 + fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); 1.155 + if (U_FAILURE(status)) { 1.156 + return; 1.157 + } 1.158 + 1.159 + fPattern = fPatternOwned; 1.160 + init2(RegexStaticSets::gStaticSets->fEmptyText, status); 1.161 +} 1.162 + 1.163 + 1.164 + 1.165 + 1.166 +RegexMatcher::~RegexMatcher() { 1.167 + delete fStack; 1.168 + if (fData != fSmallData) { 1.169 + uprv_free(fData); 1.170 + fData = NULL; 1.171 + } 1.172 + if (fPatternOwned) { 1.173 + delete fPatternOwned; 1.174 + fPatternOwned = NULL; 1.175 + fPattern = NULL; 1.176 + } 1.177 + 1.178 + if (fInput) { 1.179 + delete fInput; 1.180 + } 1.181 + if (fInputText) { 1.182 + utext_close(fInputText); 1.183 + } 1.184 + if (fAltInputText) { 1.185 + utext_close(fAltInputText); 1.186 + } 1.187 + 1.188 + #if UCONFIG_NO_BREAK_ITERATION==0 1.189 + delete fWordBreakItr; 1.190 + #endif 1.191 +} 1.192 + 1.193 +// 1.194 +// init() common initialization for use by all constructors. 1.195 +// Initialize all fields, get the object into a consistent state. 1.196 +// This must be done even when the initial status shows an error, 1.197 +// so that the object is initialized sufficiently well for the destructor 1.198 +// to run safely. 
1.199 +// 1.200 +void RegexMatcher::init(UErrorCode &status) { 1.201 + fPattern = NULL; 1.202 + fPatternOwned = NULL; 1.203 + fFrameSize = 0; 1.204 + fRegionStart = 0; 1.205 + fRegionLimit = 0; 1.206 + fAnchorStart = 0; 1.207 + fAnchorLimit = 0; 1.208 + fLookStart = 0; 1.209 + fLookLimit = 0; 1.210 + fActiveStart = 0; 1.211 + fActiveLimit = 0; 1.212 + fTransparentBounds = FALSE; 1.213 + fAnchoringBounds = TRUE; 1.214 + fMatch = FALSE; 1.215 + fMatchStart = 0; 1.216 + fMatchEnd = 0; 1.217 + fLastMatchEnd = -1; 1.218 + fAppendPosition = 0; 1.219 + fHitEnd = FALSE; 1.220 + fRequireEnd = FALSE; 1.221 + fStack = NULL; 1.222 + fFrame = NULL; 1.223 + fTimeLimit = 0; 1.224 + fTime = 0; 1.225 + fTickCounter = 0; 1.226 + fStackLimit = DEFAULT_BACKTRACK_STACK_CAPACITY; 1.227 + fCallbackFn = NULL; 1.228 + fCallbackContext = NULL; 1.229 + fFindProgressCallbackFn = NULL; 1.230 + fFindProgressCallbackContext = NULL; 1.231 + fTraceDebug = FALSE; 1.232 + fDeferredStatus = status; 1.233 + fData = fSmallData; 1.234 + fWordBreakItr = NULL; 1.235 + 1.236 + fStack = NULL; 1.237 + fInputText = NULL; 1.238 + fAltInputText = NULL; 1.239 + fInput = NULL; 1.240 + fInputLength = 0; 1.241 + fInputUniStrMaybeMutable = FALSE; 1.242 + 1.243 + if (U_FAILURE(status)) { 1.244 + fDeferredStatus = status; 1.245 + } 1.246 +} 1.247 + 1.248 +// 1.249 +// init2() Common initialization for use by RegexMatcher constructors, part 2. 1.250 +// This handles the common setup to be done after the Pattern is available. 
1.251 +// 1.252 +void RegexMatcher::init2(UText *input, UErrorCode &status) { 1.253 + if (U_FAILURE(status)) { 1.254 + fDeferredStatus = status; 1.255 + return; 1.256 + } 1.257 + 1.258 + if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(fSmallData[0]))) { 1.259 + fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t)); 1.260 + if (fData == NULL) { 1.261 + status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; 1.262 + return; 1.263 + } 1.264 + } 1.265 + 1.266 + fStack = new UVector64(status); 1.267 + if (fStack == NULL) { 1.268 + status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; 1.269 + return; 1.270 + } 1.271 + 1.272 + reset(input); 1.273 + setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status); 1.274 + if (U_FAILURE(status)) { 1.275 + fDeferredStatus = status; 1.276 + return; 1.277 + } 1.278 +} 1.279 + 1.280 + 1.281 +static const UChar BACKSLASH = 0x5c; 1.282 +static const UChar DOLLARSIGN = 0x24; 1.283 +//-------------------------------------------------------------------------------- 1.284 +// 1.285 +// appendReplacement 1.286 +// 1.287 +//-------------------------------------------------------------------------------- 1.288 +RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest, 1.289 + const UnicodeString &replacement, 1.290 + UErrorCode &status) { 1.291 + UText replacementText = UTEXT_INITIALIZER; 1.292 + 1.293 + utext_openConstUnicodeString(&replacementText, &replacement, &status); 1.294 + if (U_SUCCESS(status)) { 1.295 + UText resultText = UTEXT_INITIALIZER; 1.296 + utext_openUnicodeString(&resultText, &dest, &status); 1.297 + 1.298 + if (U_SUCCESS(status)) { 1.299 + appendReplacement(&resultText, &replacementText, status); 1.300 + utext_close(&resultText); 1.301 + } 1.302 + utext_close(&replacementText); 1.303 + } 1.304 + 1.305 + return *this; 1.306 +} 1.307 + 1.308 +// 1.309 +// appendReplacement, UText mode 1.310 +// 1.311 +RegexMatcher &RegexMatcher::appendReplacement(UText *dest, 1.312 + UText *replacement, 
1.313 + UErrorCode &status) { 1.314 + if (U_FAILURE(status)) { 1.315 + return *this; 1.316 + } 1.317 + if (U_FAILURE(fDeferredStatus)) { 1.318 + status = fDeferredStatus; 1.319 + return *this; 1.320 + } 1.321 + if (fMatch == FALSE) { 1.322 + status = U_REGEX_INVALID_STATE; 1.323 + return *this; 1.324 + } 1.325 + 1.326 + // Copy input string from the end of previous match to start of current match 1.327 + int64_t destLen = utext_nativeLength(dest); 1.328 + if (fMatchStart > fAppendPosition) { 1.329 + if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1.330 + destLen += utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition, 1.331 + (int32_t)(fMatchStart-fAppendPosition), &status); 1.332 + } else { 1.333 + int32_t len16; 1.334 + if (UTEXT_USES_U16(fInputText)) { 1.335 + len16 = (int32_t)(fMatchStart-fAppendPosition); 1.336 + } else { 1.337 + UErrorCode lengthStatus = U_ZERO_ERROR; 1.338 + len16 = utext_extract(fInputText, fAppendPosition, fMatchStart, NULL, 0, &lengthStatus); 1.339 + } 1.340 + UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1)); 1.341 + if (inputChars == NULL) { 1.342 + status = U_MEMORY_ALLOCATION_ERROR; 1.343 + return *this; 1.344 + } 1.345 + utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars, len16+1, &status); 1.346 + destLen += utext_replace(dest, destLen, destLen, inputChars, len16, &status); 1.347 + uprv_free(inputChars); 1.348 + } 1.349 + } 1.350 + fAppendPosition = fMatchEnd; 1.351 + 1.352 + 1.353 + // scan the replacement text, looking for substitutions ($n) and \escapes. 1.354 + // TODO: optimize this loop by efficiently scanning for '$' or '\', 1.355 + // move entire ranges not containing substitutions. 1.356 + UTEXT_SETNATIVEINDEX(replacement, 0); 1.357 + UChar32 c = UTEXT_NEXT32(replacement); 1.358 + while (c != U_SENTINEL) { 1.359 + if (c == BACKSLASH) { 1.360 + // Backslash Escape. Copy the following char out without further checks. 
1.361 + // Note: Surrogate pairs don't need any special handling 1.362 + // The second half wont be a '$' or a '\', and 1.363 + // will move to the dest normally on the next 1.364 + // loop iteration. 1.365 + c = UTEXT_CURRENT32(replacement); 1.366 + if (c == U_SENTINEL) { 1.367 + break; 1.368 + } 1.369 + 1.370 + if (c==0x55/*U*/ || c==0x75/*u*/) { 1.371 + // We have a \udddd or \Udddddddd escape sequence. 1.372 + int32_t offset = 0; 1.373 + struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement); 1.374 + UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context); 1.375 + if (escapedChar != (UChar32)0xFFFFFFFF) { 1.376 + if (U_IS_BMP(escapedChar)) { 1.377 + UChar c16 = (UChar)escapedChar; 1.378 + destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status); 1.379 + } else { 1.380 + UChar surrogate[2]; 1.381 + surrogate[0] = U16_LEAD(escapedChar); 1.382 + surrogate[1] = U16_TRAIL(escapedChar); 1.383 + if (U_SUCCESS(status)) { 1.384 + destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status); 1.385 + } 1.386 + } 1.387 + // TODO: Report errors for mal-formed \u escapes? 1.388 + // As this is, the original sequence is output, which may be OK. 1.389 + if (context.lastOffset == offset) { 1.390 + (void)UTEXT_PREVIOUS32(replacement); 1.391 + } else if (context.lastOffset != offset-1) { 1.392 + utext_moveIndex32(replacement, offset - context.lastOffset - 1); 1.393 + } 1.394 + } 1.395 + } else { 1.396 + (void)UTEXT_NEXT32(replacement); 1.397 + // Plain backslash escape. Just put out the escaped character. 
1.398 + if (U_IS_BMP(c)) { 1.399 + UChar c16 = (UChar)c; 1.400 + destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status); 1.401 + } else { 1.402 + UChar surrogate[2]; 1.403 + surrogate[0] = U16_LEAD(c); 1.404 + surrogate[1] = U16_TRAIL(c); 1.405 + if (U_SUCCESS(status)) { 1.406 + destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status); 1.407 + } 1.408 + } 1.409 + } 1.410 + } else if (c != DOLLARSIGN) { 1.411 + // Normal char, not a $. Copy it out without further checks. 1.412 + if (U_IS_BMP(c)) { 1.413 + UChar c16 = (UChar)c; 1.414 + destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status); 1.415 + } else { 1.416 + UChar surrogate[2]; 1.417 + surrogate[0] = U16_LEAD(c); 1.418 + surrogate[1] = U16_TRAIL(c); 1.419 + if (U_SUCCESS(status)) { 1.420 + destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status); 1.421 + } 1.422 + } 1.423 + } else { 1.424 + // We've got a $. Pick up a capture group number if one follows. 1.425 + // Consume at most the number of digits necessary for the largest capture 1.426 + // number that is valid for this pattern. 1.427 + 1.428 + int32_t numDigits = 0; 1.429 + int32_t groupNum = 0; 1.430 + UChar32 digitC; 1.431 + for (;;) { 1.432 + digitC = UTEXT_CURRENT32(replacement); 1.433 + if (digitC == U_SENTINEL) { 1.434 + break; 1.435 + } 1.436 + if (u_isdigit(digitC) == FALSE) { 1.437 + break; 1.438 + } 1.439 + (void)UTEXT_NEXT32(replacement); 1.440 + groupNum=groupNum*10 + u_charDigitValue(digitC); 1.441 + numDigits++; 1.442 + if (numDigits >= fPattern->fMaxCaptureDigits) { 1.443 + break; 1.444 + } 1.445 + } 1.446 + 1.447 + 1.448 + if (numDigits == 0) { 1.449 + // The $ didn't introduce a group number at all. 1.450 + // Treat it as just part of the substitution text. 1.451 + UChar c16 = DOLLARSIGN; 1.452 + destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status); 1.453 + } else { 1.454 + // Finally, append the capture group data to the destination. 
1.455 + destLen += appendGroup(groupNum, dest, status); 1.456 + if (U_FAILURE(status)) { 1.457 + // Can fail if group number is out of range. 1.458 + break; 1.459 + } 1.460 + } 1.461 + } 1.462 + 1.463 + if (U_FAILURE(status)) { 1.464 + break; 1.465 + } else { 1.466 + c = UTEXT_NEXT32(replacement); 1.467 + } 1.468 + } 1.469 + 1.470 + return *this; 1.471 +} 1.472 + 1.473 + 1.474 + 1.475 +//-------------------------------------------------------------------------------- 1.476 +// 1.477 +// appendTail Intended to be used in conjunction with appendReplacement() 1.478 +// To the destination string, append everything following 1.479 +// the last match position from the input string. 1.480 +// 1.481 +// Note: Match ranges do not affect appendTail or appendReplacement 1.482 +// 1.483 +//-------------------------------------------------------------------------------- 1.484 +UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) { 1.485 + UErrorCode status = U_ZERO_ERROR; 1.486 + UText resultText = UTEXT_INITIALIZER; 1.487 + utext_openUnicodeString(&resultText, &dest, &status); 1.488 + 1.489 + if (U_SUCCESS(status)) { 1.490 + appendTail(&resultText, status); 1.491 + utext_close(&resultText); 1.492 + } 1.493 + 1.494 + return dest; 1.495 +} 1.496 + 1.497 +// 1.498 +// appendTail, UText mode 1.499 +// 1.500 +UText *RegexMatcher::appendTail(UText *dest, UErrorCode &status) { 1.501 + UBool bailOut = FALSE; 1.502 + if (U_FAILURE(status)) { 1.503 + bailOut = TRUE; 1.504 + } 1.505 + if (U_FAILURE(fDeferredStatus)) { 1.506 + status = fDeferredStatus; 1.507 + bailOut = TRUE; 1.508 + } 1.509 + 1.510 + if (bailOut) { 1.511 + // dest must not be NULL 1.512 + if (dest) { 1.513 + utext_replace(dest, utext_nativeLength(dest), utext_nativeLength(dest), NULL, 0, &status); 1.514 + return dest; 1.515 + } 1.516 + } 1.517 + 1.518 + if (fInputLength > fAppendPosition) { 1.519 + if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1.520 + int64_t destLen = utext_nativeLength(dest); 1.521 
+ utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition, 1.522 + (int32_t)(fInputLength-fAppendPosition), &status); 1.523 + } else { 1.524 + int32_t len16; 1.525 + if (UTEXT_USES_U16(fInputText)) { 1.526 + len16 = (int32_t)(fInputLength-fAppendPosition); 1.527 + } else { 1.528 + len16 = utext_extract(fInputText, fAppendPosition, fInputLength, NULL, 0, &status); 1.529 + status = U_ZERO_ERROR; // buffer overflow 1.530 + } 1.531 + 1.532 + UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16)); 1.533 + if (inputChars == NULL) { 1.534 + fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; 1.535 + } else { 1.536 + utext_extract(fInputText, fAppendPosition, fInputLength, inputChars, len16, &status); // unterminated 1.537 + int64_t destLen = utext_nativeLength(dest); 1.538 + utext_replace(dest, destLen, destLen, inputChars, len16, &status); 1.539 + uprv_free(inputChars); 1.540 + } 1.541 + } 1.542 + } 1.543 + return dest; 1.544 +} 1.545 + 1.546 + 1.547 + 1.548 +//-------------------------------------------------------------------------------- 1.549 +// 1.550 +// end 1.551 +// 1.552 +//-------------------------------------------------------------------------------- 1.553 +int32_t RegexMatcher::end(UErrorCode &err) const { 1.554 + return end(0, err); 1.555 +} 1.556 + 1.557 +int64_t RegexMatcher::end64(UErrorCode &err) const { 1.558 + return end64(0, err); 1.559 +} 1.560 + 1.561 +int64_t RegexMatcher::end64(int32_t group, UErrorCode &err) const { 1.562 + if (U_FAILURE(err)) { 1.563 + return -1; 1.564 + } 1.565 + if (fMatch == FALSE) { 1.566 + err = U_REGEX_INVALID_STATE; 1.567 + return -1; 1.568 + } 1.569 + if (group < 0 || group > fPattern->fGroupMap->size()) { 1.570 + err = U_INDEX_OUTOFBOUNDS_ERROR; 1.571 + return -1; 1.572 + } 1.573 + int64_t e = -1; 1.574 + if (group == 0) { 1.575 + e = fMatchEnd; 1.576 + } else { 1.577 + // Get the position within the stack frame of the variables for 1.578 + // this capture group. 
1.579 + int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1); 1.580 + U_ASSERT(groupOffset < fPattern->fFrameSize); 1.581 + U_ASSERT(groupOffset >= 0); 1.582 + e = fFrame->fExtra[groupOffset + 1]; 1.583 + } 1.584 + 1.585 + return e; 1.586 +} 1.587 + 1.588 +int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const { 1.589 + return (int32_t)end64(group, err); 1.590 +} 1.591 + 1.592 + 1.593 +//-------------------------------------------------------------------------------- 1.594 +// 1.595 +// find() 1.596 +// 1.597 +//-------------------------------------------------------------------------------- 1.598 +UBool RegexMatcher::find() { 1.599 + // Start at the position of the last match end. (Will be zero if the 1.600 + // matcher has been reset.) 1.601 + // 1.602 + if (U_FAILURE(fDeferredStatus)) { 1.603 + return FALSE; 1.604 + } 1.605 + 1.606 + if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1.607 + return findUsingChunk(); 1.608 + } 1.609 + 1.610 + int64_t startPos = fMatchEnd; 1.611 + if (startPos==0) { 1.612 + startPos = fActiveStart; 1.613 + } 1.614 + 1.615 + if (fMatch) { 1.616 + // Save the position of any previous successful match. 1.617 + fLastMatchEnd = fMatchEnd; 1.618 + 1.619 + if (fMatchStart == fMatchEnd) { 1.620 + // Previous match had zero length. Move start position up one position 1.621 + // to avoid sending find() into a loop on zero-length matches. 1.622 + if (startPos >= fActiveLimit) { 1.623 + fMatch = FALSE; 1.624 + fHitEnd = TRUE; 1.625 + return FALSE; 1.626 + } 1.627 + UTEXT_SETNATIVEINDEX(fInputText, startPos); 1.628 + (void)UTEXT_NEXT32(fInputText); 1.629 + startPos = UTEXT_GETNATIVEINDEX(fInputText); 1.630 + } 1.631 + } else { 1.632 + if (fLastMatchEnd >= 0) { 1.633 + // A previous find() failed to match. Don't try again. 1.634 + // (without this test, a pattern with a zero-length match 1.635 + // could match again at the end of an input string.) 
1.636 + fHitEnd = TRUE; 1.637 + return FALSE; 1.638 + } 1.639 + } 1.640 + 1.641 + 1.642 + // Compute the position in the input string beyond which a match can not begin, because 1.643 + // the minimum length match would extend past the end of the input. 1.644 + // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int. 1.645 + // Be aware of possible overflows if making changes here. 1.646 + int64_t testStartLimit; 1.647 + if (UTEXT_USES_U16(fInputText)) { 1.648 + testStartLimit = fActiveLimit - fPattern->fMinMatchLen; 1.649 + if (startPos > testStartLimit) { 1.650 + fMatch = FALSE; 1.651 + fHitEnd = TRUE; 1.652 + return FALSE; 1.653 + } 1.654 + } else { 1.655 + // For now, let the matcher discover that it can't match on its own 1.656 + // We don't know how long the match len is in native characters 1.657 + testStartLimit = fActiveLimit; 1.658 + } 1.659 + 1.660 + UChar32 c; 1.661 + U_ASSERT(startPos >= 0); 1.662 + 1.663 + switch (fPattern->fStartType) { 1.664 + case START_NO_INFO: 1.665 + // No optimization was found. 1.666 + // Try a match at each input position. 1.667 + for (;;) { 1.668 + MatchAt(startPos, FALSE, fDeferredStatus); 1.669 + if (U_FAILURE(fDeferredStatus)) { 1.670 + return FALSE; 1.671 + } 1.672 + if (fMatch) { 1.673 + return TRUE; 1.674 + } 1.675 + if (startPos >= testStartLimit) { 1.676 + fHitEnd = TRUE; 1.677 + return FALSE; 1.678 + } 1.679 + UTEXT_SETNATIVEINDEX(fInputText, startPos); 1.680 + (void)UTEXT_NEXT32(fInputText); 1.681 + startPos = UTEXT_GETNATIVEINDEX(fInputText); 1.682 + // Note that it's perfectly OK for a pattern to have a zero-length 1.683 + // match at the end of a string, so we must make sure that the loop 1.684 + // runs with startPos == testStartLimit the last time through. 
1.685 + if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) 1.686 + return FALSE; 1.687 + } 1.688 + U_ASSERT(FALSE); 1.689 + 1.690 + case START_START: 1.691 + // Matches are only possible at the start of the input string 1.692 + // (pattern begins with ^ or \A) 1.693 + if (startPos > fActiveStart) { 1.694 + fMatch = FALSE; 1.695 + return FALSE; 1.696 + } 1.697 + MatchAt(startPos, FALSE, fDeferredStatus); 1.698 + if (U_FAILURE(fDeferredStatus)) { 1.699 + return FALSE; 1.700 + } 1.701 + return fMatch; 1.702 + 1.703 + 1.704 + case START_SET: 1.705 + { 1.706 + // Match may start on any char from a pre-computed set. 1.707 + U_ASSERT(fPattern->fMinMatchLen > 0); 1.708 + int64_t pos; 1.709 + UTEXT_SETNATIVEINDEX(fInputText, startPos); 1.710 + for (;;) { 1.711 + c = UTEXT_NEXT32(fInputText); 1.712 + pos = UTEXT_GETNATIVEINDEX(fInputText); 1.713 + // c will be -1 (U_SENTINEL) at end of text, in which case we 1.714 + // skip this next block (so we don't have a negative array index) 1.715 + // and handle end of text in the following block. 1.716 + if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c)) || 1.717 + (c>=256 && fPattern->fInitialChars->contains(c)))) { 1.718 + MatchAt(startPos, FALSE, fDeferredStatus); 1.719 + if (U_FAILURE(fDeferredStatus)) { 1.720 + return FALSE; 1.721 + } 1.722 + if (fMatch) { 1.723 + return TRUE; 1.724 + } 1.725 + UTEXT_SETNATIVEINDEX(fInputText, pos); 1.726 + } 1.727 + if (startPos >= testStartLimit) { 1.728 + fMatch = FALSE; 1.729 + fHitEnd = TRUE; 1.730 + return FALSE; 1.731 + } 1.732 + startPos = pos; 1.733 + if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) 1.734 + return FALSE; 1.735 + } 1.736 + } 1.737 + U_ASSERT(FALSE); 1.738 + 1.739 + case START_STRING: 1.740 + case START_CHAR: 1.741 + { 1.742 + // Match starts on exactly one char. 
1.743 + U_ASSERT(fPattern->fMinMatchLen > 0); 1.744 + UChar32 theChar = fPattern->fInitialChar; 1.745 + int64_t pos; 1.746 + UTEXT_SETNATIVEINDEX(fInputText, startPos); 1.747 + for (;;) { 1.748 + c = UTEXT_NEXT32(fInputText); 1.749 + pos = UTEXT_GETNATIVEINDEX(fInputText); 1.750 + if (c == theChar) { 1.751 + MatchAt(startPos, FALSE, fDeferredStatus); 1.752 + if (U_FAILURE(fDeferredStatus)) { 1.753 + return FALSE; 1.754 + } 1.755 + if (fMatch) { 1.756 + return TRUE; 1.757 + } 1.758 + UTEXT_SETNATIVEINDEX(fInputText, pos); 1.759 + } 1.760 + if (startPos >= testStartLimit) { 1.761 + fMatch = FALSE; 1.762 + fHitEnd = TRUE; 1.763 + return FALSE; 1.764 + } 1.765 + startPos = pos; 1.766 + if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) 1.767 + return FALSE; 1.768 + } 1.769 + } 1.770 + U_ASSERT(FALSE); 1.771 + 1.772 + case START_LINE: 1.773 + { 1.774 + UChar32 c; 1.775 + if (startPos == fAnchorStart) { 1.776 + MatchAt(startPos, FALSE, fDeferredStatus); 1.777 + if (U_FAILURE(fDeferredStatus)) { 1.778 + return FALSE; 1.779 + } 1.780 + if (fMatch) { 1.781 + return TRUE; 1.782 + } 1.783 + UTEXT_SETNATIVEINDEX(fInputText, startPos); 1.784 + c = UTEXT_NEXT32(fInputText); 1.785 + startPos = UTEXT_GETNATIVEINDEX(fInputText); 1.786 + } else { 1.787 + UTEXT_SETNATIVEINDEX(fInputText, startPos); 1.788 + c = UTEXT_PREVIOUS32(fInputText); 1.789 + UTEXT_SETNATIVEINDEX(fInputText, startPos); 1.790 + } 1.791 + 1.792 + if (fPattern->fFlags & UREGEX_UNIX_LINES) { 1.793 + for (;;) { 1.794 + if (c == 0x0a) { 1.795 + MatchAt(startPos, FALSE, fDeferredStatus); 1.796 + if (U_FAILURE(fDeferredStatus)) { 1.797 + return FALSE; 1.798 + } 1.799 + if (fMatch) { 1.800 + return TRUE; 1.801 + } 1.802 + UTEXT_SETNATIVEINDEX(fInputText, startPos); 1.803 + } 1.804 + if (startPos >= testStartLimit) { 1.805 + fMatch = FALSE; 1.806 + fHitEnd = TRUE; 1.807 + return FALSE; 1.808 + } 1.809 + c = UTEXT_NEXT32(fInputText); 1.810 + startPos = UTEXT_GETNATIVEINDEX(fInputText); 1.811 + // Note that it's 
perfectly OK for a pattern to have a zero-length 1.812 + // match at the end of a string, so we must make sure that the loop 1.813 + // runs with startPos == testStartLimit the last time through. 1.814 + if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) 1.815 + return FALSE; 1.816 + } 1.817 + } else { 1.818 + for (;;) { 1.819 + if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible 1.820 + ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) { 1.821 + if (c == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) { 1.822 + (void)UTEXT_NEXT32(fInputText); 1.823 + startPos = UTEXT_GETNATIVEINDEX(fInputText); 1.824 + } 1.825 + MatchAt(startPos, FALSE, fDeferredStatus); 1.826 + if (U_FAILURE(fDeferredStatus)) { 1.827 + return FALSE; 1.828 + } 1.829 + if (fMatch) { 1.830 + return TRUE; 1.831 + } 1.832 + UTEXT_SETNATIVEINDEX(fInputText, startPos); 1.833 + } 1.834 + if (startPos >= testStartLimit) { 1.835 + fMatch = FALSE; 1.836 + fHitEnd = TRUE; 1.837 + return FALSE; 1.838 + } 1.839 + c = UTEXT_NEXT32(fInputText); 1.840 + startPos = UTEXT_GETNATIVEINDEX(fInputText); 1.841 + // Note that it's perfectly OK for a pattern to have a zero-length 1.842 + // match at the end of a string, so we must make sure that the loop 1.843 + // runs with startPos == testStartLimit the last time through. 1.844 + if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) 1.845 + return FALSE; 1.846 + } 1.847 + } 1.848 + } 1.849 + 1.850 + default: 1.851 + U_ASSERT(FALSE); 1.852 + } 1.853 + 1.854 + U_ASSERT(FALSE); 1.855 + return FALSE; 1.856 +} 1.857 + 1.858 + 1.859 + 1.860 +UBool RegexMatcher::find(int64_t start, UErrorCode &status) { 1.861 + if (U_FAILURE(status)) { 1.862 + return FALSE; 1.863 + } 1.864 + if (U_FAILURE(fDeferredStatus)) { 1.865 + status = fDeferredStatus; 1.866 + return FALSE; 1.867 + } 1.868 + this->reset(); // Note: Reset() is specified by Java Matcher documentation. 
1.869 + // This will reset the region to be the full input length. 1.870 + if (start < 0) { 1.871 + status = U_INDEX_OUTOFBOUNDS_ERROR; 1.872 + return FALSE; 1.873 + } 1.874 + 1.875 + int64_t nativeStart = start; 1.876 + if (nativeStart < fActiveStart || nativeStart > fActiveLimit) { 1.877 + status = U_INDEX_OUTOFBOUNDS_ERROR; 1.878 + return FALSE; 1.879 + } 1.880 + fMatchEnd = nativeStart; 1.881 + return find(); 1.882 +} 1.883 + 1.884 + 1.885 +//-------------------------------------------------------------------------------- 1.886 +// 1.887 +// findUsingChunk() -- like find(), but with the advance knowledge that the 1.888 +// entire string is available in the UText's chunk buffer. 1.889 +// 1.890 +//-------------------------------------------------------------------------------- 1.891 +UBool RegexMatcher::findUsingChunk() { 1.892 + // Start at the position of the last match end. (Will be zero if the 1.893 + // matcher has been reset. 1.894 + // 1.895 + 1.896 + int32_t startPos = (int32_t)fMatchEnd; 1.897 + if (startPos==0) { 1.898 + startPos = (int32_t)fActiveStart; 1.899 + } 1.900 + 1.901 + const UChar *inputBuf = fInputText->chunkContents; 1.902 + 1.903 + if (fMatch) { 1.904 + // Save the position of any previous successful match. 1.905 + fLastMatchEnd = fMatchEnd; 1.906 + 1.907 + if (fMatchStart == fMatchEnd) { 1.908 + // Previous match had zero length. Move start position up one position 1.909 + // to avoid sending find() into a loop on zero-length matches. 1.910 + if (startPos >= fActiveLimit) { 1.911 + fMatch = FALSE; 1.912 + fHitEnd = TRUE; 1.913 + return FALSE; 1.914 + } 1.915 + U16_FWD_1(inputBuf, startPos, fInputLength); 1.916 + } 1.917 + } else { 1.918 + if (fLastMatchEnd >= 0) { 1.919 + // A previous find() failed to match. Don't try again. 1.920 + // (without this test, a pattern with a zero-length match 1.921 + // could match again at the end of an input string.) 
1.922 + fHitEnd = TRUE; 1.923 + return FALSE; 1.924 + } 1.925 + } 1.926 + 1.927 + 1.928 + // Compute the position in the input string beyond which a match can not begin, because 1.929 + // the minimum length match would extend past the end of the input. 1.930 + // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int. 1.931 + // Be aware of possible overflows if making changes here. 1.932 + int32_t testLen = (int32_t)(fActiveLimit - fPattern->fMinMatchLen); 1.933 + if (startPos > testLen) { 1.934 + fMatch = FALSE; 1.935 + fHitEnd = TRUE; 1.936 + return FALSE; 1.937 + } 1.938 + 1.939 + UChar32 c; 1.940 + U_ASSERT(startPos >= 0); 1.941 + 1.942 + switch (fPattern->fStartType) { 1.943 + case START_NO_INFO: 1.944 + // No optimization was found. 1.945 + // Try a match at each input position. 1.946 + for (;;) { 1.947 + MatchChunkAt(startPos, FALSE, fDeferredStatus); 1.948 + if (U_FAILURE(fDeferredStatus)) { 1.949 + return FALSE; 1.950 + } 1.951 + if (fMatch) { 1.952 + return TRUE; 1.953 + } 1.954 + if (startPos >= testLen) { 1.955 + fHitEnd = TRUE; 1.956 + return FALSE; 1.957 + } 1.958 + U16_FWD_1(inputBuf, startPos, fActiveLimit); 1.959 + // Note that it's perfectly OK for a pattern to have a zero-length 1.960 + // match at the end of a string, so we must make sure that the loop 1.961 + // runs with startPos == testLen the last time through. 
1.962 + if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) 1.963 + return FALSE; 1.964 + } 1.965 + U_ASSERT(FALSE); 1.966 + 1.967 + case START_START: 1.968 + // Matches are only possible at the start of the input string 1.969 + // (pattern begins with ^ or \A) 1.970 + if (startPos > fActiveStart) { 1.971 + fMatch = FALSE; 1.972 + return FALSE; 1.973 + } 1.974 + MatchChunkAt(startPos, FALSE, fDeferredStatus); 1.975 + if (U_FAILURE(fDeferredStatus)) { 1.976 + return FALSE; 1.977 + } 1.978 + return fMatch; 1.979 + 1.980 + 1.981 + case START_SET: 1.982 + { 1.983 + // Match may start on any char from a pre-computed set. 1.984 + U_ASSERT(fPattern->fMinMatchLen > 0); 1.985 + for (;;) { 1.986 + int32_t pos = startPos; 1.987 + U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++]; 1.988 + if ((c<256 && fPattern->fInitialChars8->contains(c)) || 1.989 + (c>=256 && fPattern->fInitialChars->contains(c))) { 1.990 + MatchChunkAt(pos, FALSE, fDeferredStatus); 1.991 + if (U_FAILURE(fDeferredStatus)) { 1.992 + return FALSE; 1.993 + } 1.994 + if (fMatch) { 1.995 + return TRUE; 1.996 + } 1.997 + } 1.998 + if (pos >= testLen) { 1.999 + fMatch = FALSE; 1.1000 + fHitEnd = TRUE; 1.1001 + return FALSE; 1.1002 + } 1.1003 + if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) 1.1004 + return FALSE; 1.1005 + } 1.1006 + } 1.1007 + U_ASSERT(FALSE); 1.1008 + 1.1009 + case START_STRING: 1.1010 + case START_CHAR: 1.1011 + { 1.1012 + // Match starts on exactly one char. 
1.1013 + U_ASSERT(fPattern->fMinMatchLen > 0); 1.1014 + UChar32 theChar = fPattern->fInitialChar; 1.1015 + for (;;) { 1.1016 + int32_t pos = startPos; 1.1017 + U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++]; 1.1018 + if (c == theChar) { 1.1019 + MatchChunkAt(pos, FALSE, fDeferredStatus); 1.1020 + if (U_FAILURE(fDeferredStatus)) { 1.1021 + return FALSE; 1.1022 + } 1.1023 + if (fMatch) { 1.1024 + return TRUE; 1.1025 + } 1.1026 + } 1.1027 + if (pos >= testLen) { 1.1028 + fMatch = FALSE; 1.1029 + fHitEnd = TRUE; 1.1030 + return FALSE; 1.1031 + } 1.1032 + if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) 1.1033 + return FALSE; 1.1034 + } 1.1035 + } 1.1036 + U_ASSERT(FALSE); 1.1037 + 1.1038 + case START_LINE: 1.1039 + { 1.1040 + UChar32 c; 1.1041 + if (startPos == fAnchorStart) { 1.1042 + MatchChunkAt(startPos, FALSE, fDeferredStatus); 1.1043 + if (U_FAILURE(fDeferredStatus)) { 1.1044 + return FALSE; 1.1045 + } 1.1046 + if (fMatch) { 1.1047 + return TRUE; 1.1048 + } 1.1049 + U16_FWD_1(inputBuf, startPos, fActiveLimit); 1.1050 + } 1.1051 + 1.1052 + if (fPattern->fFlags & UREGEX_UNIX_LINES) { 1.1053 + for (;;) { 1.1054 + c = inputBuf[startPos-1]; 1.1055 + if (c == 0x0a) { 1.1056 + MatchChunkAt(startPos, FALSE, fDeferredStatus); 1.1057 + if (U_FAILURE(fDeferredStatus)) { 1.1058 + return FALSE; 1.1059 + } 1.1060 + if (fMatch) { 1.1061 + return TRUE; 1.1062 + } 1.1063 + } 1.1064 + if (startPos >= testLen) { 1.1065 + fMatch = FALSE; 1.1066 + fHitEnd = TRUE; 1.1067 + return FALSE; 1.1068 + } 1.1069 + U16_FWD_1(inputBuf, startPos, fActiveLimit); 1.1070 + // Note that it's perfectly OK for a pattern to have a zero-length 1.1071 + // match at the end of a string, so we must make sure that the loop 1.1072 + // runs with startPos == testLen the last time through. 
1.1073 + if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) 1.1074 + return FALSE; 1.1075 + } 1.1076 + } else { 1.1077 + for (;;) { 1.1078 + c = inputBuf[startPos-1]; 1.1079 + if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible 1.1080 + ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) { 1.1081 + if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) { 1.1082 + startPos++; 1.1083 + } 1.1084 + MatchChunkAt(startPos, FALSE, fDeferredStatus); 1.1085 + if (U_FAILURE(fDeferredStatus)) { 1.1086 + return FALSE; 1.1087 + } 1.1088 + if (fMatch) { 1.1089 + return TRUE; 1.1090 + } 1.1091 + } 1.1092 + if (startPos >= testLen) { 1.1093 + fMatch = FALSE; 1.1094 + fHitEnd = TRUE; 1.1095 + return FALSE; 1.1096 + } 1.1097 + U16_FWD_1(inputBuf, startPos, fActiveLimit); 1.1098 + // Note that it's perfectly OK for a pattern to have a zero-length 1.1099 + // match at the end of a string, so we must make sure that the loop 1.1100 + // runs with startPos == testLen the last time through. 
1.1101 + if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) 1.1102 + return FALSE; 1.1103 + } 1.1104 + } 1.1105 + } 1.1106 + 1.1107 + default: 1.1108 + U_ASSERT(FALSE); 1.1109 + } 1.1110 + 1.1111 + U_ASSERT(FALSE); 1.1112 + return FALSE; 1.1113 +} 1.1114 + 1.1115 + 1.1116 + 1.1117 +//-------------------------------------------------------------------------------- 1.1118 +// 1.1119 +// group() 1.1120 +// 1.1121 +//-------------------------------------------------------------------------------- 1.1122 +UnicodeString RegexMatcher::group(UErrorCode &status) const { 1.1123 + return group(0, status); 1.1124 +} 1.1125 + 1.1126 +// Return immutable shallow clone 1.1127 +UText *RegexMatcher::group(UText *dest, int64_t &group_len, UErrorCode &status) const { 1.1128 + return group(0, dest, group_len, status); 1.1129 +} 1.1130 + 1.1131 +// Return immutable shallow clone 1.1132 +UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const { 1.1133 + group_len = 0; 1.1134 + UBool bailOut = FALSE; 1.1135 + if (U_FAILURE(status)) { 1.1136 + return dest; 1.1137 + } 1.1138 + if (U_FAILURE(fDeferredStatus)) { 1.1139 + status = fDeferredStatus; 1.1140 + bailOut = TRUE; 1.1141 + } 1.1142 + if (fMatch == FALSE) { 1.1143 + status = U_REGEX_INVALID_STATE; 1.1144 + bailOut = TRUE; 1.1145 + } 1.1146 + if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) { 1.1147 + status = U_INDEX_OUTOFBOUNDS_ERROR; 1.1148 + bailOut = TRUE; 1.1149 + } 1.1150 + 1.1151 + if (bailOut) { 1.1152 + return (dest) ? 
dest : utext_openUChars(NULL, NULL, 0, &status); 1.1153 + } 1.1154 + 1.1155 + int64_t s, e; 1.1156 + if (groupNum == 0) { 1.1157 + s = fMatchStart; 1.1158 + e = fMatchEnd; 1.1159 + } else { 1.1160 + int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1); 1.1161 + U_ASSERT(groupOffset < fPattern->fFrameSize); 1.1162 + U_ASSERT(groupOffset >= 0); 1.1163 + s = fFrame->fExtra[groupOffset]; 1.1164 + e = fFrame->fExtra[groupOffset+1]; 1.1165 + } 1.1166 + 1.1167 + if (s < 0) { 1.1168 + // A capture group wasn't part of the match 1.1169 + return utext_clone(dest, fInputText, FALSE, TRUE, &status); 1.1170 + } 1.1171 + U_ASSERT(s <= e); 1.1172 + group_len = e - s; 1.1173 + 1.1174 + dest = utext_clone(dest, fInputText, FALSE, TRUE, &status); 1.1175 + if (dest) 1.1176 + UTEXT_SETNATIVEINDEX(dest, s); 1.1177 + return dest; 1.1178 +} 1.1179 + 1.1180 +UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const { 1.1181 + UnicodeString result; 1.1182 + if (U_FAILURE(status)) { 1.1183 + return result; 1.1184 + } 1.1185 + UText resultText = UTEXT_INITIALIZER; 1.1186 + utext_openUnicodeString(&resultText, &result, &status); 1.1187 + group(groupNum, &resultText, status); 1.1188 + utext_close(&resultText); 1.1189 + return result; 1.1190 +} 1.1191 + 1.1192 + 1.1193 +// Return deep (mutable) clone 1.1194 +// Technology Preview (as an API), but note that the UnicodeString API is implemented 1.1195 +// using this function. 
1.1196 +UText *RegexMatcher::group(int32_t groupNum, UText *dest, UErrorCode &status) const { 1.1197 + UBool bailOut = FALSE; 1.1198 + if (U_FAILURE(status)) { 1.1199 + return dest; 1.1200 + } 1.1201 + if (U_FAILURE(fDeferredStatus)) { 1.1202 + status = fDeferredStatus; 1.1203 + bailOut = TRUE; 1.1204 + } 1.1205 + 1.1206 + if (fMatch == FALSE) { 1.1207 + status = U_REGEX_INVALID_STATE; 1.1208 + bailOut = TRUE; 1.1209 + } 1.1210 + if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) { 1.1211 + status = U_INDEX_OUTOFBOUNDS_ERROR; 1.1212 + bailOut = TRUE; 1.1213 + } 1.1214 + 1.1215 + if (bailOut) { 1.1216 + if (dest) { 1.1217 + utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status); 1.1218 + return dest; 1.1219 + } else { 1.1220 + return utext_openUChars(NULL, NULL, 0, &status); 1.1221 + } 1.1222 + } 1.1223 + 1.1224 + int64_t s, e; 1.1225 + if (groupNum == 0) { 1.1226 + s = fMatchStart; 1.1227 + e = fMatchEnd; 1.1228 + } else { 1.1229 + int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1); 1.1230 + U_ASSERT(groupOffset < fPattern->fFrameSize); 1.1231 + U_ASSERT(groupOffset >= 0); 1.1232 + s = fFrame->fExtra[groupOffset]; 1.1233 + e = fFrame->fExtra[groupOffset+1]; 1.1234 + } 1.1235 + 1.1236 + if (s < 0) { 1.1237 + // A capture group wasn't part of the match 1.1238 + if (dest) { 1.1239 + utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status); 1.1240 + return dest; 1.1241 + } else { 1.1242 + return utext_openUChars(NULL, NULL, 0, &status); 1.1243 + } 1.1244 + } 1.1245 + U_ASSERT(s <= e); 1.1246 + 1.1247 + if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1.1248 + U_ASSERT(e <= fInputLength); 1.1249 + if (dest) { 1.1250 + utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents+s, (int32_t)(e-s), &status); 1.1251 + } else { 1.1252 + UText groupText = UTEXT_INITIALIZER; 1.1253 + utext_openUChars(&groupText, fInputText->chunkContents+s, e-s, &status); 1.1254 + dest = utext_clone(NULL, &groupText, TRUE, 
FALSE, &status); 1.1255 + utext_close(&groupText); 1.1256 + } 1.1257 + } else { 1.1258 + int32_t len16; 1.1259 + if (UTEXT_USES_U16(fInputText)) { 1.1260 + len16 = (int32_t)(e-s); 1.1261 + } else { 1.1262 + UErrorCode lengthStatus = U_ZERO_ERROR; 1.1263 + len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus); 1.1264 + } 1.1265 + UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1)); 1.1266 + if (groupChars == NULL) { 1.1267 + status = U_MEMORY_ALLOCATION_ERROR; 1.1268 + return dest; 1.1269 + } 1.1270 + utext_extract(fInputText, s, e, groupChars, len16+1, &status); 1.1271 + 1.1272 + if (dest) { 1.1273 + utext_replace(dest, 0, utext_nativeLength(dest), groupChars, len16, &status); 1.1274 + } else { 1.1275 + UText groupText = UTEXT_INITIALIZER; 1.1276 + utext_openUChars(&groupText, groupChars, len16, &status); 1.1277 + dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status); 1.1278 + utext_close(&groupText); 1.1279 + } 1.1280 + 1.1281 + uprv_free(groupChars); 1.1282 + } 1.1283 + return dest; 1.1284 +} 1.1285 + 1.1286 +//-------------------------------------------------------------------------------- 1.1287 +// 1.1288 +// appendGroup() -- currently internal only, appends a group to a UText rather 1.1289 +// than replacing its contents 1.1290 +// 1.1291 +//-------------------------------------------------------------------------------- 1.1292 + 1.1293 +int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const { 1.1294 + if (U_FAILURE(status)) { 1.1295 + return 0; 1.1296 + } 1.1297 + if (U_FAILURE(fDeferredStatus)) { 1.1298 + status = fDeferredStatus; 1.1299 + return 0; 1.1300 + } 1.1301 + int64_t destLen = utext_nativeLength(dest); 1.1302 + 1.1303 + if (fMatch == FALSE) { 1.1304 + status = U_REGEX_INVALID_STATE; 1.1305 + return utext_replace(dest, destLen, destLen, NULL, 0, &status); 1.1306 + } 1.1307 + if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) { 1.1308 + status = U_INDEX_OUTOFBOUNDS_ERROR; 
1.1309 + return utext_replace(dest, destLen, destLen, NULL, 0, &status); 1.1310 + } 1.1311 + 1.1312 + int64_t s, e; 1.1313 + if (groupNum == 0) { 1.1314 + s = fMatchStart; 1.1315 + e = fMatchEnd; 1.1316 + } else { 1.1317 + int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1); 1.1318 + U_ASSERT(groupOffset < fPattern->fFrameSize); 1.1319 + U_ASSERT(groupOffset >= 0); 1.1320 + s = fFrame->fExtra[groupOffset]; 1.1321 + e = fFrame->fExtra[groupOffset+1]; 1.1322 + } 1.1323 + 1.1324 + if (s < 0) { 1.1325 + // A capture group wasn't part of the match 1.1326 + return utext_replace(dest, destLen, destLen, NULL, 0, &status); 1.1327 + } 1.1328 + U_ASSERT(s <= e); 1.1329 + 1.1330 + int64_t deltaLen; 1.1331 + if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1.1332 + U_ASSERT(e <= fInputLength); 1.1333 + deltaLen = utext_replace(dest, destLen, destLen, fInputText->chunkContents+s, (int32_t)(e-s), &status); 1.1334 + } else { 1.1335 + int32_t len16; 1.1336 + if (UTEXT_USES_U16(fInputText)) { 1.1337 + len16 = (int32_t)(e-s); 1.1338 + } else { 1.1339 + UErrorCode lengthStatus = U_ZERO_ERROR; 1.1340 + len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus); 1.1341 + } 1.1342 + UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1)); 1.1343 + if (groupChars == NULL) { 1.1344 + status = U_MEMORY_ALLOCATION_ERROR; 1.1345 + return 0; 1.1346 + } 1.1347 + utext_extract(fInputText, s, e, groupChars, len16+1, &status); 1.1348 + 1.1349 + deltaLen = utext_replace(dest, destLen, destLen, groupChars, len16, &status); 1.1350 + uprv_free(groupChars); 1.1351 + } 1.1352 + return deltaLen; 1.1353 +} 1.1354 + 1.1355 + 1.1356 + 1.1357 +//-------------------------------------------------------------------------------- 1.1358 +// 1.1359 +// groupCount() 1.1360 +// 1.1361 +//-------------------------------------------------------------------------------- 1.1362 +int32_t RegexMatcher::groupCount() const { 1.1363 + return fPattern->fGroupMap->size(); 1.1364 +} 1.1365 
+ 1.1366 + 1.1367 + 1.1368 +//-------------------------------------------------------------------------------- 1.1369 +// 1.1370 +// hasAnchoringBounds() 1.1371 +// 1.1372 +//-------------------------------------------------------------------------------- 1.1373 +UBool RegexMatcher::hasAnchoringBounds() const { 1.1374 + return fAnchoringBounds; 1.1375 +} 1.1376 + 1.1377 + 1.1378 +//-------------------------------------------------------------------------------- 1.1379 +// 1.1380 +// hasTransparentBounds() 1.1381 +// 1.1382 +//-------------------------------------------------------------------------------- 1.1383 +UBool RegexMatcher::hasTransparentBounds() const { 1.1384 + return fTransparentBounds; 1.1385 +} 1.1386 + 1.1387 + 1.1388 + 1.1389 +//-------------------------------------------------------------------------------- 1.1390 +// 1.1391 +// hitEnd() 1.1392 +// 1.1393 +//-------------------------------------------------------------------------------- 1.1394 +UBool RegexMatcher::hitEnd() const { 1.1395 + return fHitEnd; 1.1396 +} 1.1397 + 1.1398 + 1.1399 +//-------------------------------------------------------------------------------- 1.1400 +// 1.1401 +// input() 1.1402 +// 1.1403 +//-------------------------------------------------------------------------------- 1.1404 +const UnicodeString &RegexMatcher::input() const { 1.1405 + if (!fInput) { 1.1406 + UErrorCode status = U_ZERO_ERROR; 1.1407 + int32_t len16; 1.1408 + if (UTEXT_USES_U16(fInputText)) { 1.1409 + len16 = (int32_t)fInputLength; 1.1410 + } else { 1.1411 + len16 = utext_extract(fInputText, 0, fInputLength, NULL, 0, &status); 1.1412 + status = U_ZERO_ERROR; // overflow, length status 1.1413 + } 1.1414 + UnicodeString *result = new UnicodeString(len16, 0, 0); 1.1415 + 1.1416 + UChar *inputChars = result->getBuffer(len16); 1.1417 + utext_extract(fInputText, 0, fInputLength, inputChars, len16, &status); // unterminated warning 1.1418 + result->releaseBuffer(len16); 1.1419 + 1.1420 + (*(const 
UnicodeString **)&fInput) = result; // pointer assignment, rather than operator= 1.1421 + } 1.1422 + 1.1423 + return *fInput; 1.1424 +} 1.1425 + 1.1426 +//-------------------------------------------------------------------------------- 1.1427 +// 1.1428 +// inputText() 1.1429 +// 1.1430 +//-------------------------------------------------------------------------------- 1.1431 +UText *RegexMatcher::inputText() const { 1.1432 + return fInputText; 1.1433 +} 1.1434 + 1.1435 + 1.1436 +//-------------------------------------------------------------------------------- 1.1437 +// 1.1438 +// getInput() -- like inputText(), but makes a clone or copies into another UText 1.1439 +// 1.1440 +//-------------------------------------------------------------------------------- 1.1441 +UText *RegexMatcher::getInput (UText *dest, UErrorCode &status) const { 1.1442 + UBool bailOut = FALSE; 1.1443 + if (U_FAILURE(status)) { 1.1444 + return dest; 1.1445 + } 1.1446 + if (U_FAILURE(fDeferredStatus)) { 1.1447 + status = fDeferredStatus; 1.1448 + bailOut = TRUE; 1.1449 + } 1.1450 + 1.1451 + if (bailOut) { 1.1452 + if (dest) { 1.1453 + utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status); 1.1454 + return dest; 1.1455 + } else { 1.1456 + return utext_clone(NULL, fInputText, FALSE, TRUE, &status); 1.1457 + } 1.1458 + } 1.1459 + 1.1460 + if (dest) { 1.1461 + if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1.1462 + utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents, (int32_t)fInputLength, &status); 1.1463 + } else { 1.1464 + int32_t input16Len; 1.1465 + if (UTEXT_USES_U16(fInputText)) { 1.1466 + input16Len = (int32_t)fInputLength; 1.1467 + } else { 1.1468 + UErrorCode lengthStatus = U_ZERO_ERROR; 1.1469 + input16Len = utext_extract(fInputText, 0, fInputLength, NULL, 0, &lengthStatus); // buffer overflow error 1.1470 + } 1.1471 + UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(input16Len)); 1.1472 + if (inputChars == NULL) { 1.1473 + return 
dest; 1.1474 + } 1.1475 + 1.1476 + status = U_ZERO_ERROR; 1.1477 + utext_extract(fInputText, 0, fInputLength, inputChars, input16Len, &status); // not terminated warning 1.1478 + status = U_ZERO_ERROR; 1.1479 + utext_replace(dest, 0, utext_nativeLength(dest), inputChars, input16Len, &status); 1.1480 + 1.1481 + uprv_free(inputChars); 1.1482 + } 1.1483 + return dest; 1.1484 + } else { 1.1485 + return utext_clone(NULL, fInputText, FALSE, TRUE, &status); 1.1486 + } 1.1487 +} 1.1488 + 1.1489 + 1.1490 +static UBool compat_SyncMutableUTextContents(UText *ut); 1.1491 +static UBool compat_SyncMutableUTextContents(UText *ut) { 1.1492 + UBool retVal = FALSE; 1.1493 + 1.1494 + // In the following test, we're really only interested in whether the UText should switch 1.1495 + // between heap and stack allocation. If length hasn't changed, we won't, so the chunkContents 1.1496 + // will still point to the correct data. 1.1497 + if (utext_nativeLength(ut) != ut->nativeIndexingLimit) { 1.1498 + UnicodeString *us=(UnicodeString *)ut->context; 1.1499 + 1.1500 + // Update to the latest length. 1.1501 + // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit). 1.1502 + int32_t newLength = us->length(); 1.1503 + 1.1504 + // Update the chunk description. 1.1505 + // The buffer may have switched between stack- and heap-based. 
1.1506 + ut->chunkContents = us->getBuffer(); 1.1507 + ut->chunkLength = newLength; 1.1508 + ut->chunkNativeLimit = newLength; 1.1509 + ut->nativeIndexingLimit = newLength; 1.1510 + retVal = TRUE; 1.1511 + } 1.1512 + 1.1513 + return retVal; 1.1514 +} 1.1515 + 1.1516 +//-------------------------------------------------------------------------------- 1.1517 +// 1.1518 +// lookingAt() 1.1519 +// 1.1520 +//-------------------------------------------------------------------------------- 1.1521 +UBool RegexMatcher::lookingAt(UErrorCode &status) { 1.1522 + if (U_FAILURE(status)) { 1.1523 + return FALSE; 1.1524 + } 1.1525 + if (U_FAILURE(fDeferredStatus)) { 1.1526 + status = fDeferredStatus; 1.1527 + return FALSE; 1.1528 + } 1.1529 + 1.1530 + if (fInputUniStrMaybeMutable) { 1.1531 + if (compat_SyncMutableUTextContents(fInputText)) { 1.1532 + fInputLength = utext_nativeLength(fInputText); 1.1533 + reset(); 1.1534 + } 1.1535 + } 1.1536 + else { 1.1537 + resetPreserveRegion(); 1.1538 + } 1.1539 + if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1.1540 + MatchChunkAt((int32_t)fActiveStart, FALSE, status); 1.1541 + } else { 1.1542 + MatchAt(fActiveStart, FALSE, status); 1.1543 + } 1.1544 + return fMatch; 1.1545 +} 1.1546 + 1.1547 + 1.1548 +UBool RegexMatcher::lookingAt(int64_t start, UErrorCode &status) { 1.1549 + if (U_FAILURE(status)) { 1.1550 + return FALSE; 1.1551 + } 1.1552 + if (U_FAILURE(fDeferredStatus)) { 1.1553 + status = fDeferredStatus; 1.1554 + return FALSE; 1.1555 + } 1.1556 + reset(); 1.1557 + 1.1558 + if (start < 0) { 1.1559 + status = U_INDEX_OUTOFBOUNDS_ERROR; 1.1560 + return FALSE; 1.1561 + } 1.1562 + 1.1563 + if (fInputUniStrMaybeMutable) { 1.1564 + if (compat_SyncMutableUTextContents(fInputText)) { 1.1565 + fInputLength = utext_nativeLength(fInputText); 1.1566 + reset(); 1.1567 + } 1.1568 + } 1.1569 + 1.1570 + int64_t nativeStart; 1.1571 + nativeStart = start; 1.1572 + if (nativeStart < fActiveStart || nativeStart > fActiveLimit) { 1.1573 + status 
= U_INDEX_OUTOFBOUNDS_ERROR; 1.1574 + return FALSE; 1.1575 + } 1.1576 + 1.1577 + if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1.1578 + MatchChunkAt((int32_t)nativeStart, FALSE, status); 1.1579 + } else { 1.1580 + MatchAt(nativeStart, FALSE, status); 1.1581 + } 1.1582 + return fMatch; 1.1583 +} 1.1584 + 1.1585 + 1.1586 + 1.1587 +//-------------------------------------------------------------------------------- 1.1588 +// 1.1589 +// matches() 1.1590 +// 1.1591 +//-------------------------------------------------------------------------------- 1.1592 +UBool RegexMatcher::matches(UErrorCode &status) { 1.1593 + if (U_FAILURE(status)) { 1.1594 + return FALSE; 1.1595 + } 1.1596 + if (U_FAILURE(fDeferredStatus)) { 1.1597 + status = fDeferredStatus; 1.1598 + return FALSE; 1.1599 + } 1.1600 + 1.1601 + if (fInputUniStrMaybeMutable) { 1.1602 + if (compat_SyncMutableUTextContents(fInputText)) { 1.1603 + fInputLength = utext_nativeLength(fInputText); 1.1604 + reset(); 1.1605 + } 1.1606 + } 1.1607 + else { 1.1608 + resetPreserveRegion(); 1.1609 + } 1.1610 + 1.1611 + if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1.1612 + MatchChunkAt((int32_t)fActiveStart, TRUE, status); 1.1613 + } else { 1.1614 + MatchAt(fActiveStart, TRUE, status); 1.1615 + } 1.1616 + return fMatch; 1.1617 +} 1.1618 + 1.1619 + 1.1620 +UBool RegexMatcher::matches(int64_t start, UErrorCode &status) { 1.1621 + if (U_FAILURE(status)) { 1.1622 + return FALSE; 1.1623 + } 1.1624 + if (U_FAILURE(fDeferredStatus)) { 1.1625 + status = fDeferredStatus; 1.1626 + return FALSE; 1.1627 + } 1.1628 + reset(); 1.1629 + 1.1630 + if (start < 0) { 1.1631 + status = U_INDEX_OUTOFBOUNDS_ERROR; 1.1632 + return FALSE; 1.1633 + } 1.1634 + 1.1635 + if (fInputUniStrMaybeMutable) { 1.1636 + if (compat_SyncMutableUTextContents(fInputText)) { 1.1637 + fInputLength = utext_nativeLength(fInputText); 1.1638 + reset(); 1.1639 + } 1.1640 + } 1.1641 + 1.1642 + int64_t nativeStart; 1.1643 + nativeStart = start; 1.1644 + if 
(nativeStart < fActiveStart || nativeStart > fActiveLimit) { 1.1645 + status = U_INDEX_OUTOFBOUNDS_ERROR; 1.1646 + return FALSE; 1.1647 + } 1.1648 + 1.1649 + if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1.1650 + MatchChunkAt((int32_t)nativeStart, TRUE, status); 1.1651 + } else { 1.1652 + MatchAt(nativeStart, TRUE, status); 1.1653 + } 1.1654 + return fMatch; 1.1655 +} 1.1656 + 1.1657 + 1.1658 + 1.1659 +//-------------------------------------------------------------------------------- 1.1660 +// 1.1661 +// pattern 1.1662 +// 1.1663 +//-------------------------------------------------------------------------------- 1.1664 +const RegexPattern &RegexMatcher::pattern() const { 1.1665 + return *fPattern; 1.1666 +} 1.1667 + 1.1668 + 1.1669 + 1.1670 +//-------------------------------------------------------------------------------- 1.1671 +// 1.1672 +// region 1.1673 +// 1.1674 +//-------------------------------------------------------------------------------- 1.1675 +RegexMatcher &RegexMatcher::region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status) { 1.1676 + if (U_FAILURE(status)) { 1.1677 + return *this; 1.1678 + } 1.1679 + 1.1680 + if (regionStart>regionLimit || regionStart<0 || regionLimit<0) { 1.1681 + status = U_ILLEGAL_ARGUMENT_ERROR; 1.1682 + } 1.1683 + 1.1684 + int64_t nativeStart = regionStart; 1.1685 + int64_t nativeLimit = regionLimit; 1.1686 + if (nativeStart > fInputLength || nativeLimit > fInputLength) { 1.1687 + status = U_ILLEGAL_ARGUMENT_ERROR; 1.1688 + } 1.1689 + 1.1690 + if (startIndex == -1) 1.1691 + this->reset(); 1.1692 + else 1.1693 + resetPreserveRegion(); 1.1694 + 1.1695 + fRegionStart = nativeStart; 1.1696 + fRegionLimit = nativeLimit; 1.1697 + fActiveStart = nativeStart; 1.1698 + fActiveLimit = nativeLimit; 1.1699 + 1.1700 + if (startIndex != -1) { 1.1701 + if (startIndex < fActiveStart || startIndex > fActiveLimit) { 1.1702 + status = U_INDEX_OUTOFBOUNDS_ERROR; 1.1703 + } 1.1704 + fMatchEnd = 
startIndex; 1.1705 + } 1.1706 + 1.1707 + if (!fTransparentBounds) { 1.1708 + fLookStart = nativeStart; 1.1709 + fLookLimit = nativeLimit; 1.1710 + } 1.1711 + if (fAnchoringBounds) { 1.1712 + fAnchorStart = nativeStart; 1.1713 + fAnchorLimit = nativeLimit; 1.1714 + } 1.1715 + return *this; 1.1716 +} 1.1717 + 1.1718 +RegexMatcher &RegexMatcher::region(int64_t start, int64_t limit, UErrorCode &status) { 1.1719 + return region(start, limit, -1, status); 1.1720 +} 1.1721 + 1.1722 +//-------------------------------------------------------------------------------- 1.1723 +// 1.1724 +// regionEnd 1.1725 +// 1.1726 +//-------------------------------------------------------------------------------- 1.1727 +int32_t RegexMatcher::regionEnd() const { 1.1728 + return (int32_t)fRegionLimit; 1.1729 +} 1.1730 + 1.1731 +int64_t RegexMatcher::regionEnd64() const { 1.1732 + return fRegionLimit; 1.1733 +} 1.1734 + 1.1735 +//-------------------------------------------------------------------------------- 1.1736 +// 1.1737 +// regionStart 1.1738 +// 1.1739 +//-------------------------------------------------------------------------------- 1.1740 +int32_t RegexMatcher::regionStart() const { 1.1741 + return (int32_t)fRegionStart; 1.1742 +} 1.1743 + 1.1744 +int64_t RegexMatcher::regionStart64() const { 1.1745 + return fRegionStart; 1.1746 +} 1.1747 + 1.1748 + 1.1749 +//-------------------------------------------------------------------------------- 1.1750 +// 1.1751 +// replaceAll 1.1752 +// 1.1753 +//-------------------------------------------------------------------------------- 1.1754 +UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) { 1.1755 + UText replacementText = UTEXT_INITIALIZER; 1.1756 + UText resultText = UTEXT_INITIALIZER; 1.1757 + UnicodeString resultString; 1.1758 + if (U_FAILURE(status)) { 1.1759 + return resultString; 1.1760 + } 1.1761 + 1.1762 + utext_openConstUnicodeString(&replacementText, &replacement, &status); 1.1763 + 
utext_openUnicodeString(&resultText, &resultString, &status); 1.1764 + 1.1765 + replaceAll(&replacementText, &resultText, status); 1.1766 + 1.1767 + utext_close(&resultText); 1.1768 + utext_close(&replacementText); 1.1769 + 1.1770 + return resultString; 1.1771 +} 1.1772 + 1.1773 + 1.1774 +// 1.1775 +// replaceAll, UText mode 1.1776 +// 1.1777 +UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &status) { 1.1778 + if (U_FAILURE(status)) { 1.1779 + return dest; 1.1780 + } 1.1781 + if (U_FAILURE(fDeferredStatus)) { 1.1782 + status = fDeferredStatus; 1.1783 + return dest; 1.1784 + } 1.1785 + 1.1786 + if (dest == NULL) { 1.1787 + UnicodeString emptyString; 1.1788 + UText empty = UTEXT_INITIALIZER; 1.1789 + 1.1790 + utext_openUnicodeString(&empty, &emptyString, &status); 1.1791 + dest = utext_clone(NULL, &empty, TRUE, FALSE, &status); 1.1792 + utext_close(&empty); 1.1793 + } 1.1794 + 1.1795 + if (U_SUCCESS(status)) { 1.1796 + reset(); 1.1797 + while (find()) { 1.1798 + appendReplacement(dest, replacement, status); 1.1799 + if (U_FAILURE(status)) { 1.1800 + break; 1.1801 + } 1.1802 + } 1.1803 + appendTail(dest, status); 1.1804 + } 1.1805 + 1.1806 + return dest; 1.1807 +} 1.1808 + 1.1809 + 1.1810 +//-------------------------------------------------------------------------------- 1.1811 +// 1.1812 +// replaceFirst 1.1813 +// 1.1814 +//-------------------------------------------------------------------------------- 1.1815 +UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) { 1.1816 + UText replacementText = UTEXT_INITIALIZER; 1.1817 + UText resultText = UTEXT_INITIALIZER; 1.1818 + UnicodeString resultString; 1.1819 + 1.1820 + utext_openConstUnicodeString(&replacementText, &replacement, &status); 1.1821 + utext_openUnicodeString(&resultText, &resultString, &status); 1.1822 + 1.1823 + replaceFirst(&replacementText, &resultText, status); 1.1824 + 1.1825 + utext_close(&resultText); 1.1826 + 
utext_close(&replacementText); 1.1827 + 1.1828 + return resultString; 1.1829 +} 1.1830 + 1.1831 +// 1.1832 +// replaceFirst, UText mode 1.1833 +// 1.1834 +UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &status) { 1.1835 + if (U_FAILURE(status)) { 1.1836 + return dest; 1.1837 + } 1.1838 + if (U_FAILURE(fDeferredStatus)) { 1.1839 + status = fDeferredStatus; 1.1840 + return dest; 1.1841 + } 1.1842 + 1.1843 + reset(); 1.1844 + if (!find()) { 1.1845 + return getInput(dest, status); 1.1846 + } 1.1847 + 1.1848 + if (dest == NULL) { 1.1849 + UnicodeString emptyString; 1.1850 + UText empty = UTEXT_INITIALIZER; 1.1851 + 1.1852 + utext_openUnicodeString(&empty, &emptyString, &status); 1.1853 + dest = utext_clone(NULL, &empty, TRUE, FALSE, &status); 1.1854 + utext_close(&empty); 1.1855 + } 1.1856 + 1.1857 + appendReplacement(dest, replacement, status); 1.1858 + appendTail(dest, status); 1.1859 + 1.1860 + return dest; 1.1861 +} 1.1862 + 1.1863 + 1.1864 +//-------------------------------------------------------------------------------- 1.1865 +// 1.1866 +// requireEnd 1.1867 +// 1.1868 +//-------------------------------------------------------------------------------- 1.1869 +UBool RegexMatcher::requireEnd() const { 1.1870 + return fRequireEnd; 1.1871 +} 1.1872 + 1.1873 + 1.1874 +//-------------------------------------------------------------------------------- 1.1875 +// 1.1876 +// reset 1.1877 +// 1.1878 +//-------------------------------------------------------------------------------- 1.1879 +RegexMatcher &RegexMatcher::reset() { 1.1880 + fRegionStart = 0; 1.1881 + fRegionLimit = fInputLength; 1.1882 + fActiveStart = 0; 1.1883 + fActiveLimit = fInputLength; 1.1884 + fAnchorStart = 0; 1.1885 + fAnchorLimit = fInputLength; 1.1886 + fLookStart = 0; 1.1887 + fLookLimit = fInputLength; 1.1888 + resetPreserveRegion(); 1.1889 + return *this; 1.1890 +} 1.1891 + 1.1892 + 1.1893 + 1.1894 +void RegexMatcher::resetPreserveRegion() { 1.1895 + fMatchStart = 
0; 1.1896 + fMatchEnd = 0; 1.1897 + fLastMatchEnd = -1; 1.1898 + fAppendPosition = 0; 1.1899 + fMatch = FALSE; 1.1900 + fHitEnd = FALSE; 1.1901 + fRequireEnd = FALSE; 1.1902 + fTime = 0; 1.1903 + fTickCounter = TIMER_INITIAL_VALUE; 1.1904 + //resetStack(); // more expensive than it looks... 1.1905 +} 1.1906 + 1.1907 + 1.1908 +RegexMatcher &RegexMatcher::reset(const UnicodeString &input) { 1.1909 + fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStatus); 1.1910 + if (fPattern->fNeedsAltInput) { 1.1911 + fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus); 1.1912 + } 1.1913 + fInputLength = utext_nativeLength(fInputText); 1.1914 + 1.1915 + reset(); 1.1916 + delete fInput; 1.1917 + fInput = NULL; 1.1918 + 1.1919 + // Do the following for any UnicodeString. 1.1920 + // This is for compatibility for those clients who modify the input string "live" during regex operations. 1.1921 + fInputUniStrMaybeMutable = TRUE; 1.1922 + 1.1923 + if (fWordBreakItr != NULL) { 1.1924 +#if UCONFIG_NO_BREAK_ITERATION==0 1.1925 + UErrorCode status = U_ZERO_ERROR; 1.1926 + fWordBreakItr->setText(fInputText, status); 1.1927 +#endif 1.1928 + } 1.1929 + return *this; 1.1930 +} 1.1931 + 1.1932 + 1.1933 +RegexMatcher &RegexMatcher::reset(UText *input) { 1.1934 + if (fInputText != input) { 1.1935 + fInputText = utext_clone(fInputText, input, FALSE, TRUE, &fDeferredStatus); 1.1936 + if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus); 1.1937 + fInputLength = utext_nativeLength(fInputText); 1.1938 + 1.1939 + delete fInput; 1.1940 + fInput = NULL; 1.1941 + 1.1942 + if (fWordBreakItr != NULL) { 1.1943 +#if UCONFIG_NO_BREAK_ITERATION==0 1.1944 + UErrorCode status = U_ZERO_ERROR; 1.1945 + fWordBreakItr->setText(input, status); 1.1946 +#endif 1.1947 + } 1.1948 + } 1.1949 + reset(); 1.1950 + fInputUniStrMaybeMutable = FALSE; 1.1951 + 1.1952 + return *this; 1.1953 +} 1.1954 + 1.1955 
+/*RegexMatcher &RegexMatcher::reset(const UChar *) { 1.1956 + fDeferredStatus = U_INTERNAL_PROGRAM_ERROR; 1.1957 + return *this; 1.1958 +}*/ 1.1959 + 1.1960 +RegexMatcher &RegexMatcher::reset(int64_t position, UErrorCode &status) { 1.1961 + if (U_FAILURE(status)) { 1.1962 + return *this; 1.1963 + } 1.1964 + reset(); // Reset also resets the region to be the entire string. 1.1965 + 1.1966 + if (position < 0 || position > fActiveLimit) { 1.1967 + status = U_INDEX_OUTOFBOUNDS_ERROR; 1.1968 + return *this; 1.1969 + } 1.1970 + fMatchEnd = position; 1.1971 + return *this; 1.1972 +} 1.1973 + 1.1974 + 1.1975 +//-------------------------------------------------------------------------------- 1.1976 +// 1.1977 +// refresh 1.1978 +// 1.1979 +//-------------------------------------------------------------------------------- 1.1980 +RegexMatcher &RegexMatcher::refreshInputText(UText *input, UErrorCode &status) { 1.1981 + if (U_FAILURE(status)) { 1.1982 + return *this; 1.1983 + } 1.1984 + if (input == NULL) { 1.1985 + status = U_ILLEGAL_ARGUMENT_ERROR; 1.1986 + return *this; 1.1987 + } 1.1988 + if (utext_nativeLength(fInputText) != utext_nativeLength(input)) { 1.1989 + status = U_ILLEGAL_ARGUMENT_ERROR; 1.1990 + return *this; 1.1991 + } 1.1992 + int64_t pos = utext_getNativeIndex(fInputText); 1.1993 + // Shallow read-only clone of the new UText into the existing input UText 1.1994 + fInputText = utext_clone(fInputText, input, FALSE, TRUE, &status); 1.1995 + if (U_FAILURE(status)) { 1.1996 + return *this; 1.1997 + } 1.1998 + utext_setNativeIndex(fInputText, pos); 1.1999 + 1.2000 + if (fAltInputText != NULL) { 1.2001 + pos = utext_getNativeIndex(fAltInputText); 1.2002 + fAltInputText = utext_clone(fAltInputText, input, FALSE, TRUE, &status); 1.2003 + if (U_FAILURE(status)) { 1.2004 + return *this; 1.2005 + } 1.2006 + utext_setNativeIndex(fAltInputText, pos); 1.2007 + } 1.2008 + return *this; 1.2009 +} 1.2010 + 1.2011 + 1.2012 + 1.2013 
+//-------------------------------------------------------------------------------- 1.2014 +// 1.2015 +// setTrace 1.2016 +// 1.2017 +//-------------------------------------------------------------------------------- 1.2018 +void RegexMatcher::setTrace(UBool state) { 1.2019 + fTraceDebug = state; 1.2020 +} 1.2021 + 1.2022 + 1.2023 + 1.2024 +//--------------------------------------------------------------------- 1.2025 +// 1.2026 +// split 1.2027 +// 1.2028 +//--------------------------------------------------------------------- 1.2029 +int32_t RegexMatcher::split(const UnicodeString &input, 1.2030 + UnicodeString dest[], 1.2031 + int32_t destCapacity, 1.2032 + UErrorCode &status) 1.2033 +{ 1.2034 + UText inputText = UTEXT_INITIALIZER; 1.2035 + utext_openConstUnicodeString(&inputText, &input, &status); 1.2036 + if (U_FAILURE(status)) { 1.2037 + return 0; 1.2038 + } 1.2039 + 1.2040 + UText **destText = (UText **)uprv_malloc(sizeof(UText*)*destCapacity); 1.2041 + if (destText == NULL) { 1.2042 + status = U_MEMORY_ALLOCATION_ERROR; 1.2043 + return 0; 1.2044 + } 1.2045 + int32_t i; 1.2046 + for (i = 0; i < destCapacity; i++) { 1.2047 + destText[i] = utext_openUnicodeString(NULL, &dest[i], &status); 1.2048 + } 1.2049 + 1.2050 + int32_t fieldCount = split(&inputText, destText, destCapacity, status); 1.2051 + 1.2052 + for (i = 0; i < destCapacity; i++) { 1.2053 + utext_close(destText[i]); 1.2054 + } 1.2055 + 1.2056 + uprv_free(destText); 1.2057 + utext_close(&inputText); 1.2058 + return fieldCount; 1.2059 +} 1.2060 + 1.2061 +// 1.2062 +// split, UText mode 1.2063 +// 1.2064 +int32_t RegexMatcher::split(UText *input, 1.2065 + UText *dest[], 1.2066 + int32_t destCapacity, 1.2067 + UErrorCode &status) 1.2068 +{ 1.2069 + // 1.2070 + // Check arguements for validity 1.2071 + // 1.2072 + if (U_FAILURE(status)) { 1.2073 + return 0; 1.2074 + }; 1.2075 + 1.2076 + if (destCapacity < 1) { 1.2077 + status = U_ILLEGAL_ARGUMENT_ERROR; 1.2078 + return 0; 1.2079 + } 1.2080 + 1.2081 + // 
1.2082 + // Reset for the input text 1.2083 + // 1.2084 + reset(input); 1.2085 + int64_t nextOutputStringStart = 0; 1.2086 + if (fActiveLimit == 0) { 1.2087 + return 0; 1.2088 + } 1.2089 + 1.2090 + // 1.2091 + // Loop through the input text, searching for the delimiter pattern 1.2092 + // 1.2093 + int32_t i; 1.2094 + int32_t numCaptureGroups = fPattern->fGroupMap->size(); 1.2095 + for (i=0; ; i++) { 1.2096 + if (i>=destCapacity-1) { 1.2097 + // There is one or zero output string left. 1.2098 + // Fill the last output string with whatever is left from the input, then exit the loop. 1.2099 + // ( i will be == destCapacity if we filled the output array while processing 1.2100 + // capture groups of the delimiter expression, in which case we will discard the 1.2101 + // last capture group saved in favor of the unprocessed remainder of the 1.2102 + // input string.) 1.2103 + i = destCapacity-1; 1.2104 + if (fActiveLimit > nextOutputStringStart) { 1.2105 + if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) { 1.2106 + if (dest[i]) { 1.2107 + utext_replace(dest[i], 0, utext_nativeLength(dest[i]), 1.2108 + input->chunkContents+nextOutputStringStart, 1.2109 + (int32_t)(fActiveLimit-nextOutputStringStart), &status); 1.2110 + } else { 1.2111 + UText remainingText = UTEXT_INITIALIZER; 1.2112 + utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart, 1.2113 + fActiveLimit-nextOutputStringStart, &status); 1.2114 + dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); 1.2115 + utext_close(&remainingText); 1.2116 + } 1.2117 + } else { 1.2118 + UErrorCode lengthStatus = U_ZERO_ERROR; 1.2119 + int32_t remaining16Length = 1.2120 + utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus); 1.2121 + UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1)); 1.2122 + if (remainingChars == NULL) { 1.2123 + status = U_MEMORY_ALLOCATION_ERROR; 1.2124 + break; 1.2125 + } 1.2126 + 1.2127 + utext_extract(input, 
nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status); 1.2128 + if (dest[i]) { 1.2129 + utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status); 1.2130 + } else { 1.2131 + UText remainingText = UTEXT_INITIALIZER; 1.2132 + utext_openUChars(&remainingText, remainingChars, remaining16Length, &status); 1.2133 + dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); 1.2134 + utext_close(&remainingText); 1.2135 + } 1.2136 + 1.2137 + uprv_free(remainingChars); 1.2138 + } 1.2139 + } 1.2140 + break; 1.2141 + } 1.2142 + if (find()) { 1.2143 + // We found another delimiter. Move everything from where we started looking 1.2144 + // up until the start of the delimiter into the next output string. 1.2145 + if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) { 1.2146 + if (dest[i]) { 1.2147 + utext_replace(dest[i], 0, utext_nativeLength(dest[i]), 1.2148 + input->chunkContents+nextOutputStringStart, 1.2149 + (int32_t)(fMatchStart-nextOutputStringStart), &status); 1.2150 + } else { 1.2151 + UText remainingText = UTEXT_INITIALIZER; 1.2152 + utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart, 1.2153 + fMatchStart-nextOutputStringStart, &status); 1.2154 + dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); 1.2155 + utext_close(&remainingText); 1.2156 + } 1.2157 + } else { 1.2158 + UErrorCode lengthStatus = U_ZERO_ERROR; 1.2159 + int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fMatchStart, NULL, 0, &lengthStatus); 1.2160 + UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1)); 1.2161 + if (remainingChars == NULL) { 1.2162 + status = U_MEMORY_ALLOCATION_ERROR; 1.2163 + break; 1.2164 + } 1.2165 + utext_extract(input, nextOutputStringStart, fMatchStart, remainingChars, remaining16Length+1, &status); 1.2166 + if (dest[i]) { 1.2167 + utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, 
remaining16Length, &status); 1.2168 + } else { 1.2169 + UText remainingText = UTEXT_INITIALIZER; 1.2170 + utext_openUChars(&remainingText, remainingChars, remaining16Length, &status); 1.2171 + dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); 1.2172 + utext_close(&remainingText); 1.2173 + } 1.2174 + 1.2175 + uprv_free(remainingChars); 1.2176 + } 1.2177 + nextOutputStringStart = fMatchEnd; 1.2178 + 1.2179 + // If the delimiter pattern has capturing parentheses, the captured 1.2180 + // text goes out into the next n destination strings. 1.2181 + int32_t groupNum; 1.2182 + for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) { 1.2183 + if (i >= destCapacity-2) { 1.2184 + // Never fill the last available output string with capture group text. 1.2185 + // It will filled with the last field, the remainder of the 1.2186 + // unsplit input text. 1.2187 + break; 1.2188 + } 1.2189 + i++; 1.2190 + dest[i] = group(groupNum, dest[i], status); 1.2191 + } 1.2192 + 1.2193 + if (nextOutputStringStart == fActiveLimit) { 1.2194 + // The delimiter was at the end of the string. We're done, but first 1.2195 + // we output one last empty string, for the empty field following 1.2196 + // the delimiter at the end of input. 1.2197 + if (i+1 < destCapacity) { 1.2198 + ++i; 1.2199 + if (dest[i] == NULL) { 1.2200 + dest[i] = utext_openUChars(NULL, NULL, 0, &status); 1.2201 + } else { 1.2202 + static UChar emptyString[] = {(UChar)0}; 1.2203 + utext_replace(dest[i], 0, utext_nativeLength(dest[i]), emptyString, 0, &status); 1.2204 + } 1.2205 + } 1.2206 + break; 1.2207 + 1.2208 + } 1.2209 + } 1.2210 + else 1.2211 + { 1.2212 + // We ran off the end of the input while looking for the next delimiter. 1.2213 + // All the remaining text goes into the current output string. 
1.2214 + if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) { 1.2215 + if (dest[i]) { 1.2216 + utext_replace(dest[i], 0, utext_nativeLength(dest[i]), 1.2217 + input->chunkContents+nextOutputStringStart, 1.2218 + (int32_t)(fActiveLimit-nextOutputStringStart), &status); 1.2219 + } else { 1.2220 + UText remainingText = UTEXT_INITIALIZER; 1.2221 + utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart, 1.2222 + fActiveLimit-nextOutputStringStart, &status); 1.2223 + dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); 1.2224 + utext_close(&remainingText); 1.2225 + } 1.2226 + } else { 1.2227 + UErrorCode lengthStatus = U_ZERO_ERROR; 1.2228 + int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus); 1.2229 + UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1)); 1.2230 + if (remainingChars == NULL) { 1.2231 + status = U_MEMORY_ALLOCATION_ERROR; 1.2232 + break; 1.2233 + } 1.2234 + 1.2235 + utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status); 1.2236 + if (dest[i]) { 1.2237 + utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status); 1.2238 + } else { 1.2239 + UText remainingText = UTEXT_INITIALIZER; 1.2240 + utext_openUChars(&remainingText, remainingChars, remaining16Length, &status); 1.2241 + dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); 1.2242 + utext_close(&remainingText); 1.2243 + } 1.2244 + 1.2245 + uprv_free(remainingChars); 1.2246 + } 1.2247 + break; 1.2248 + } 1.2249 + if (U_FAILURE(status)) { 1.2250 + break; 1.2251 + } 1.2252 + } // end of for loop 1.2253 + return i+1; 1.2254 +} 1.2255 + 1.2256 + 1.2257 +//-------------------------------------------------------------------------------- 1.2258 +// 1.2259 +// start 1.2260 +// 1.2261 +//-------------------------------------------------------------------------------- 1.2262 +int32_t 
RegexMatcher::start(UErrorCode &status) const { 1.2263 + return start(0, status); 1.2264 +} 1.2265 + 1.2266 +int64_t RegexMatcher::start64(UErrorCode &status) const { 1.2267 + return start64(0, status); 1.2268 +} 1.2269 + 1.2270 +//-------------------------------------------------------------------------------- 1.2271 +// 1.2272 +// start(int32_t group, UErrorCode &status) 1.2273 +// 1.2274 +//-------------------------------------------------------------------------------- 1.2275 + 1.2276 +int64_t RegexMatcher::start64(int32_t group, UErrorCode &status) const { 1.2277 + if (U_FAILURE(status)) { 1.2278 + return -1; 1.2279 + } 1.2280 + if (U_FAILURE(fDeferredStatus)) { 1.2281 + status = fDeferredStatus; 1.2282 + return -1; 1.2283 + } 1.2284 + if (fMatch == FALSE) { 1.2285 + status = U_REGEX_INVALID_STATE; 1.2286 + return -1; 1.2287 + } 1.2288 + if (group < 0 || group > fPattern->fGroupMap->size()) { 1.2289 + status = U_INDEX_OUTOFBOUNDS_ERROR; 1.2290 + return -1; 1.2291 + } 1.2292 + int64_t s; 1.2293 + if (group == 0) { 1.2294 + s = fMatchStart; 1.2295 + } else { 1.2296 + int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1); 1.2297 + U_ASSERT(groupOffset < fPattern->fFrameSize); 1.2298 + U_ASSERT(groupOffset >= 0); 1.2299 + s = fFrame->fExtra[groupOffset]; 1.2300 + } 1.2301 + 1.2302 + return s; 1.2303 +} 1.2304 + 1.2305 + 1.2306 +int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const { 1.2307 + return (int32_t)start64(group, status); 1.2308 +} 1.2309 + 1.2310 +//-------------------------------------------------------------------------------- 1.2311 +// 1.2312 +// useAnchoringBounds 1.2313 +// 1.2314 +//-------------------------------------------------------------------------------- 1.2315 +RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) { 1.2316 + fAnchoringBounds = b; 1.2317 + fAnchorStart = (fAnchoringBounds ? fRegionStart : 0); 1.2318 + fAnchorLimit = (fAnchoringBounds ? 
fRegionLimit : fInputLength); 1.2319 + return *this; 1.2320 +} 1.2321 + 1.2322 + 1.2323 +//-------------------------------------------------------------------------------- 1.2324 +// 1.2325 +// useTransparentBounds 1.2326 +// 1.2327 +//-------------------------------------------------------------------------------- 1.2328 +RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) { 1.2329 + fTransparentBounds = b; 1.2330 + fLookStart = (fTransparentBounds ? 0 : fRegionStart); 1.2331 + fLookLimit = (fTransparentBounds ? fInputLength : fRegionLimit); 1.2332 + return *this; 1.2333 +} 1.2334 + 1.2335 +//-------------------------------------------------------------------------------- 1.2336 +// 1.2337 +// setTimeLimit 1.2338 +// 1.2339 +//-------------------------------------------------------------------------------- 1.2340 +void RegexMatcher::setTimeLimit(int32_t limit, UErrorCode &status) { 1.2341 + if (U_FAILURE(status)) { 1.2342 + return; 1.2343 + } 1.2344 + if (U_FAILURE(fDeferredStatus)) { 1.2345 + status = fDeferredStatus; 1.2346 + return; 1.2347 + } 1.2348 + if (limit < 0) { 1.2349 + status = U_ILLEGAL_ARGUMENT_ERROR; 1.2350 + return; 1.2351 + } 1.2352 + fTimeLimit = limit; 1.2353 +} 1.2354 + 1.2355 + 1.2356 +//-------------------------------------------------------------------------------- 1.2357 +// 1.2358 +// getTimeLimit 1.2359 +// 1.2360 +//-------------------------------------------------------------------------------- 1.2361 +int32_t RegexMatcher::getTimeLimit() const { 1.2362 + return fTimeLimit; 1.2363 +} 1.2364 + 1.2365 + 1.2366 +//-------------------------------------------------------------------------------- 1.2367 +// 1.2368 +// setStackLimit 1.2369 +// 1.2370 +//-------------------------------------------------------------------------------- 1.2371 +void RegexMatcher::setStackLimit(int32_t limit, UErrorCode &status) { 1.2372 + if (U_FAILURE(status)) { 1.2373 + return; 1.2374 + } 1.2375 + if (U_FAILURE(fDeferredStatus)) { 1.2376 + status = 
fDeferredStatus; 1.2377 + return; 1.2378 + } 1.2379 + if (limit < 0) { 1.2380 + status = U_ILLEGAL_ARGUMENT_ERROR; 1.2381 + return; 1.2382 + } 1.2383 + 1.2384 + // Reset the matcher. This is needed here in case there is a current match 1.2385 + // whose final stack frame (containing the match results, pointed to by fFrame) 1.2386 + // would be lost by resizing to a smaller stack size. 1.2387 + reset(); 1.2388 + 1.2389 + if (limit == 0) { 1.2390 + // Unlimited stack expansion 1.2391 + fStack->setMaxCapacity(0); 1.2392 + } else { 1.2393 + // Change the units of the limit from bytes to ints, and bump the size up 1.2394 + // to be big enough to hold at least one stack frame for the pattern, 1.2395 + // if it isn't there already. 1.2396 + int32_t adjustedLimit = limit / sizeof(int32_t); 1.2397 + if (adjustedLimit < fPattern->fFrameSize) { 1.2398 + adjustedLimit = fPattern->fFrameSize; 1.2399 + } 1.2400 + fStack->setMaxCapacity(adjustedLimit); 1.2401 + } 1.2402 + fStackLimit = limit; 1.2403 +} 1.2404 + 1.2405 + 1.2406 +//-------------------------------------------------------------------------------- 1.2407 +// 1.2408 +// getStackLimit 1.2409 +// 1.2410 +//-------------------------------------------------------------------------------- 1.2411 +int32_t RegexMatcher::getStackLimit() const { 1.2412 + return fStackLimit; 1.2413 +} 1.2414 + 1.2415 + 1.2416 +//-------------------------------------------------------------------------------- 1.2417 +// 1.2418 +// setMatchCallback 1.2419 +// 1.2420 +//-------------------------------------------------------------------------------- 1.2421 +void RegexMatcher::setMatchCallback(URegexMatchCallback *callback, 1.2422 + const void *context, 1.2423 + UErrorCode &status) { 1.2424 + if (U_FAILURE(status)) { 1.2425 + return; 1.2426 + } 1.2427 + fCallbackFn = callback; 1.2428 + fCallbackContext = context; 1.2429 +} 1.2430 + 1.2431 + 1.2432 +//-------------------------------------------------------------------------------- 1.2433 +// 1.2434 
+// getMatchCallback 1.2435 +// 1.2436 +//-------------------------------------------------------------------------------- 1.2437 +void RegexMatcher::getMatchCallback(URegexMatchCallback *&callback, 1.2438 + const void *&context, 1.2439 + UErrorCode &status) { 1.2440 + if (U_FAILURE(status)) { 1.2441 + return; 1.2442 + } 1.2443 + callback = fCallbackFn; 1.2444 + context = fCallbackContext; 1.2445 +} 1.2446 + 1.2447 + 1.2448 +//-------------------------------------------------------------------------------- 1.2449 +// 1.2450 +// setMatchCallback 1.2451 +// 1.2452 +//-------------------------------------------------------------------------------- 1.2453 +void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback *callback, 1.2454 + const void *context, 1.2455 + UErrorCode &status) { 1.2456 + if (U_FAILURE(status)) { 1.2457 + return; 1.2458 + } 1.2459 + fFindProgressCallbackFn = callback; 1.2460 + fFindProgressCallbackContext = context; 1.2461 +} 1.2462 + 1.2463 + 1.2464 +//-------------------------------------------------------------------------------- 1.2465 +// 1.2466 +// getMatchCallback 1.2467 +// 1.2468 +//-------------------------------------------------------------------------------- 1.2469 +void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback *&callback, 1.2470 + const void *&context, 1.2471 + UErrorCode &status) { 1.2472 + if (U_FAILURE(status)) { 1.2473 + return; 1.2474 + } 1.2475 + callback = fFindProgressCallbackFn; 1.2476 + context = fFindProgressCallbackContext; 1.2477 +} 1.2478 + 1.2479 + 1.2480 +//================================================================================ 1.2481 +// 1.2482 +// Code following this point in this file is the internal 1.2483 +// Match Engine Implementation. 
1.2484 +// 1.2485 +//================================================================================ 1.2486 + 1.2487 + 1.2488 +//-------------------------------------------------------------------------------- 1.2489 +// 1.2490 +// resetStack 1.2491 +// Discard any previous contents of the state save stack, and initialize a 1.2492 +// new stack frame to all -1. The -1s are needed for capture group limits, 1.2493 +// where they indicate that a group has not yet matched anything. 1.2494 +//-------------------------------------------------------------------------------- 1.2495 +REStackFrame *RegexMatcher::resetStack() { 1.2496 + // Discard any previous contents of the state save stack, and initialize a 1.2497 + // new stack frame with all -1 data. The -1s are needed for capture group limits, 1.2498 + // where they indicate that a group has not yet matched anything. 1.2499 + fStack->removeAllElements(); 1.2500 + 1.2501 + REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus); 1.2502 + int32_t i; 1.2503 + for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) { 1.2504 + iFrame->fExtra[i] = -1; 1.2505 + } 1.2506 + return iFrame; 1.2507 +} 1.2508 + 1.2509 + 1.2510 + 1.2511 +//-------------------------------------------------------------------------------- 1.2512 +// 1.2513 +// isWordBoundary 1.2514 +// in perl, "xab..cd..", \b is true at positions 0,3,5,7 1.2515 +// For us, 1.2516 +// If the current char is a combining mark, 1.2517 +// \b is FALSE. 1.2518 +// Else Scan backwards to the first non-combining char. 1.2519 +// We are at a boundary if the this char and the original chars are 1.2520 +// opposite in membership in \w set 1.2521 +// 1.2522 +// parameters: pos - the current position in the input buffer 1.2523 +// 1.2524 +// TODO: double-check edge cases at region boundaries. 
1.2525 +// 1.2526 +//-------------------------------------------------------------------------------- 1.2527 +UBool RegexMatcher::isWordBoundary(int64_t pos) { 1.2528 + UBool isBoundary = FALSE; 1.2529 + UBool cIsWord = FALSE; 1.2530 + 1.2531 + if (pos >= fLookLimit) { 1.2532 + fHitEnd = TRUE; 1.2533 + } else { 1.2534 + // Determine whether char c at current position is a member of the word set of chars. 1.2535 + // If we're off the end of the string, behave as though we're not at a word char. 1.2536 + UTEXT_SETNATIVEINDEX(fInputText, pos); 1.2537 + UChar32 c = UTEXT_CURRENT32(fInputText); 1.2538 + if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) { 1.2539 + // Current char is a combining one. Not a boundary. 1.2540 + return FALSE; 1.2541 + } 1.2542 + cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c); 1.2543 + } 1.2544 + 1.2545 + // Back up until we come to a non-combining char, determine whether 1.2546 + // that char is a word char. 1.2547 + UBool prevCIsWord = FALSE; 1.2548 + for (;;) { 1.2549 + if (UTEXT_GETNATIVEINDEX(fInputText) <= fLookStart) { 1.2550 + break; 1.2551 + } 1.2552 + UChar32 prevChar = UTEXT_PREVIOUS32(fInputText); 1.2553 + if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND) 1.2554 + || u_charType(prevChar) == U_FORMAT_CHAR)) { 1.2555 + prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar); 1.2556 + break; 1.2557 + } 1.2558 + } 1.2559 + isBoundary = cIsWord ^ prevCIsWord; 1.2560 + return isBoundary; 1.2561 +} 1.2562 + 1.2563 +UBool RegexMatcher::isChunkWordBoundary(int32_t pos) { 1.2564 + UBool isBoundary = FALSE; 1.2565 + UBool cIsWord = FALSE; 1.2566 + 1.2567 + const UChar *inputBuf = fInputText->chunkContents; 1.2568 + 1.2569 + if (pos >= fLookLimit) { 1.2570 + fHitEnd = TRUE; 1.2571 + } else { 1.2572 + // Determine whether char c at current position is a member of the word set of chars. 
1.2573 + // If we're off the end of the string, behave as though we're not at a word char. 1.2574 + UChar32 c; 1.2575 + U16_GET(inputBuf, fLookStart, pos, fLookLimit, c); 1.2576 + if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) { 1.2577 + // Current char is a combining one. Not a boundary. 1.2578 + return FALSE; 1.2579 + } 1.2580 + cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c); 1.2581 + } 1.2582 + 1.2583 + // Back up until we come to a non-combining char, determine whether 1.2584 + // that char is a word char. 1.2585 + UBool prevCIsWord = FALSE; 1.2586 + for (;;) { 1.2587 + if (pos <= fLookStart) { 1.2588 + break; 1.2589 + } 1.2590 + UChar32 prevChar; 1.2591 + U16_PREV(inputBuf, fLookStart, pos, prevChar); 1.2592 + if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND) 1.2593 + || u_charType(prevChar) == U_FORMAT_CHAR)) { 1.2594 + prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar); 1.2595 + break; 1.2596 + } 1.2597 + } 1.2598 + isBoundary = cIsWord ^ prevCIsWord; 1.2599 + return isBoundary; 1.2600 +} 1.2601 + 1.2602 +//-------------------------------------------------------------------------------- 1.2603 +// 1.2604 +// isUWordBoundary 1.2605 +// 1.2606 +// Test for a word boundary using RBBI word break. 1.2607 +// 1.2608 +// parameters: pos - the current position in the input buffer 1.2609 +// 1.2610 +//-------------------------------------------------------------------------------- 1.2611 +UBool RegexMatcher::isUWordBoundary(int64_t pos) { 1.2612 + UBool returnVal = FALSE; 1.2613 +#if UCONFIG_NO_BREAK_ITERATION==0 1.2614 + 1.2615 + // If we haven't yet created a break iterator for this matcher, do it now. 
1.2616 + if (fWordBreakItr == NULL) { 1.2617 + fWordBreakItr = 1.2618 + (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), fDeferredStatus); 1.2619 + if (U_FAILURE(fDeferredStatus)) { 1.2620 + return FALSE; 1.2621 + } 1.2622 + fWordBreakItr->setText(fInputText, fDeferredStatus); 1.2623 + } 1.2624 + 1.2625 + if (pos >= fLookLimit) { 1.2626 + fHitEnd = TRUE; 1.2627 + returnVal = TRUE; // With Unicode word rules, only positions within the interior of "real" 1.2628 + // words are not boundaries. All non-word chars stand by themselves, 1.2629 + // with word boundaries on both sides. 1.2630 + } else { 1.2631 + if (!UTEXT_USES_U16(fInputText)) { 1.2632 + // !!!: Would like a better way to do this! 1.2633 + UErrorCode status = U_ZERO_ERROR; 1.2634 + pos = utext_extract(fInputText, 0, pos, NULL, 0, &status); 1.2635 + } 1.2636 + returnVal = fWordBreakItr->isBoundary((int32_t)pos); 1.2637 + } 1.2638 +#endif 1.2639 + return returnVal; 1.2640 +} 1.2641 + 1.2642 +//-------------------------------------------------------------------------------- 1.2643 +// 1.2644 +// IncrementTime This function is called once each TIMER_INITIAL_VALUE state 1.2645 +// saves. Increment the "time" counter, and call the 1.2646 +// user callback function if there is one installed. 1.2647 +// 1.2648 +// If the match operation needs to be aborted, either for a time-out 1.2649 +// or because the user callback asked for it, just set an error status. 1.2650 +// The engine will pick that up and stop in its outer loop. 
1.2651 +// 1.2652 +//-------------------------------------------------------------------------------- 1.2653 +void RegexMatcher::IncrementTime(UErrorCode &status) { 1.2654 + fTickCounter = TIMER_INITIAL_VALUE; 1.2655 + fTime++; 1.2656 + if (fCallbackFn != NULL) { 1.2657 + if ((*fCallbackFn)(fCallbackContext, fTime) == FALSE) { 1.2658 + status = U_REGEX_STOPPED_BY_CALLER; 1.2659 + return; 1.2660 + } 1.2661 + } 1.2662 + if (fTimeLimit > 0 && fTime >= fTimeLimit) { 1.2663 + status = U_REGEX_TIME_OUT; 1.2664 + } 1.2665 +} 1.2666 + 1.2667 +//-------------------------------------------------------------------------------- 1.2668 +// 1.2669 +// ReportFindProgress This function is called once for each advance in the target 1.2670 +// string from the find() function, and calls the user progress callback 1.2671 +// function if there is one installed. 1.2672 +// 1.2673 +// NOTE: 1.2674 +// 1.2675 +// If the match operation needs to be aborted because the user 1.2676 +// callback asked for it, just set an error status. 1.2677 +// The engine will pick that up and stop in its outer loop. 1.2678 +// 1.2679 +//-------------------------------------------------------------------------------- 1.2680 +UBool RegexMatcher::ReportFindProgress(int64_t matchIndex, UErrorCode &status) { 1.2681 + if (fFindProgressCallbackFn != NULL) { 1.2682 + if ((*fFindProgressCallbackFn)(fFindProgressCallbackContext, matchIndex) == FALSE) { 1.2683 + status = U_ZERO_ERROR /*U_REGEX_STOPPED_BY_CALLER*/; 1.2684 + return FALSE; 1.2685 + } 1.2686 + } 1.2687 + return TRUE; 1.2688 +} 1.2689 + 1.2690 +//-------------------------------------------------------------------------------- 1.2691 +// 1.2692 +// StateSave 1.2693 +// Make a new stack frame, initialized as a copy of the current stack frame. 1.2694 +// Set the pattern index in the original stack frame from the operand value 1.2695 +// in the opcode. 
//       Execution of the engine continues with the state in
//       the newly created stack frame
//
//       Note that reserveBlock() may grow the stack, resulting in the
//       whole thing being relocated in memory.
//
//    Parameters:
//       fp           The top frame pointer when called.  At return, a new
//                    frame will be present
//       savePatIdx   An index into the compiled pattern.  Goes into the original
//                    (not new) frame.  If execution ever back-tracks out of the
//                    new frame, this will be where we continue from in the pattern.
//       status       Set to U_REGEX_STACK_OVERFLOW when the new frame cannot be
//                    reserved.  May also be set by IncrementTime(), which is
//                    called each time the tick counter runs down.
//    Return
//                    The new frame pointer, or the unchanged fp when the
//                    new frame could not be reserved.
//
//--------------------------------------------------------------------------------
inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status) {
    // push storage for a new frame.
    int64_t *newFP = fStack->reserveBlock(fFrameSize, status);
    if (newFP == NULL) {
        // Failure on attempted stack expansion.
        //   Stack function set some other error code, change it to a more
        //   specific one for regular expressions.
        status = U_REGEX_STACK_OVERFLOW;
        // We need to return a writable stack frame, so just return the
        //    previous frame.  The match operation will stop quickly
        //    because of the error status, after which the frame will never
        //    be looked at again.
        return fp;
    }
    // Recompute the old top frame from the new block's address rather than
    // trusting the incoming fp.
    fp = (REStackFrame *)(newFP - fFrameSize);  // in case of realloc of stack.

    // New stack frame = copy of old top frame.
    //   The copy runs until the source pointer reaches the start of the new
    //   frame, i.e. over exactly the fFrameSize words of the old frame.
    int64_t *source = (int64_t *)fp;
    int64_t *dest   = newFP;
    for (;;) {
        *dest++ = *source++;
        if (source == newFP) {
            break;
        }
    }

    fTickCounter--;
    if (fTickCounter <= 0) {
       IncrementTime(status);    // Re-initializes fTickCounter
    }
    // The saved pattern index goes into the OLD frame; that is where
    // execution resumes if the engine backtracks out of the new frame.
    fp->fPatIdx = savePatIdx;
    return (REStackFrame *)newFP;
}


//--------------------------------------------------------------------------------
//
//   MatchAt      This is the actual matching engine.
//
//                  startIdx:    begin matching at this index.
//                  toEnd:       if true, match must extend to end of the input region
//
//--------------------------------------------------------------------------------
void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
    UBool       isMatch  = FALSE;      // True if we have a match.

    int64_t     backSearchIndex = U_INT64_MAX; // used after greedy single-character matches for searching backwards

    int32_t     op;                    // Operation from the compiled pattern, split into
    int32_t     opType;                //    the opcode
    int32_t     opValue;               //    and the operand value.
1.2762 + 1.2763 + #ifdef REGEX_RUN_DEBUG 1.2764 + if (fTraceDebug) 1.2765 + { 1.2766 + printf("MatchAt(startIdx=%ld)\n", startIdx); 1.2767 + printf("Original Pattern: "); 1.2768 + UChar32 c = utext_next32From(fPattern->fPattern, 0); 1.2769 + while (c != U_SENTINEL) { 1.2770 + if (c<32 || c>256) { 1.2771 + c = '.'; 1.2772 + } 1.2773 + REGEX_DUMP_DEBUG_PRINTF(("%c", c)); 1.2774 + 1.2775 + c = UTEXT_NEXT32(fPattern->fPattern); 1.2776 + } 1.2777 + printf("\n"); 1.2778 + printf("Input String: "); 1.2779 + c = utext_next32From(fInputText, 0); 1.2780 + while (c != U_SENTINEL) { 1.2781 + if (c<32 || c>256) { 1.2782 + c = '.'; 1.2783 + } 1.2784 + printf("%c", c); 1.2785 + 1.2786 + c = UTEXT_NEXT32(fInputText); 1.2787 + } 1.2788 + printf("\n"); 1.2789 + printf("\n"); 1.2790 + } 1.2791 + #endif 1.2792 + 1.2793 + if (U_FAILURE(status)) { 1.2794 + return; 1.2795 + } 1.2796 + 1.2797 + // Cache frequently referenced items from the compiled pattern 1.2798 + // 1.2799 + int64_t *pat = fPattern->fCompiledPat->getBuffer(); 1.2800 + 1.2801 + const UChar *litText = fPattern->fLiteralText.getBuffer(); 1.2802 + UVector *sets = fPattern->fSets; 1.2803 + 1.2804 + fFrameSize = fPattern->fFrameSize; 1.2805 + REStackFrame *fp = resetStack(); 1.2806 + 1.2807 + fp->fPatIdx = 0; 1.2808 + fp->fInputIdx = startIdx; 1.2809 + 1.2810 + // Zero out the pattern's static data 1.2811 + int32_t i; 1.2812 + for (i = 0; i<fPattern->fDataSize; i++) { 1.2813 + fData[i] = 0; 1.2814 + } 1.2815 + 1.2816 + // 1.2817 + // Main loop for interpreting the compiled pattern. 1.2818 + // One iteration of the loop per pattern operation performed. 
1.2819 + // 1.2820 + for (;;) { 1.2821 +#if 0 1.2822 + if (_heapchk() != _HEAPOK) { 1.2823 + fprintf(stderr, "Heap Trouble\n"); 1.2824 + } 1.2825 +#endif 1.2826 + 1.2827 + op = (int32_t)pat[fp->fPatIdx]; 1.2828 + opType = URX_TYPE(op); 1.2829 + opValue = URX_VAL(op); 1.2830 + #ifdef REGEX_RUN_DEBUG 1.2831 + if (fTraceDebug) { 1.2832 + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 1.2833 + printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp->fInputIdx, 1.2834 + UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit); 1.2835 + fPattern->dumpOp(fp->fPatIdx); 1.2836 + } 1.2837 + #endif 1.2838 + fp->fPatIdx++; 1.2839 + 1.2840 + switch (opType) { 1.2841 + 1.2842 + 1.2843 + case URX_NOP: 1.2844 + break; 1.2845 + 1.2846 + 1.2847 + case URX_BACKTRACK: 1.2848 + // Force a backtrack. In some circumstances, the pattern compiler 1.2849 + // will notice that the pattern can't possibly match anything, and will 1.2850 + // emit one of these at that point. 1.2851 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.2852 + break; 1.2853 + 1.2854 + 1.2855 + case URX_ONECHAR: 1.2856 + if (fp->fInputIdx < fActiveLimit) { 1.2857 + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 1.2858 + UChar32 c = UTEXT_NEXT32(fInputText); 1.2859 + if (c == opValue) { 1.2860 + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 1.2861 + break; 1.2862 + } 1.2863 + } else { 1.2864 + fHitEnd = TRUE; 1.2865 + } 1.2866 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.2867 + break; 1.2868 + 1.2869 + 1.2870 + case URX_STRING: 1.2871 + { 1.2872 + // Test input against a literal string. 1.2873 + // Strings require two slots in the compiled pattern, one for the 1.2874 + // offset to the string text, and one for the length. 
1.2875 + 1.2876 + int32_t stringStartIdx = opValue; 1.2877 + op = (int32_t)pat[fp->fPatIdx]; // Fetch the second operand 1.2878 + fp->fPatIdx++; 1.2879 + opType = URX_TYPE(op); 1.2880 + int32_t stringLen = URX_VAL(op); 1.2881 + U_ASSERT(opType == URX_STRING_LEN); 1.2882 + U_ASSERT(stringLen >= 2); 1.2883 + 1.2884 + const UChar *patternString = litText+stringStartIdx; 1.2885 + int32_t patternStringIndex = 0; 1.2886 + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 1.2887 + UChar32 inputChar; 1.2888 + UChar32 patternChar; 1.2889 + UBool success = TRUE; 1.2890 + while (patternStringIndex < stringLen) { 1.2891 + if (UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) { 1.2892 + success = FALSE; 1.2893 + fHitEnd = TRUE; 1.2894 + break; 1.2895 + } 1.2896 + inputChar = UTEXT_NEXT32(fInputText); 1.2897 + U16_NEXT(patternString, patternStringIndex, stringLen, patternChar); 1.2898 + if (patternChar != inputChar) { 1.2899 + success = FALSE; 1.2900 + break; 1.2901 + } 1.2902 + } 1.2903 + 1.2904 + if (success) { 1.2905 + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 1.2906 + } else { 1.2907 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.2908 + } 1.2909 + } 1.2910 + break; 1.2911 + 1.2912 + 1.2913 + case URX_STATE_SAVE: 1.2914 + fp = StateSave(fp, opValue, status); 1.2915 + break; 1.2916 + 1.2917 + 1.2918 + case URX_END: 1.2919 + // The match loop will exit via this path on a successful match, 1.2920 + // when we reach the end of the pattern. 1.2921 + if (toEnd && fp->fInputIdx != fActiveLimit) { 1.2922 + // The pattern matched, but not to the end of input. Try some more. 
1.2923 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.2924 + break; 1.2925 + } 1.2926 + isMatch = TRUE; 1.2927 + goto breakFromLoop; 1.2928 + 1.2929 + // Start and End Capture stack frame variables are laid out out like this: 1.2930 + // fp->fExtra[opValue] - The start of a completed capture group 1.2931 + // opValue+1 - The end of a completed capture group 1.2932 + // opValue+2 - the start of a capture group whose end 1.2933 + // has not yet been reached (and might not ever be). 1.2934 + case URX_START_CAPTURE: 1.2935 + U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); 1.2936 + fp->fExtra[opValue+2] = fp->fInputIdx; 1.2937 + break; 1.2938 + 1.2939 + 1.2940 + case URX_END_CAPTURE: 1.2941 + U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); 1.2942 + U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set. 1.2943 + fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real. 1.2944 + fp->fExtra[opValue+1] = fp->fInputIdx; // End position 1.2945 + U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]); 1.2946 + break; 1.2947 + 1.2948 + 1.2949 + case URX_DOLLAR: // $, test for End of line 1.2950 + // or for position before new line at end of input 1.2951 + { 1.2952 + if (fp->fInputIdx >= fAnchorLimit) { 1.2953 + // We really are at the end of input. Success. 1.2954 + fHitEnd = TRUE; 1.2955 + fRequireEnd = TRUE; 1.2956 + break; 1.2957 + } 1.2958 + 1.2959 + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 1.2960 + 1.2961 + // If we are positioned just before a new-line that is located at the 1.2962 + // end of input, succeed. 
1.2963 + UChar32 c = UTEXT_NEXT32(fInputText); 1.2964 + if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) { 1.2965 + if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) { 1.2966 + // If not in the middle of a CR/LF sequence 1.2967 + if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) { 1.2968 + // At new-line at end of input. Success 1.2969 + fHitEnd = TRUE; 1.2970 + fRequireEnd = TRUE; 1.2971 + 1.2972 + break; 1.2973 + } 1.2974 + } 1.2975 + } else { 1.2976 + UChar32 nextC = UTEXT_NEXT32(fInputText); 1.2977 + if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) { 1.2978 + fHitEnd = TRUE; 1.2979 + fRequireEnd = TRUE; 1.2980 + break; // At CR/LF at end of input. Success 1.2981 + } 1.2982 + } 1.2983 + 1.2984 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.2985 + } 1.2986 + break; 1.2987 + 1.2988 + 1.2989 + case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode. 1.2990 + if (fp->fInputIdx >= fAnchorLimit) { 1.2991 + // Off the end of input. Success. 1.2992 + fHitEnd = TRUE; 1.2993 + fRequireEnd = TRUE; 1.2994 + break; 1.2995 + } else { 1.2996 + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 1.2997 + UChar32 c = UTEXT_NEXT32(fInputText); 1.2998 + // Either at the last character of input, or off the end. 1.2999 + if (c == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) == fAnchorLimit) { 1.3000 + fHitEnd = TRUE; 1.3001 + fRequireEnd = TRUE; 1.3002 + break; 1.3003 + } 1.3004 + } 1.3005 + 1.3006 + // Not at end of input. Back-track out. 1.3007 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.3008 + break; 1.3009 + 1.3010 + 1.3011 + case URX_DOLLAR_M: // $, test for End of line in multi-line mode 1.3012 + { 1.3013 + if (fp->fInputIdx >= fAnchorLimit) { 1.3014 + // We really are at the end of input. Success. 
1.3015 + fHitEnd = TRUE; 1.3016 + fRequireEnd = TRUE; 1.3017 + break; 1.3018 + } 1.3019 + // If we are positioned just before a new-line, succeed. 1.3020 + // It makes no difference where the new-line is within the input. 1.3021 + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 1.3022 + UChar32 c = UTEXT_CURRENT32(fInputText); 1.3023 + if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) { 1.3024 + // At a line end, except for the odd chance of being in the middle of a CR/LF sequence 1.3025 + // In multi-line mode, hitting a new-line just before the end of input does not 1.3026 + // set the hitEnd or requireEnd flags 1.3027 + if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && UTEXT_PREVIOUS32(fInputText)==0x0d)) { 1.3028 + break; 1.3029 + } 1.3030 + } 1.3031 + // not at a new line. Fail. 1.3032 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.3033 + } 1.3034 + break; 1.3035 + 1.3036 + 1.3037 + case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode 1.3038 + { 1.3039 + if (fp->fInputIdx >= fAnchorLimit) { 1.3040 + // We really are at the end of input. Success. 1.3041 + fHitEnd = TRUE; 1.3042 + fRequireEnd = TRUE; // Java set requireEnd in this case, even though 1.3043 + break; // adding a new-line would not lose the match. 1.3044 + } 1.3045 + // If we are not positioned just before a new-line, the test fails; backtrack out. 1.3046 + // It makes no difference where the new-line is within the input. 
1.3047 + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 1.3048 + if (UTEXT_CURRENT32(fInputText) != 0x0a) { 1.3049 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.3050 + } 1.3051 + } 1.3052 + break; 1.3053 + 1.3054 + 1.3055 + case URX_CARET: // ^, test for start of line 1.3056 + if (fp->fInputIdx != fAnchorStart) { 1.3057 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.3058 + } 1.3059 + break; 1.3060 + 1.3061 + 1.3062 + case URX_CARET_M: // ^, test for start of line in mulit-line mode 1.3063 + { 1.3064 + if (fp->fInputIdx == fAnchorStart) { 1.3065 + // We are at the start input. Success. 1.3066 + break; 1.3067 + } 1.3068 + // Check whether character just before the current pos is a new-line 1.3069 + // unless we are at the end of input 1.3070 + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 1.3071 + UChar32 c = UTEXT_PREVIOUS32(fInputText); 1.3072 + if ((fp->fInputIdx < fAnchorLimit) && 1.3073 + ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) { 1.3074 + // It's a new-line. ^ is true. Success. 1.3075 + // TODO: what should be done with positions between a CR and LF? 1.3076 + break; 1.3077 + } 1.3078 + // Not at the start of a line. Fail. 1.3079 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.3080 + } 1.3081 + break; 1.3082 + 1.3083 + 1.3084 + case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode 1.3085 + { 1.3086 + U_ASSERT(fp->fInputIdx >= fAnchorStart); 1.3087 + if (fp->fInputIdx <= fAnchorStart) { 1.3088 + // We are at the start input. Success. 1.3089 + break; 1.3090 + } 1.3091 + // Check whether character just before the current pos is a new-line 1.3092 + U_ASSERT(fp->fInputIdx <= fAnchorLimit); 1.3093 + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 1.3094 + UChar32 c = UTEXT_PREVIOUS32(fInputText); 1.3095 + if (c != 0x0a) { 1.3096 + // Not at the start of a line. Back-track out. 
1.3097 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.3098 + } 1.3099 + } 1.3100 + break; 1.3101 + 1.3102 + case URX_BACKSLASH_B: // Test for word boundaries 1.3103 + { 1.3104 + UBool success = isWordBoundary(fp->fInputIdx); 1.3105 + success ^= (UBool)(opValue != 0); // flip sense for \B 1.3106 + if (!success) { 1.3107 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.3108 + } 1.3109 + } 1.3110 + break; 1.3111 + 1.3112 + 1.3113 + case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style 1.3114 + { 1.3115 + UBool success = isUWordBoundary(fp->fInputIdx); 1.3116 + success ^= (UBool)(opValue != 0); // flip sense for \B 1.3117 + if (!success) { 1.3118 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.3119 + } 1.3120 + } 1.3121 + break; 1.3122 + 1.3123 + 1.3124 + case URX_BACKSLASH_D: // Test for decimal digit 1.3125 + { 1.3126 + if (fp->fInputIdx >= fActiveLimit) { 1.3127 + fHitEnd = TRUE; 1.3128 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.3129 + break; 1.3130 + } 1.3131 + 1.3132 + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 1.3133 + 1.3134 + UChar32 c = UTEXT_NEXT32(fInputText); 1.3135 + int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster. 1.3136 + UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER); 1.3137 + success ^= (UBool)(opValue != 0); // flip sense for \D 1.3138 + if (success) { 1.3139 + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 1.3140 + } else { 1.3141 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.3142 + } 1.3143 + } 1.3144 + break; 1.3145 + 1.3146 + 1.3147 + case URX_BACKSLASH_G: // Test for position at end of previous match 1.3148 + if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) { 1.3149 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.3150 + } 1.3151 + break; 1.3152 + 1.3153 + 1.3154 + case URX_BACKSLASH_X: 1.3155 + // Match a Grapheme, as defined by Unicode TR 29. 
1.3156 + // Differs slightly from Perl, which consumes combining marks independently 1.3157 + // of context. 1.3158 + { 1.3159 + 1.3160 + // Fail if at end of input 1.3161 + if (fp->fInputIdx >= fActiveLimit) { 1.3162 + fHitEnd = TRUE; 1.3163 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.3164 + break; 1.3165 + } 1.3166 + 1.3167 + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 1.3168 + 1.3169 + // Examine (and consume) the current char. 1.3170 + // Dispatch into a little state machine, based on the char. 1.3171 + UChar32 c; 1.3172 + c = UTEXT_NEXT32(fInputText); 1.3173 + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 1.3174 + UnicodeSet **sets = fPattern->fStaticSets; 1.3175 + if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend; 1.3176 + if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control; 1.3177 + if (sets[URX_GC_L]->contains(c)) goto GC_L; 1.3178 + if (sets[URX_GC_LV]->contains(c)) goto GC_V; 1.3179 + if (sets[URX_GC_LVT]->contains(c)) goto GC_T; 1.3180 + if (sets[URX_GC_V]->contains(c)) goto GC_V; 1.3181 + if (sets[URX_GC_T]->contains(c)) goto GC_T; 1.3182 + goto GC_Extend; 1.3183 + 1.3184 + 1.3185 + 1.3186 +GC_L: 1.3187 + if (fp->fInputIdx >= fActiveLimit) goto GC_Done; 1.3188 + c = UTEXT_NEXT32(fInputText); 1.3189 + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 1.3190 + if (sets[URX_GC_L]->contains(c)) goto GC_L; 1.3191 + if (sets[URX_GC_LV]->contains(c)) goto GC_V; 1.3192 + if (sets[URX_GC_LVT]->contains(c)) goto GC_T; 1.3193 + if (sets[URX_GC_V]->contains(c)) goto GC_V; 1.3194 + (void)UTEXT_PREVIOUS32(fInputText); 1.3195 + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 1.3196 + goto GC_Extend; 1.3197 + 1.3198 +GC_V: 1.3199 + if (fp->fInputIdx >= fActiveLimit) goto GC_Done; 1.3200 + c = UTEXT_NEXT32(fInputText); 1.3201 + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 1.3202 + if (sets[URX_GC_V]->contains(c)) goto GC_V; 1.3203 + if (sets[URX_GC_T]->contains(c)) goto GC_T; 1.3204 + (void)UTEXT_PREVIOUS32(fInputText); 1.3205 + 
fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 1.3206 + goto GC_Extend; 1.3207 + 1.3208 +GC_T: 1.3209 + if (fp->fInputIdx >= fActiveLimit) goto GC_Done; 1.3210 + c = UTEXT_NEXT32(fInputText); 1.3211 + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 1.3212 + if (sets[URX_GC_T]->contains(c)) goto GC_T; 1.3213 + (void)UTEXT_PREVIOUS32(fInputText); 1.3214 + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 1.3215 + goto GC_Extend; 1.3216 + 1.3217 +GC_Extend: 1.3218 + // Combining characters are consumed here 1.3219 + for (;;) { 1.3220 + if (fp->fInputIdx >= fActiveLimit) { 1.3221 + break; 1.3222 + } 1.3223 + c = UTEXT_CURRENT32(fInputText); 1.3224 + if (sets[URX_GC_EXTEND]->contains(c) == FALSE) { 1.3225 + break; 1.3226 + } 1.3227 + (void)UTEXT_NEXT32(fInputText); 1.3228 + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 1.3229 + } 1.3230 + goto GC_Done; 1.3231 + 1.3232 +GC_Control: 1.3233 + // Most control chars stand alone (don't combine with combining chars), 1.3234 + // except for that CR/LF sequence is a single grapheme cluster. 1.3235 + if (c == 0x0d && fp->fInputIdx < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) { 1.3236 + c = UTEXT_NEXT32(fInputText); 1.3237 + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 1.3238 + } 1.3239 + 1.3240 +GC_Done: 1.3241 + if (fp->fInputIdx >= fActiveLimit) { 1.3242 + fHitEnd = TRUE; 1.3243 + } 1.3244 + break; 1.3245 + } 1.3246 + 1.3247 + 1.3248 + 1.3249 + 1.3250 + case URX_BACKSLASH_Z: // Test for end of Input 1.3251 + if (fp->fInputIdx < fAnchorLimit) { 1.3252 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.3253 + } else { 1.3254 + fHitEnd = TRUE; 1.3255 + fRequireEnd = TRUE; 1.3256 + } 1.3257 + break; 1.3258 + 1.3259 + 1.3260 + 1.3261 + case URX_STATIC_SETREF: 1.3262 + { 1.3263 + // Test input character against one of the predefined sets 1.3264 + // (Word Characters, for example) 1.3265 + // The high bit of the op value is a flag for the match polarity. 1.3266 + // 0: success if input char is in set. 
1.3267 + // 1: success if input char is not in set. 1.3268 + if (fp->fInputIdx >= fActiveLimit) { 1.3269 + fHitEnd = TRUE; 1.3270 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.3271 + break; 1.3272 + } 1.3273 + 1.3274 + UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET); 1.3275 + opValue &= ~URX_NEG_SET; 1.3276 + U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); 1.3277 + 1.3278 + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 1.3279 + UChar32 c = UTEXT_NEXT32(fInputText); 1.3280 + if (c < 256) { 1.3281 + Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; 1.3282 + if (s8->contains(c)) { 1.3283 + success = !success; 1.3284 + } 1.3285 + } else { 1.3286 + const UnicodeSet *s = fPattern->fStaticSets[opValue]; 1.3287 + if (s->contains(c)) { 1.3288 + success = !success; 1.3289 + } 1.3290 + } 1.3291 + if (success) { 1.3292 + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 1.3293 + } else { 1.3294 + // the character wasn't in the set. 1.3295 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.3296 + } 1.3297 + } 1.3298 + break; 1.3299 + 1.3300 + 1.3301 + case URX_STAT_SETREF_N: 1.3302 + { 1.3303 + // Test input character for NOT being a member of one of 1.3304 + // the predefined sets (Word Characters, for example) 1.3305 + if (fp->fInputIdx >= fActiveLimit) { 1.3306 + fHitEnd = TRUE; 1.3307 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.3308 + break; 1.3309 + } 1.3310 + 1.3311 + U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); 1.3312 + 1.3313 + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 1.3314 + 1.3315 + UChar32 c = UTEXT_NEXT32(fInputText); 1.3316 + if (c < 256) { 1.3317 + Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; 1.3318 + if (s8->contains(c) == FALSE) { 1.3319 + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 1.3320 + break; 1.3321 + } 1.3322 + } else { 1.3323 + const UnicodeSet *s = fPattern->fStaticSets[opValue]; 1.3324 + if (s->contains(c) == FALSE) { 1.3325 + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 1.3326 + 
break; 1.3327 + } 1.3328 + } 1.3329 + // the character wasn't in the set. 1.3330 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.3331 + } 1.3332 + break; 1.3333 + 1.3334 + 1.3335 + case URX_SETREF: 1.3336 + if (fp->fInputIdx >= fActiveLimit) { 1.3337 + fHitEnd = TRUE; 1.3338 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.3339 + break; 1.3340 + } else { 1.3341 + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 1.3342 + 1.3343 + // There is input left. Pick up one char and test it for set membership. 1.3344 + UChar32 c = UTEXT_NEXT32(fInputText); 1.3345 + U_ASSERT(opValue > 0 && opValue < sets->size()); 1.3346 + if (c<256) { 1.3347 + Regex8BitSet *s8 = &fPattern->fSets8[opValue]; 1.3348 + if (s8->contains(c)) { 1.3349 + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 1.3350 + break; 1.3351 + } 1.3352 + } else { 1.3353 + UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); 1.3354 + if (s->contains(c)) { 1.3355 + // The character is in the set. A Match. 1.3356 + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 1.3357 + break; 1.3358 + } 1.3359 + } 1.3360 + 1.3361 + // the character wasn't in the set. 1.3362 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.3363 + } 1.3364 + break; 1.3365 + 1.3366 + 1.3367 + case URX_DOTANY: 1.3368 + { 1.3369 + // . matches anything, but stops at end-of-line. 1.3370 + if (fp->fInputIdx >= fActiveLimit) { 1.3371 + // At end of input. Match failed. Backtrack out. 1.3372 + fHitEnd = TRUE; 1.3373 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.3374 + break; 1.3375 + } 1.3376 + 1.3377 + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 1.3378 + 1.3379 + // There is input left. Advance over one char, unless we've hit end-of-line 1.3380 + UChar32 c = UTEXT_NEXT32(fInputText); 1.3381 + if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible 1.3382 + ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) { 1.3383 + // End of line in normal mode. . does not match. 
1.3384 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.3385 + break; 1.3386 + } 1.3387 + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 1.3388 + } 1.3389 + break; 1.3390 + 1.3391 + 1.3392 + case URX_DOTANY_ALL: 1.3393 + { 1.3394 + // ., in dot-matches-all (including new lines) mode 1.3395 + if (fp->fInputIdx >= fActiveLimit) { 1.3396 + // At end of input. Match failed. Backtrack out. 1.3397 + fHitEnd = TRUE; 1.3398 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.3399 + break; 1.3400 + } 1.3401 + 1.3402 + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 1.3403 + 1.3404 + // There is input left. Advance over one char, except if we are 1.3405 + // at a cr/lf, advance over both of them. 1.3406 + UChar32 c; 1.3407 + c = UTEXT_NEXT32(fInputText); 1.3408 + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 1.3409 + if (c==0x0d && fp->fInputIdx < fActiveLimit) { 1.3410 + // In the case of a CR/LF, we need to advance over both. 1.3411 + UChar32 nextc = UTEXT_CURRENT32(fInputText); 1.3412 + if (nextc == 0x0a) { 1.3413 + (void)UTEXT_NEXT32(fInputText); 1.3414 + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 1.3415 + } 1.3416 + } 1.3417 + } 1.3418 + break; 1.3419 + 1.3420 + 1.3421 + case URX_DOTANY_UNIX: 1.3422 + { 1.3423 + // '.' operator, matches all, but stops at end-of-line. 1.3424 + // UNIX_LINES mode, so 0x0a is the only recognized line ending. 1.3425 + if (fp->fInputIdx >= fActiveLimit) { 1.3426 + // At end of input. Match failed. Backtrack out. 1.3427 + fHitEnd = TRUE; 1.3428 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.3429 + break; 1.3430 + } 1.3431 + 1.3432 + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 1.3433 + 1.3434 + // There is input left. Advance over one char, unless we've hit end-of-line 1.3435 + UChar32 c = UTEXT_NEXT32(fInputText); 1.3436 + if (c == 0x0a) { 1.3437 + // End of line in normal mode. '.' 
does not match the \n 1.3438 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.3439 + } else { 1.3440 + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 1.3441 + } 1.3442 + } 1.3443 + break; 1.3444 + 1.3445 + 1.3446 + case URX_JMP: 1.3447 + fp->fPatIdx = opValue; 1.3448 + break; 1.3449 + 1.3450 + case URX_FAIL: 1.3451 + isMatch = FALSE; 1.3452 + goto breakFromLoop; 1.3453 + 1.3454 + case URX_JMP_SAV: 1.3455 + U_ASSERT(opValue < fPattern->fCompiledPat->size()); 1.3456 + fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current 1.3457 + fp->fPatIdx = opValue; // Then JMP. 1.3458 + break; 1.3459 + 1.3460 + case URX_JMP_SAV_X: 1.3461 + // This opcode is used with (x)+, when x can match a zero length string. 1.3462 + // Same as JMP_SAV, except conditional on the match having made forward progress. 1.3463 + // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the 1.3464 + // data address of the input position at the start of the loop. 1.3465 + { 1.3466 + U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size()); 1.3467 + int32_t stoOp = (int32_t)pat[opValue-1]; 1.3468 + U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC); 1.3469 + int32_t frameLoc = URX_VAL(stoOp); 1.3470 + U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize); 1.3471 + int64_t prevInputIdx = fp->fExtra[frameLoc]; 1.3472 + U_ASSERT(prevInputIdx <= fp->fInputIdx); 1.3473 + if (prevInputIdx < fp->fInputIdx) { 1.3474 + // The match did make progress. Repeat the loop. 1.3475 + fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current 1.3476 + fp->fPatIdx = opValue; 1.3477 + fp->fExtra[frameLoc] = fp->fInputIdx; 1.3478 + } 1.3479 + // If the input position did not advance, we do nothing here, 1.3480 + // execution will fall out of the loop. 
1.3481 + } 1.3482 + break; 1.3483 + 1.3484 + case URX_CTR_INIT: 1.3485 + { 1.3486 + U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); 1.3487 + fp->fExtra[opValue] = 0; // Set the loop counter variable to zero 1.3488 + 1.3489 + // Pick up the three extra operands that CTR_INIT has, and 1.3490 + // skip the pattern location counter past 1.3491 + int32_t instrOperandLoc = (int32_t)fp->fPatIdx; 1.3492 + fp->fPatIdx += 3; 1.3493 + int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); 1.3494 + int32_t minCount = (int32_t)pat[instrOperandLoc+1]; 1.3495 + int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; 1.3496 + U_ASSERT(minCount>=0); 1.3497 + U_ASSERT(maxCount>=minCount || maxCount==-1); 1.3498 + U_ASSERT(loopLoc>=fp->fPatIdx); 1.3499 + 1.3500 + if (minCount == 0) { 1.3501 + fp = StateSave(fp, loopLoc+1, status); 1.3502 + } 1.3503 + if (maxCount == -1) { 1.3504 + fp->fExtra[opValue+1] = fp->fInputIdx; // For loop breaking. 1.3505 + } else if (maxCount == 0) { 1.3506 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.3507 + } 1.3508 + } 1.3509 + break; 1.3510 + 1.3511 + case URX_CTR_LOOP: 1.3512 + { 1.3513 + U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); 1.3514 + int32_t initOp = (int32_t)pat[opValue]; 1.3515 + U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT); 1.3516 + int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; 1.3517 + int32_t minCount = (int32_t)pat[opValue+2]; 1.3518 + int32_t maxCount = (int32_t)pat[opValue+3]; 1.3519 + (*pCounter)++; 1.3520 + if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) { 1.3521 + U_ASSERT(*pCounter == maxCount); 1.3522 + break; 1.3523 + } 1.3524 + if (*pCounter >= minCount) { 1.3525 + if (maxCount == -1) { 1.3526 + // Loop has no hard upper bound. 1.3527 + // Check that it is progressing through the input, break if it is not. 
1.3528 + int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1]; 1.3529 + if (fp->fInputIdx == *pLastInputIdx) { 1.3530 + break; 1.3531 + } else { 1.3532 + *pLastInputIdx = fp->fInputIdx; 1.3533 + } 1.3534 + } 1.3535 + fp = StateSave(fp, fp->fPatIdx, status); 1.3536 + } 1.3537 + fp->fPatIdx = opValue + 4; // Loop back. 1.3538 + } 1.3539 + break; 1.3540 + 1.3541 + case URX_CTR_INIT_NG: 1.3542 + { 1.3543 + // Initialize a non-greedy loop 1.3544 + U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); 1.3545 + fp->fExtra[opValue] = 0; // Set the loop counter variable to zero 1.3546 + 1.3547 + // Pick up the three extra operands that CTR_INIT_NG has, and 1.3548 + // skip the pattern location counter past 1.3549 + int32_t instrOperandLoc = (int32_t)fp->fPatIdx; 1.3550 + fp->fPatIdx += 3; 1.3551 + int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); 1.3552 + int32_t minCount = (int32_t)pat[instrOperandLoc+1]; 1.3553 + int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; 1.3554 + U_ASSERT(minCount>=0); 1.3555 + U_ASSERT(maxCount>=minCount || maxCount==-1); 1.3556 + U_ASSERT(loopLoc>fp->fPatIdx); 1.3557 + if (maxCount == -1) { 1.3558 + fp->fExtra[opValue+1] = fp->fInputIdx; // Save initial input index for loop breaking. 
1.3559 + } 1.3560 + 1.3561 + if (minCount == 0) { 1.3562 + if (maxCount != 0) { 1.3563 + fp = StateSave(fp, fp->fPatIdx, status); 1.3564 + } 1.3565 + fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block 1.3566 + } 1.3567 + } 1.3568 + break; 1.3569 + 1.3570 + case URX_CTR_LOOP_NG: 1.3571 + { 1.3572 + // Non-greedy {min, max} loops 1.3573 + U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); 1.3574 + int32_t initOp = (int32_t)pat[opValue]; 1.3575 + U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG); 1.3576 + int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; 1.3577 + int32_t minCount = (int32_t)pat[opValue+2]; 1.3578 + int32_t maxCount = (int32_t)pat[opValue+3]; 1.3579 + 1.3580 + (*pCounter)++; 1.3581 + if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) { 1.3582 + // The loop has matched the maximum permitted number of times. 1.3583 + // Break out of here with no action. Matching will 1.3584 + // continue with the following pattern. 1.3585 + U_ASSERT(*pCounter == maxCount); 1.3586 + break; 1.3587 + } 1.3588 + 1.3589 + if (*pCounter < minCount) { 1.3590 + // We haven't met the minimum number of matches yet. 1.3591 + // Loop back for another one. 1.3592 + fp->fPatIdx = opValue + 4; // Loop back. 1.3593 + } else { 1.3594 + // We do have the minimum number of matches. 1.3595 + 1.3596 + // If there is no upper bound on the loop iterations, check that the input index 1.3597 + // is progressing, and stop the loop if it is not. 1.3598 + if (maxCount == -1) { 1.3599 + int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1]; 1.3600 + if (fp->fInputIdx == *pLastInputIdx) { 1.3601 + break; 1.3602 + } 1.3603 + *pLastInputIdx = fp->fInputIdx; 1.3604 + } 1.3605 + 1.3606 + // Loop Continuation: we will fall into the pattern following the loop 1.3607 + // (non-greedy, don't execute loop body first), but first do 1.3608 + // a state save to the top of the loop, so that a match failure 1.3609 + // in the following pattern will try another iteration of the loop. 
1.3610 + fp = StateSave(fp, opValue + 4, status); 1.3611 + } 1.3612 + } 1.3613 + break; 1.3614 + 1.3615 + case URX_STO_SP: 1.3616 + U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); 1.3617 + fData[opValue] = fStack->size(); 1.3618 + break; 1.3619 + 1.3620 + case URX_LD_SP: 1.3621 + { 1.3622 + U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); 1.3623 + int32_t newStackSize = (int32_t)fData[opValue]; 1.3624 + U_ASSERT(newStackSize <= fStack->size()); 1.3625 + int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize; 1.3626 + if (newFP == (int64_t *)fp) { 1.3627 + break; 1.3628 + } 1.3629 + int32_t i; 1.3630 + for (i=0; i<fFrameSize; i++) { 1.3631 + newFP[i] = ((int64_t *)fp)[i]; 1.3632 + } 1.3633 + fp = (REStackFrame *)newFP; 1.3634 + fStack->setSize(newStackSize); 1.3635 + } 1.3636 + break; 1.3637 + 1.3638 + case URX_BACKREF: 1.3639 + { 1.3640 + U_ASSERT(opValue < fFrameSize); 1.3641 + int64_t groupStartIdx = fp->fExtra[opValue]; 1.3642 + int64_t groupEndIdx = fp->fExtra[opValue+1]; 1.3643 + U_ASSERT(groupStartIdx <= groupEndIdx); 1.3644 + if (groupStartIdx < 0) { 1.3645 + // This capture group has not participated in the match thus far, 1.3646 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match. 1.3647 + break; 1.3648 + } 1.3649 + UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx); 1.3650 + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 1.3651 + 1.3652 + // Note: if the capture group match was of an empty string the backref 1.3653 + // match succeeds. Verified by testing: Perl matches succeed 1.3654 + // in this case, so we do too. 
1.3655 + 1.3656 + UBool success = TRUE; 1.3657 + for (;;) { 1.3658 + if (utext_getNativeIndex(fAltInputText) >= groupEndIdx) { 1.3659 + success = TRUE; 1.3660 + break; 1.3661 + } 1.3662 + if (utext_getNativeIndex(fInputText) >= fActiveLimit) { 1.3663 + success = FALSE; 1.3664 + fHitEnd = TRUE; 1.3665 + break; 1.3666 + } 1.3667 + UChar32 captureGroupChar = utext_next32(fAltInputText); 1.3668 + UChar32 inputChar = utext_next32(fInputText); 1.3669 + if (inputChar != captureGroupChar) { 1.3670 + success = FALSE; 1.3671 + break; 1.3672 + } 1.3673 + } 1.3674 + 1.3675 + if (success) { 1.3676 + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 1.3677 + } else { 1.3678 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.3679 + } 1.3680 + } 1.3681 + break; 1.3682 + 1.3683 + 1.3684 + 1.3685 + case URX_BACKREF_I: 1.3686 + { 1.3687 + U_ASSERT(opValue < fFrameSize); 1.3688 + int64_t groupStartIdx = fp->fExtra[opValue]; 1.3689 + int64_t groupEndIdx = fp->fExtra[opValue+1]; 1.3690 + U_ASSERT(groupStartIdx <= groupEndIdx); 1.3691 + if (groupStartIdx < 0) { 1.3692 + // This capture group has not participated in the match thus far, 1.3693 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match. 1.3694 + break; 1.3695 + } 1.3696 + utext_setNativeIndex(fAltInputText, groupStartIdx); 1.3697 + utext_setNativeIndex(fInputText, fp->fInputIdx); 1.3698 + CaseFoldingUTextIterator captureGroupItr(*fAltInputText); 1.3699 + CaseFoldingUTextIterator inputItr(*fInputText); 1.3700 + 1.3701 + // Note: if the capture group match was of an empty string the backref 1.3702 + // match succeeds. Verified by testing: Perl matches succeed 1.3703 + // in this case, so we do too. 
1.3704 + 1.3705 + UBool success = TRUE; 1.3706 + for (;;) { 1.3707 + if (!captureGroupItr.inExpansion() && utext_getNativeIndex(fAltInputText) >= groupEndIdx) { 1.3708 + success = TRUE; 1.3709 + break; 1.3710 + } 1.3711 + if (!inputItr.inExpansion() && utext_getNativeIndex(fInputText) >= fActiveLimit) { 1.3712 + success = FALSE; 1.3713 + fHitEnd = TRUE; 1.3714 + break; 1.3715 + } 1.3716 + UChar32 captureGroupChar = captureGroupItr.next(); 1.3717 + UChar32 inputChar = inputItr.next(); 1.3718 + if (inputChar != captureGroupChar) { 1.3719 + success = FALSE; 1.3720 + break; 1.3721 + } 1.3722 + } 1.3723 + 1.3724 + if (success && inputItr.inExpansion()) { 1.3725 + // We otained a match by consuming part of a string obtained from 1.3726 + // case-folding a single code point of the input text. 1.3727 + // This does not count as an overall match. 1.3728 + success = FALSE; 1.3729 + } 1.3730 + 1.3731 + if (success) { 1.3732 + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 1.3733 + } else { 1.3734 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.3735 + } 1.3736 + 1.3737 + } 1.3738 + break; 1.3739 + 1.3740 + case URX_STO_INP_LOC: 1.3741 + { 1.3742 + U_ASSERT(opValue >= 0 && opValue < fFrameSize); 1.3743 + fp->fExtra[opValue] = fp->fInputIdx; 1.3744 + } 1.3745 + break; 1.3746 + 1.3747 + case URX_JMPX: 1.3748 + { 1.3749 + int32_t instrOperandLoc = (int32_t)fp->fPatIdx; 1.3750 + fp->fPatIdx += 1; 1.3751 + int32_t dataLoc = URX_VAL(pat[instrOperandLoc]); 1.3752 + U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize); 1.3753 + int64_t savedInputIdx = fp->fExtra[dataLoc]; 1.3754 + U_ASSERT(savedInputIdx <= fp->fInputIdx); 1.3755 + if (savedInputIdx < fp->fInputIdx) { 1.3756 + fp->fPatIdx = opValue; // JMP 1.3757 + } else { 1.3758 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop. 1.3759 + } 1.3760 + } 1.3761 + break; 1.3762 + 1.3763 + case URX_LA_START: 1.3764 + { 1.3765 + // Entering a lookahead block. 1.3766 + // Save Stack Ptr, Input Pos. 
1.3767 + U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 1.3768 + fData[opValue] = fStack->size(); 1.3769 + fData[opValue+1] = fp->fInputIdx; 1.3770 + fActiveStart = fLookStart; // Set the match region change for 1.3771 + fActiveLimit = fLookLimit; // transparent bounds. 1.3772 + } 1.3773 + break; 1.3774 + 1.3775 + case URX_LA_END: 1.3776 + { 1.3777 + // Leaving a look-ahead block. 1.3778 + // restore Stack Ptr, Input Pos to positions they had on entry to block. 1.3779 + U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 1.3780 + int32_t stackSize = fStack->size(); 1.3781 + int32_t newStackSize =(int32_t)fData[opValue]; 1.3782 + U_ASSERT(stackSize >= newStackSize); 1.3783 + if (stackSize > newStackSize) { 1.3784 + // Copy the current top frame back to the new (cut back) top frame. 1.3785 + // This makes the capture groups from within the look-ahead 1.3786 + // expression available. 1.3787 + int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize; 1.3788 + int32_t i; 1.3789 + for (i=0; i<fFrameSize; i++) { 1.3790 + newFP[i] = ((int64_t *)fp)[i]; 1.3791 + } 1.3792 + fp = (REStackFrame *)newFP; 1.3793 + fStack->setSize(newStackSize); 1.3794 + } 1.3795 + fp->fInputIdx = fData[opValue+1]; 1.3796 + 1.3797 + // Restore the active region bounds in the input string; they may have 1.3798 + // been changed because of transparent bounds on a Region. 1.3799 + fActiveStart = fRegionStart; 1.3800 + fActiveLimit = fRegionLimit; 1.3801 + } 1.3802 + break; 1.3803 + 1.3804 + case URX_ONECHAR_I: 1.3805 + // Case insensitive one char. The char from the pattern is already case folded. 1.3806 + // Input text is not, but case folding the input can not reduce two or more code 1.3807 + // points to one. 
1.3808 + if (fp->fInputIdx < fActiveLimit) { 1.3809 + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 1.3810 + 1.3811 + UChar32 c = UTEXT_NEXT32(fInputText); 1.3812 + if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) { 1.3813 + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 1.3814 + break; 1.3815 + } 1.3816 + } else { 1.3817 + fHitEnd = TRUE; 1.3818 + } 1.3819 + 1.3820 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.3821 + break; 1.3822 + 1.3823 + case URX_STRING_I: 1.3824 + { 1.3825 + // Case-insensitive test input against a literal string. 1.3826 + // Strings require two slots in the compiled pattern, one for the 1.3827 + // offset to the string text, and one for the length. 1.3828 + // The compiled string has already been case folded. 1.3829 + { 1.3830 + const UChar *patternString = litText + opValue; 1.3831 + int32_t patternStringIdx = 0; 1.3832 + 1.3833 + op = (int32_t)pat[fp->fPatIdx]; 1.3834 + fp->fPatIdx++; 1.3835 + opType = URX_TYPE(op); 1.3836 + opValue = URX_VAL(op); 1.3837 + U_ASSERT(opType == URX_STRING_LEN); 1.3838 + int32_t patternStringLen = opValue; // Length of the string from the pattern. 
1.3839 + 1.3840 + 1.3841 + UChar32 cPattern; 1.3842 + UChar32 cText; 1.3843 + UBool success = TRUE; 1.3844 + 1.3845 + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 1.3846 + CaseFoldingUTextIterator inputIterator(*fInputText); 1.3847 + while (patternStringIdx < patternStringLen) { 1.3848 + if (!inputIterator.inExpansion() && UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) { 1.3849 + success = FALSE; 1.3850 + fHitEnd = TRUE; 1.3851 + break; 1.3852 + } 1.3853 + U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern); 1.3854 + cText = inputIterator.next(); 1.3855 + if (cText != cPattern) { 1.3856 + success = FALSE; 1.3857 + break; 1.3858 + } 1.3859 + } 1.3860 + if (inputIterator.inExpansion()) { 1.3861 + success = FALSE; 1.3862 + } 1.3863 + 1.3864 + if (success) { 1.3865 + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 1.3866 + } else { 1.3867 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.3868 + } 1.3869 + } 1.3870 + } 1.3871 + break; 1.3872 + 1.3873 + case URX_LB_START: 1.3874 + { 1.3875 + // Entering a look-behind block. 1.3876 + // Save Stack Ptr, Input Pos. 1.3877 + // TODO: implement transparent bounds. Ticket #6067 1.3878 + U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 1.3879 + fData[opValue] = fStack->size(); 1.3880 + fData[opValue+1] = fp->fInputIdx; 1.3881 + // Init the variable containing the start index for attempted matches. 1.3882 + fData[opValue+2] = -1; 1.3883 + // Save input string length, then reset to pin any matches to end at 1.3884 + // the current position. 1.3885 + fData[opValue+3] = fActiveLimit; 1.3886 + fActiveLimit = fp->fInputIdx; 1.3887 + } 1.3888 + break; 1.3889 + 1.3890 + 1.3891 + case URX_LB_CONT: 1.3892 + { 1.3893 + // Positive Look-Behind, at top of loop checking for matches of LB expression 1.3894 + // at all possible input starting positions. 1.3895 + 1.3896 + // Fetch the min and max possible match lengths. They are the operands 1.3897 + // of this op in the pattern. 
1.3898 + int32_t minML = (int32_t)pat[fp->fPatIdx++]; 1.3899 + int32_t maxML = (int32_t)pat[fp->fPatIdx++]; 1.3900 + U_ASSERT(minML <= maxML); 1.3901 + U_ASSERT(minML >= 0); 1.3902 + 1.3903 + // Fetch (from data) the last input index where a match was attempted. 1.3904 + U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 1.3905 + int64_t *lbStartIdx = &fData[opValue+2]; 1.3906 + if (*lbStartIdx < 0) { 1.3907 + // First time through loop. 1.3908 + *lbStartIdx = fp->fInputIdx - minML; 1.3909 + } else { 1.3910 + // 2nd through nth time through the loop. 1.3911 + // Back up start position for match by one. 1.3912 + if (*lbStartIdx == 0) { 1.3913 + (*lbStartIdx)--; 1.3914 + } else { 1.3915 + UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx); 1.3916 + (void)UTEXT_PREVIOUS32(fInputText); 1.3917 + *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText); 1.3918 + } 1.3919 + } 1.3920 + 1.3921 + if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) { 1.3922 + // We have tried all potential match starting points without 1.3923 + // getting a match. Backtrack out, and out of the 1.3924 + // Look Behind altogether. 1.3925 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.3926 + int64_t restoreInputLen = fData[opValue+3]; 1.3927 + U_ASSERT(restoreInputLen >= fActiveLimit); 1.3928 + U_ASSERT(restoreInputLen <= fInputLength); 1.3929 + fActiveLimit = restoreInputLen; 1.3930 + break; 1.3931 + } 1.3932 + 1.3933 + // Save state to this URX_LB_CONT op, so failure to match will repeat the loop. 1.3934 + // (successful match will fall off the end of the loop.) 1.3935 + fp = StateSave(fp, fp->fPatIdx-3, status); 1.3936 + fp->fInputIdx = *lbStartIdx; 1.3937 + } 1.3938 + break; 1.3939 + 1.3940 + case URX_LB_END: 1.3941 + // End of a look-behind block, after a successful match. 
1.3942 + { 1.3943 + U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 1.3944 + if (fp->fInputIdx != fActiveLimit) { 1.3945 + // The look-behind expression matched, but the match did not 1.3946 + // extend all the way to the point that we are looking behind from. 1.3947 + // FAIL out of here, which will take us back to the LB_CONT, which 1.3948 + // will retry the match starting at another position or fail 1.3949 + // the look-behind altogether, whichever is appropriate. 1.3950 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.3951 + break; 1.3952 + } 1.3953 + 1.3954 + // Look-behind match is good. Restore the orignal input string length, 1.3955 + // which had been truncated to pin the end of the lookbehind match to the 1.3956 + // position being looked-behind. 1.3957 + int64_t originalInputLen = fData[opValue+3]; 1.3958 + U_ASSERT(originalInputLen >= fActiveLimit); 1.3959 + U_ASSERT(originalInputLen <= fInputLength); 1.3960 + fActiveLimit = originalInputLen; 1.3961 + } 1.3962 + break; 1.3963 + 1.3964 + 1.3965 + case URX_LBN_CONT: 1.3966 + { 1.3967 + // Negative Look-Behind, at top of loop checking for matches of LB expression 1.3968 + // at all possible input starting positions. 1.3969 + 1.3970 + // Fetch the extra parameters of this op. 1.3971 + int32_t minML = (int32_t)pat[fp->fPatIdx++]; 1.3972 + int32_t maxML = (int32_t)pat[fp->fPatIdx++]; 1.3973 + int32_t continueLoc = (int32_t)pat[fp->fPatIdx++]; 1.3974 + continueLoc = URX_VAL(continueLoc); 1.3975 + U_ASSERT(minML <= maxML); 1.3976 + U_ASSERT(minML >= 0); 1.3977 + U_ASSERT(continueLoc > fp->fPatIdx); 1.3978 + 1.3979 + // Fetch (from data) the last input index where a match was attempted. 1.3980 + U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 1.3981 + int64_t *lbStartIdx = &fData[opValue+2]; 1.3982 + if (*lbStartIdx < 0) { 1.3983 + // First time through loop. 1.3984 + *lbStartIdx = fp->fInputIdx - minML; 1.3985 + } else { 1.3986 + // 2nd through nth time through the loop. 
1.3987 + // Back up start position for match by one. 1.3988 + if (*lbStartIdx == 0) { 1.3989 + (*lbStartIdx)--; 1.3990 + } else { 1.3991 + UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx); 1.3992 + (void)UTEXT_PREVIOUS32(fInputText); 1.3993 + *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText); 1.3994 + } 1.3995 + } 1.3996 + 1.3997 + if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) { 1.3998 + // We have tried all potential match starting points without 1.3999 + // getting a match, which means that the negative lookbehind as 1.4000 + // a whole has succeeded. Jump forward to the continue location 1.4001 + int64_t restoreInputLen = fData[opValue+3]; 1.4002 + U_ASSERT(restoreInputLen >= fActiveLimit); 1.4003 + U_ASSERT(restoreInputLen <= fInputLength); 1.4004 + fActiveLimit = restoreInputLen; 1.4005 + fp->fPatIdx = continueLoc; 1.4006 + break; 1.4007 + } 1.4008 + 1.4009 + // Save state to this URX_LB_CONT op, so failure to match will repeat the loop. 1.4010 + // (successful match will cause a FAIL out of the loop altogether.) 1.4011 + fp = StateSave(fp, fp->fPatIdx-4, status); 1.4012 + fp->fInputIdx = *lbStartIdx; 1.4013 + } 1.4014 + break; 1.4015 + 1.4016 + case URX_LBN_END: 1.4017 + // End of a negative look-behind block, after a successful match. 1.4018 + { 1.4019 + U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 1.4020 + if (fp->fInputIdx != fActiveLimit) { 1.4021 + // The look-behind expression matched, but the match did not 1.4022 + // extend all the way to the point that we are looking behind from. 1.4023 + // FAIL out of here, which will take us back to the LB_CONT, which 1.4024 + // will retry the match starting at another position or succeed 1.4025 + // the look-behind altogether, whichever is appropriate. 
1.4026 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.4027 + break; 1.4028 + } 1.4029 + 1.4030 + // Look-behind expression matched, which means look-behind test as 1.4031 + // a whole Fails 1.4032 + 1.4033 + // Restore the orignal input string length, which had been truncated 1.4034 + // inorder to pin the end of the lookbehind match 1.4035 + // to the position being looked-behind. 1.4036 + int64_t originalInputLen = fData[opValue+3]; 1.4037 + U_ASSERT(originalInputLen >= fActiveLimit); 1.4038 + U_ASSERT(originalInputLen <= fInputLength); 1.4039 + fActiveLimit = originalInputLen; 1.4040 + 1.4041 + // Restore original stack position, discarding any state saved 1.4042 + // by the successful pattern match. 1.4043 + U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 1.4044 + int32_t newStackSize = (int32_t)fData[opValue]; 1.4045 + U_ASSERT(fStack->size() > newStackSize); 1.4046 + fStack->setSize(newStackSize); 1.4047 + 1.4048 + // FAIL, which will take control back to someplace 1.4049 + // prior to entering the look-behind test. 1.4050 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.4051 + } 1.4052 + break; 1.4053 + 1.4054 + 1.4055 + case URX_LOOP_SR_I: 1.4056 + // Loop Initialization for the optimized implementation of 1.4057 + // [some character set]* 1.4058 + // This op scans through all matching input. 1.4059 + // The following LOOP_C op emulates stack unwinding if the following pattern fails. 1.4060 + { 1.4061 + U_ASSERT(opValue > 0 && opValue < sets->size()); 1.4062 + Regex8BitSet *s8 = &fPattern->fSets8[opValue]; 1.4063 + UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); 1.4064 + 1.4065 + // Loop through input, until either the input is exhausted or 1.4066 + // we reach a character that is not a member of the set. 
1.4067 + int64_t ix = fp->fInputIdx; 1.4068 + UTEXT_SETNATIVEINDEX(fInputText, ix); 1.4069 + for (;;) { 1.4070 + if (ix >= fActiveLimit) { 1.4071 + fHitEnd = TRUE; 1.4072 + break; 1.4073 + } 1.4074 + UChar32 c = UTEXT_NEXT32(fInputText); 1.4075 + if (c<256) { 1.4076 + if (s8->contains(c) == FALSE) { 1.4077 + break; 1.4078 + } 1.4079 + } else { 1.4080 + if (s->contains(c) == FALSE) { 1.4081 + break; 1.4082 + } 1.4083 + } 1.4084 + ix = UTEXT_GETNATIVEINDEX(fInputText); 1.4085 + } 1.4086 + 1.4087 + // If there were no matching characters, skip over the loop altogether. 1.4088 + // The loop doesn't run at all, a * op always succeeds. 1.4089 + if (ix == fp->fInputIdx) { 1.4090 + fp->fPatIdx++; // skip the URX_LOOP_C op. 1.4091 + break; 1.4092 + } 1.4093 + 1.4094 + // Peek ahead in the compiled pattern, to the URX_LOOP_C that 1.4095 + // must follow. It's operand is the stack location 1.4096 + // that holds the starting input index for the match of this [set]* 1.4097 + int32_t loopcOp = (int32_t)pat[fp->fPatIdx]; 1.4098 + U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); 1.4099 + int32_t stackLoc = URX_VAL(loopcOp); 1.4100 + U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); 1.4101 + fp->fExtra[stackLoc] = fp->fInputIdx; 1.4102 + fp->fInputIdx = ix; 1.4103 + 1.4104 + // Save State to the URX_LOOP_C op that follows this one, 1.4105 + // so that match failures in the following code will return to there. 1.4106 + // Then bump the pattern idx so the LOOP_C is skipped on the way out of here. 1.4107 + fp = StateSave(fp, fp->fPatIdx, status); 1.4108 + fp->fPatIdx++; 1.4109 + } 1.4110 + break; 1.4111 + 1.4112 + 1.4113 + case URX_LOOP_DOT_I: 1.4114 + // Loop Initialization for the optimized implementation of .* 1.4115 + // This op scans through all remaining input. 1.4116 + // The following LOOP_C op emulates stack unwinding if the following pattern fails. 
1.4117 + { 1.4118 + // Loop through input until the input is exhausted (we reach an end-of-line) 1.4119 + // In DOTALL mode, we can just go straight to the end of the input. 1.4120 + int64_t ix; 1.4121 + if ((opValue & 1) == 1) { 1.4122 + // Dot-matches-All mode. Jump straight to the end of the string. 1.4123 + ix = fActiveLimit; 1.4124 + fHitEnd = TRUE; 1.4125 + } else { 1.4126 + // NOT DOT ALL mode. Line endings do not match '.' 1.4127 + // Scan forward until a line ending or end of input. 1.4128 + ix = fp->fInputIdx; 1.4129 + UTEXT_SETNATIVEINDEX(fInputText, ix); 1.4130 + for (;;) { 1.4131 + if (ix >= fActiveLimit) { 1.4132 + fHitEnd = TRUE; 1.4133 + break; 1.4134 + } 1.4135 + UChar32 c = UTEXT_NEXT32(fInputText); 1.4136 + if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s 1.4137 + if ((c == 0x0a) || // 0x0a is newline in both modes. 1.4138 + (((opValue & 2) == 0) && // IF not UNIX_LINES mode 1.4139 + (c<=0x0d && c>=0x0a)) || c==0x85 ||c==0x2028 || c==0x2029) { 1.4140 + // char is a line ending. Exit the scanning loop. 1.4141 + break; 1.4142 + } 1.4143 + } 1.4144 + ix = UTEXT_GETNATIVEINDEX(fInputText); 1.4145 + } 1.4146 + } 1.4147 + 1.4148 + // If there were no matching characters, skip over the loop altogether. 1.4149 + // The loop doesn't run at all, a * op always succeeds. 1.4150 + if (ix == fp->fInputIdx) { 1.4151 + fp->fPatIdx++; // skip the URX_LOOP_C op. 1.4152 + break; 1.4153 + } 1.4154 + 1.4155 + // Peek ahead in the compiled pattern, to the URX_LOOP_C that 1.4156 + // must follow. 
It's operand is the stack location 1.4157 + // that holds the starting input index for the match of this .* 1.4158 + int32_t loopcOp = (int32_t)pat[fp->fPatIdx]; 1.4159 + U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); 1.4160 + int32_t stackLoc = URX_VAL(loopcOp); 1.4161 + U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); 1.4162 + fp->fExtra[stackLoc] = fp->fInputIdx; 1.4163 + fp->fInputIdx = ix; 1.4164 + 1.4165 + // Save State to the URX_LOOP_C op that follows this one, 1.4166 + // so that match failures in the following code will return to there. 1.4167 + // Then bump the pattern idx so the LOOP_C is skipped on the way out of here. 1.4168 + fp = StateSave(fp, fp->fPatIdx, status); 1.4169 + fp->fPatIdx++; 1.4170 + } 1.4171 + break; 1.4172 + 1.4173 + 1.4174 + case URX_LOOP_C: 1.4175 + { 1.4176 + U_ASSERT(opValue>=0 && opValue<fFrameSize); 1.4177 + backSearchIndex = fp->fExtra[opValue]; 1.4178 + U_ASSERT(backSearchIndex <= fp->fInputIdx); 1.4179 + if (backSearchIndex == fp->fInputIdx) { 1.4180 + // We've backed up the input idx to the point that the loop started. 1.4181 + // The loop is done. Leave here without saving state. 1.4182 + // Subsequent failures won't come back here. 1.4183 + break; 1.4184 + } 1.4185 + // Set up for the next iteration of the loop, with input index 1.4186 + // backed up by one from the last time through, 1.4187 + // and a state save to this instruction in case the following code fails again. 1.4188 + // (We're going backwards because this loop emulates stack unwinding, not 1.4189 + // the initial scan forward.) 
1.4190 + U_ASSERT(fp->fInputIdx > 0); 1.4191 + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 1.4192 + UChar32 prevC = UTEXT_PREVIOUS32(fInputText); 1.4193 + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 1.4194 + 1.4195 + UChar32 twoPrevC = UTEXT_PREVIOUS32(fInputText); 1.4196 + if (prevC == 0x0a && 1.4197 + fp->fInputIdx > backSearchIndex && 1.4198 + twoPrevC == 0x0d) { 1.4199 + int32_t prevOp = (int32_t)pat[fp->fPatIdx-2]; 1.4200 + if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) { 1.4201 + // .*, stepping back over CRLF pair. 1.4202 + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 1.4203 + } 1.4204 + } 1.4205 + 1.4206 + 1.4207 + fp = StateSave(fp, fp->fPatIdx-1, status); 1.4208 + } 1.4209 + break; 1.4210 + 1.4211 + 1.4212 + 1.4213 + default: 1.4214 + // Trouble. The compiled pattern contains an entry with an 1.4215 + // unrecognized type tag. 1.4216 + U_ASSERT(FALSE); 1.4217 + } 1.4218 + 1.4219 + if (U_FAILURE(status)) { 1.4220 + isMatch = FALSE; 1.4221 + break; 1.4222 + } 1.4223 + } 1.4224 + 1.4225 +breakFromLoop: 1.4226 + fMatch = isMatch; 1.4227 + if (isMatch) { 1.4228 + fLastMatchEnd = fMatchEnd; 1.4229 + fMatchStart = startIdx; 1.4230 + fMatchEnd = fp->fInputIdx; 1.4231 + if (fTraceDebug) { 1.4232 + REGEX_RUN_DEBUG_PRINTF(("Match. start=%ld end=%ld\n\n", fMatchStart, fMatchEnd)); 1.4233 + } 1.4234 + } 1.4235 + else 1.4236 + { 1.4237 + if (fTraceDebug) { 1.4238 + REGEX_RUN_DEBUG_PRINTF(("No match\n\n")); 1.4239 + } 1.4240 + } 1.4241 + 1.4242 + fFrame = fp; // The active stack frame when the engine stopped. 1.4243 + // Contains the capture group results that we need to 1.4244 + // access later. 1.4245 + return; 1.4246 +} 1.4247 + 1.4248 + 1.4249 +//-------------------------------------------------------------------------------- 1.4250 +// 1.4251 +// MatchChunkAt This is the actual matching engine. Like MatchAt, but with the 1.4252 +// assumption that the entire string is available in the UText's 1.4253 +// chunk buffer. 
For now, that means we can use int32_t indexes, 1.4254 +// except for anything that needs to be saved (like group starts 1.4255 +// and ends). 1.4256 +// 1.4257 +// startIdx: begin matching at this index. 1.4258 +// toEnd: if true, match must extend to end of the input region 1.4259 +// 1.4260 +//-------------------------------------------------------------------------------- 1.4261 +void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status) { 1.4262 + UBool isMatch = FALSE; // True if the we have a match. 1.4263 + 1.4264 + int32_t backSearchIndex = INT32_MAX; // used after greedy single-character matches for searching backwards 1.4265 + 1.4266 + int32_t op; // Operation from the compiled pattern, split into 1.4267 + int32_t opType; // the opcode 1.4268 + int32_t opValue; // and the operand value. 1.4269 + 1.4270 +#ifdef REGEX_RUN_DEBUG 1.4271 + if (fTraceDebug) 1.4272 + { 1.4273 + printf("MatchAt(startIdx=%d)\n", startIdx); 1.4274 + printf("Original Pattern: "); 1.4275 + UChar32 c = utext_next32From(fPattern->fPattern, 0); 1.4276 + while (c != U_SENTINEL) { 1.4277 + if (c<32 || c>256) { 1.4278 + c = '.'; 1.4279 + } 1.4280 + REGEX_DUMP_DEBUG_PRINTF(("%c", c)); 1.4281 + 1.4282 + c = UTEXT_NEXT32(fPattern->fPattern); 1.4283 + } 1.4284 + printf("\n"); 1.4285 + printf("Input String: "); 1.4286 + c = utext_next32From(fInputText, 0); 1.4287 + while (c != U_SENTINEL) { 1.4288 + if (c<32 || c>256) { 1.4289 + c = '.'; 1.4290 + } 1.4291 + printf("%c", c); 1.4292 + 1.4293 + c = UTEXT_NEXT32(fInputText); 1.4294 + } 1.4295 + printf("\n"); 1.4296 + printf("\n"); 1.4297 + } 1.4298 +#endif 1.4299 + 1.4300 + if (U_FAILURE(status)) { 1.4301 + return; 1.4302 + } 1.4303 + 1.4304 + // Cache frequently referenced items from the compiled pattern 1.4305 + // 1.4306 + int64_t *pat = fPattern->fCompiledPat->getBuffer(); 1.4307 + 1.4308 + const UChar *litText = fPattern->fLiteralText.getBuffer(); 1.4309 + UVector *sets = fPattern->fSets; 1.4310 + 1.4311 + const UChar
*inputBuf = fInputText->chunkContents; 1.4312 + 1.4313 + fFrameSize = fPattern->fFrameSize; 1.4314 + REStackFrame *fp = resetStack(); 1.4315 + 1.4316 + fp->fPatIdx = 0; 1.4317 + fp->fInputIdx = startIdx; 1.4318 + 1.4319 + // Zero out the pattern's static data 1.4320 + int32_t i; 1.4321 + for (i = 0; i<fPattern->fDataSize; i++) { 1.4322 + fData[i] = 0; 1.4323 + } 1.4324 + 1.4325 + // 1.4326 + // Main loop for interpreting the compiled pattern. 1.4327 + // One iteration of the loop per pattern operation performed. 1.4328 + // 1.4329 + for (;;) { 1.4330 +#if 0 1.4331 + if (_heapchk() != _HEAPOK) { 1.4332 + fprintf(stderr, "Heap Trouble\n"); 1.4333 + } 1.4334 +#endif 1.4335 + 1.4336 + op = (int32_t)pat[fp->fPatIdx]; 1.4337 + opType = URX_TYPE(op); 1.4338 + opValue = URX_VAL(op); 1.4339 +#ifdef REGEX_RUN_DEBUG 1.4340 + if (fTraceDebug) { 1.4341 + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 1.4342 + printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp->fInputIdx, 1.4343 + UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit); 1.4344 + fPattern->dumpOp(fp->fPatIdx); 1.4345 + } 1.4346 +#endif 1.4347 + fp->fPatIdx++; 1.4348 + 1.4349 + switch (opType) { 1.4350 + 1.4351 + 1.4352 + case URX_NOP: 1.4353 + break; 1.4354 + 1.4355 + 1.4356 + case URX_BACKTRACK: 1.4357 + // Force a backtrack. In some circumstances, the pattern compiler 1.4358 + // will notice that the pattern can't possibly match anything, and will 1.4359 + // emit one of these at that point. 
1.4360 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.4361 + break; 1.4362 + 1.4363 + 1.4364 + case URX_ONECHAR: 1.4365 + if (fp->fInputIdx < fActiveLimit) { 1.4366 + UChar32 c; 1.4367 + U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 1.4368 + if (c == opValue) { 1.4369 + break; 1.4370 + } 1.4371 + } else { 1.4372 + fHitEnd = TRUE; 1.4373 + } 1.4374 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.4375 + break; 1.4376 + 1.4377 + 1.4378 + case URX_STRING: 1.4379 + { 1.4380 + // Test input against a literal string. 1.4381 + // Strings require two slots in the compiled pattern, one for the 1.4382 + // offset to the string text, and one for the length. 1.4383 + int32_t stringStartIdx = opValue; 1.4384 + int32_t stringLen; 1.4385 + 1.4386 + op = (int32_t)pat[fp->fPatIdx]; // Fetch the second operand 1.4387 + fp->fPatIdx++; 1.4388 + opType = URX_TYPE(op); 1.4389 + stringLen = URX_VAL(op); 1.4390 + U_ASSERT(opType == URX_STRING_LEN); 1.4391 + U_ASSERT(stringLen >= 2); 1.4392 + 1.4393 + const UChar * pInp = inputBuf + fp->fInputIdx; 1.4394 + const UChar * pInpLimit = inputBuf + fActiveLimit; 1.4395 + const UChar * pPat = litText+stringStartIdx; 1.4396 + const UChar * pEnd = pInp + stringLen; 1.4397 + UBool success = TRUE; 1.4398 + while (pInp < pEnd) { 1.4399 + if (pInp >= pInpLimit) { 1.4400 + fHitEnd = TRUE; 1.4401 + success = FALSE; 1.4402 + break; 1.4403 + } 1.4404 + if (*pInp++ != *pPat++) { 1.4405 + success = FALSE; 1.4406 + break; 1.4407 + } 1.4408 + } 1.4409 + 1.4410 + if (success) { 1.4411 + fp->fInputIdx += stringLen; 1.4412 + } else { 1.4413 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.4414 + } 1.4415 + } 1.4416 + break; 1.4417 + 1.4418 + 1.4419 + case URX_STATE_SAVE: 1.4420 + fp = StateSave(fp, opValue, status); 1.4421 + break; 1.4422 + 1.4423 + 1.4424 + case URX_END: 1.4425 + // The match loop will exit via this path on a successful match, 1.4426 + // when we reach the end of the pattern. 
1.4427 + if (toEnd && fp->fInputIdx != fActiveLimit) { 1.4428 + // The pattern matched, but not to the end of input. Try some more. 1.4429 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.4430 + break; 1.4431 + } 1.4432 + isMatch = TRUE; 1.4433 + goto breakFromLoop; 1.4434 + 1.4435 + // Start and End Capture stack frame variables are laid out out like this: 1.4436 + // fp->fExtra[opValue] - The start of a completed capture group 1.4437 + // opValue+1 - The end of a completed capture group 1.4438 + // opValue+2 - the start of a capture group whose end 1.4439 + // has not yet been reached (and might not ever be). 1.4440 + case URX_START_CAPTURE: 1.4441 + U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); 1.4442 + fp->fExtra[opValue+2] = fp->fInputIdx; 1.4443 + break; 1.4444 + 1.4445 + 1.4446 + case URX_END_CAPTURE: 1.4447 + U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); 1.4448 + U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set. 1.4449 + fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real. 1.4450 + fp->fExtra[opValue+1] = fp->fInputIdx; // End position 1.4451 + U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]); 1.4452 + break; 1.4453 + 1.4454 + 1.4455 + case URX_DOLLAR: // $, test for End of line 1.4456 + // or for position before new line at end of input 1.4457 + if (fp->fInputIdx < fAnchorLimit-2) { 1.4458 + // We are no where near the end of input. Fail. 1.4459 + // This is the common case. Keep it first. 1.4460 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.4461 + break; 1.4462 + } 1.4463 + if (fp->fInputIdx >= fAnchorLimit) { 1.4464 + // We really are at the end of input. Success. 1.4465 + fHitEnd = TRUE; 1.4466 + fRequireEnd = TRUE; 1.4467 + break; 1.4468 + } 1.4469 + 1.4470 + // If we are positioned just before a new-line that is located at the 1.4471 + // end of input, succeed. 
1.4472 + if (fp->fInputIdx == fAnchorLimit-1) { 1.4473 + UChar32 c; 1.4474 + U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c); 1.4475 + 1.4476 + if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) { 1.4477 + if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) { 1.4478 + // At new-line at end of input. Success 1.4479 + fHitEnd = TRUE; 1.4480 + fRequireEnd = TRUE; 1.4481 + break; 1.4482 + } 1.4483 + } 1.4484 + } else if (fp->fInputIdx == fAnchorLimit-2 && 1.4485 + inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a) { 1.4486 + fHitEnd = TRUE; 1.4487 + fRequireEnd = TRUE; 1.4488 + break; // At CR/LF at end of input. Success 1.4489 + } 1.4490 + 1.4491 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.4492 + 1.4493 + break; 1.4494 + 1.4495 + 1.4496 + case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode. 1.4497 + if (fp->fInputIdx >= fAnchorLimit-1) { 1.4498 + // Either at the last character of input, or off the end. 1.4499 + if (fp->fInputIdx == fAnchorLimit-1) { 1.4500 + // At last char of input. Success if it's a new line. 1.4501 + if (inputBuf[fp->fInputIdx] == 0x0a) { 1.4502 + fHitEnd = TRUE; 1.4503 + fRequireEnd = TRUE; 1.4504 + break; 1.4505 + } 1.4506 + } else { 1.4507 + // Off the end of input. Success. 1.4508 + fHitEnd = TRUE; 1.4509 + fRequireEnd = TRUE; 1.4510 + break; 1.4511 + } 1.4512 + } 1.4513 + 1.4514 + // Not at end of input. Back-track out. 1.4515 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.4516 + break; 1.4517 + 1.4518 + 1.4519 + case URX_DOLLAR_M: // $, test for End of line in multi-line mode 1.4520 + { 1.4521 + if (fp->fInputIdx >= fAnchorLimit) { 1.4522 + // We really are at the end of input. Success. 1.4523 + fHitEnd = TRUE; 1.4524 + fRequireEnd = TRUE; 1.4525 + break; 1.4526 + } 1.4527 + // If we are positioned just before a new-line, succeed. 1.4528 + // It makes no difference where the new-line is within the input. 
1.4529 + UChar32 c = inputBuf[fp->fInputIdx]; 1.4530 + if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) { 1.4531 + // At a line end, except for the odd chance of being in the middle of a CR/LF sequence 1.4532 + // In multi-line mode, hitting a new-line just before the end of input does not 1.4533 + // set the hitEnd or requireEnd flags 1.4534 + if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) { 1.4535 + break; 1.4536 + } 1.4537 + } 1.4538 + // not at a new line. Fail. 1.4539 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.4540 + } 1.4541 + break; 1.4542 + 1.4543 + 1.4544 + case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode 1.4545 + { 1.4546 + if (fp->fInputIdx >= fAnchorLimit) { 1.4547 + // We really are at the end of input. Success. 1.4548 + fHitEnd = TRUE; 1.4549 + fRequireEnd = TRUE; // Java set requireEnd in this case, even though 1.4550 + break; // adding a new-line would not lose the match. 1.4551 + } 1.4552 + // If we are not positioned just before a new-line, the test fails; backtrack out. 1.4553 + // It makes no difference where the new-line is within the input. 1.4554 + if (inputBuf[fp->fInputIdx] != 0x0a) { 1.4555 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.4556 + } 1.4557 + } 1.4558 + break; 1.4559 + 1.4560 + 1.4561 + case URX_CARET: // ^, test for start of line 1.4562 + if (fp->fInputIdx != fAnchorStart) { 1.4563 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.4564 + } 1.4565 + break; 1.4566 + 1.4567 + 1.4568 + case URX_CARET_M: // ^, test for start of line in mulit-line mode 1.4569 + { 1.4570 + if (fp->fInputIdx == fAnchorStart) { 1.4571 + // We are at the start input. Success. 
1.4572 + break; 1.4573 + } 1.4574 + // Check whether character just before the current pos is a new-line 1.4575 + // unless we are at the end of input 1.4576 + UChar c = inputBuf[fp->fInputIdx - 1]; 1.4577 + if ((fp->fInputIdx < fAnchorLimit) && 1.4578 + ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) { 1.4579 + // It's a new-line. ^ is true. Success. 1.4580 + // TODO: what should be done with positions between a CR and LF? 1.4581 + break; 1.4582 + } 1.4583 + // Not at the start of a line. Fail. 1.4584 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.4585 + } 1.4586 + break; 1.4587 + 1.4588 + 1.4589 + case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode 1.4590 + { 1.4591 + U_ASSERT(fp->fInputIdx >= fAnchorStart); 1.4592 + if (fp->fInputIdx <= fAnchorStart) { 1.4593 + // We are at the start input. Success. 1.4594 + break; 1.4595 + } 1.4596 + // Check whether character just before the current pos is a new-line 1.4597 + U_ASSERT(fp->fInputIdx <= fAnchorLimit); 1.4598 + UChar c = inputBuf[fp->fInputIdx - 1]; 1.4599 + if (c != 0x0a) { 1.4600 + // Not at the start of a line. Back-track out. 
1.4601 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.4602 + } 1.4603 + } 1.4604 + break; 1.4605 + 1.4606 + case URX_BACKSLASH_B: // Test for word boundaries 1.4607 + { 1.4608 + UBool success = isChunkWordBoundary((int32_t)fp->fInputIdx); 1.4609 + success ^= (UBool)(opValue != 0); // flip sense for \B 1.4610 + if (!success) { 1.4611 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.4612 + } 1.4613 + } 1.4614 + break; 1.4615 + 1.4616 + 1.4617 + case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style 1.4618 + { 1.4619 + UBool success = isUWordBoundary(fp->fInputIdx); 1.4620 + success ^= (UBool)(opValue != 0); // flip sense for \B 1.4621 + if (!success) { 1.4622 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.4623 + } 1.4624 + } 1.4625 + break; 1.4626 + 1.4627 + 1.4628 + case URX_BACKSLASH_D: // Test for decimal digit 1.4629 + { 1.4630 + if (fp->fInputIdx >= fActiveLimit) { 1.4631 + fHitEnd = TRUE; 1.4632 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.4633 + break; 1.4634 + } 1.4635 + 1.4636 + UChar32 c; 1.4637 + U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 1.4638 + int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster. 1.4639 + UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER); 1.4640 + success ^= (UBool)(opValue != 0); // flip sense for \D 1.4641 + if (!success) { 1.4642 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.4643 + } 1.4644 + } 1.4645 + break; 1.4646 + 1.4647 + 1.4648 + case URX_BACKSLASH_G: // Test for position at end of previous match 1.4649 + if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) { 1.4650 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.4651 + } 1.4652 + break; 1.4653 + 1.4654 + 1.4655 + case URX_BACKSLASH_X: 1.4656 + // Match a Grapheme, as defined by Unicode TR 29. 1.4657 + // Differs slightly from Perl, which consumes combining marks independently 1.4658 + // of context. 
1.4659 + { 1.4660 + 1.4661 + // Fail if at end of input 1.4662 + if (fp->fInputIdx >= fActiveLimit) { 1.4663 + fHitEnd = TRUE; 1.4664 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.4665 + break; 1.4666 + } 1.4667 + 1.4668 + // Examine (and consume) the current char. 1.4669 + // Dispatch into a little state machine, based on the char. 1.4670 + UChar32 c; 1.4671 + U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 1.4672 + UnicodeSet **sets = fPattern->fStaticSets; 1.4673 + if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend; 1.4674 + if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control; 1.4675 + if (sets[URX_GC_L]->contains(c)) goto GC_L; 1.4676 + if (sets[URX_GC_LV]->contains(c)) goto GC_V; 1.4677 + if (sets[URX_GC_LVT]->contains(c)) goto GC_T; 1.4678 + if (sets[URX_GC_V]->contains(c)) goto GC_V; 1.4679 + if (sets[URX_GC_T]->contains(c)) goto GC_T; 1.4680 + goto GC_Extend; 1.4681 + 1.4682 + 1.4683 + 1.4684 +GC_L: 1.4685 + if (fp->fInputIdx >= fActiveLimit) goto GC_Done; 1.4686 + U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 1.4687 + if (sets[URX_GC_L]->contains(c)) goto GC_L; 1.4688 + if (sets[URX_GC_LV]->contains(c)) goto GC_V; 1.4689 + if (sets[URX_GC_LVT]->contains(c)) goto GC_T; 1.4690 + if (sets[URX_GC_V]->contains(c)) goto GC_V; 1.4691 + U16_PREV(inputBuf, 0, fp->fInputIdx, c); 1.4692 + goto GC_Extend; 1.4693 + 1.4694 +GC_V: 1.4695 + if (fp->fInputIdx >= fActiveLimit) goto GC_Done; 1.4696 + U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 1.4697 + if (sets[URX_GC_V]->contains(c)) goto GC_V; 1.4698 + if (sets[URX_GC_T]->contains(c)) goto GC_T; 1.4699 + U16_PREV(inputBuf, 0, fp->fInputIdx, c); 1.4700 + goto GC_Extend; 1.4701 + 1.4702 +GC_T: 1.4703 + if (fp->fInputIdx >= fActiveLimit) goto GC_Done; 1.4704 + U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 1.4705 + if (sets[URX_GC_T]->contains(c)) goto GC_T; 1.4706 + U16_PREV(inputBuf, 0, fp->fInputIdx, c); 1.4707 + goto GC_Extend; 1.4708 + 1.4709 +GC_Extend: 1.4710 + // Combining 
characters are consumed here 1.4711 + for (;;) { 1.4712 + if (fp->fInputIdx >= fActiveLimit) { 1.4713 + break; 1.4714 + } 1.4715 + U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 1.4716 + if (sets[URX_GC_EXTEND]->contains(c) == FALSE) { 1.4717 + U16_BACK_1(inputBuf, 0, fp->fInputIdx); 1.4718 + break; 1.4719 + } 1.4720 + } 1.4721 + goto GC_Done; 1.4722 + 1.4723 +GC_Control: 1.4724 + // Most control chars stand alone (don't combine with combining chars), 1.4725 + // except for that CR/LF sequence is a single grapheme cluster. 1.4726 + if (c == 0x0d && fp->fInputIdx < fActiveLimit && inputBuf[fp->fInputIdx] == 0x0a) { 1.4727 + fp->fInputIdx++; 1.4728 + } 1.4729 + 1.4730 +GC_Done: 1.4731 + if (fp->fInputIdx >= fActiveLimit) { 1.4732 + fHitEnd = TRUE; 1.4733 + } 1.4734 + break; 1.4735 + } 1.4736 + 1.4737 + 1.4738 + 1.4739 + 1.4740 + case URX_BACKSLASH_Z: // Test for end of Input 1.4741 + if (fp->fInputIdx < fAnchorLimit) { 1.4742 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.4743 + } else { 1.4744 + fHitEnd = TRUE; 1.4745 + fRequireEnd = TRUE; 1.4746 + } 1.4747 + break; 1.4748 + 1.4749 + 1.4750 + 1.4751 + case URX_STATIC_SETREF: 1.4752 + { 1.4753 + // Test input character against one of the predefined sets 1.4754 + // (Word Characters, for example) 1.4755 + // The high bit of the op value is a flag for the match polarity. 1.4756 + // 0: success if input char is in set. 1.4757 + // 1: success if input char is not in set. 
1.4758 + if (fp->fInputIdx >= fActiveLimit) { 1.4759 + fHitEnd = TRUE; 1.4760 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.4761 + break; 1.4762 + } 1.4763 + 1.4764 + UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET); 1.4765 + opValue &= ~URX_NEG_SET; 1.4766 + U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); 1.4767 + 1.4768 + UChar32 c; 1.4769 + U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 1.4770 + if (c < 256) { 1.4771 + Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; 1.4772 + if (s8->contains(c)) { 1.4773 + success = !success; 1.4774 + } 1.4775 + } else { 1.4776 + const UnicodeSet *s = fPattern->fStaticSets[opValue]; 1.4777 + if (s->contains(c)) { 1.4778 + success = !success; 1.4779 + } 1.4780 + } 1.4781 + if (!success) { 1.4782 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.4783 + } 1.4784 + } 1.4785 + break; 1.4786 + 1.4787 + 1.4788 + case URX_STAT_SETREF_N: 1.4789 + { 1.4790 + // Test input character for NOT being a member of one of 1.4791 + // the predefined sets (Word Characters, for example) 1.4792 + if (fp->fInputIdx >= fActiveLimit) { 1.4793 + fHitEnd = TRUE; 1.4794 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.4795 + break; 1.4796 + } 1.4797 + 1.4798 + U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); 1.4799 + 1.4800 + UChar32 c; 1.4801 + U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 1.4802 + if (c < 256) { 1.4803 + Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; 1.4804 + if (s8->contains(c) == FALSE) { 1.4805 + break; 1.4806 + } 1.4807 + } else { 1.4808 + const UnicodeSet *s = fPattern->fStaticSets[opValue]; 1.4809 + if (s->contains(c) == FALSE) { 1.4810 + break; 1.4811 + } 1.4812 + } 1.4813 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.4814 + } 1.4815 + break; 1.4816 + 1.4817 + 1.4818 + case URX_SETREF: 1.4819 + { 1.4820 + if (fp->fInputIdx >= fActiveLimit) { 1.4821 + fHitEnd = TRUE; 1.4822 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.4823 + break; 1.4824 + } 1.4825 + 1.4826 + 
U_ASSERT(opValue > 0 && opValue < sets->size()); 1.4827 + 1.4828 + // There is input left. Pick up one char and test it for set membership. 1.4829 + UChar32 c; 1.4830 + U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 1.4831 + if (c<256) { 1.4832 + Regex8BitSet *s8 = &fPattern->fSets8[opValue]; 1.4833 + if (s8->contains(c)) { 1.4834 + // The character is in the set. A Match. 1.4835 + break; 1.4836 + } 1.4837 + } else { 1.4838 + UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); 1.4839 + if (s->contains(c)) { 1.4840 + // The character is in the set. A Match. 1.4841 + break; 1.4842 + } 1.4843 + } 1.4844 + 1.4845 + // the character wasn't in the set. 1.4846 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.4847 + } 1.4848 + break; 1.4849 + 1.4850 + 1.4851 + case URX_DOTANY: 1.4852 + { 1.4853 + // . matches anything, but stops at end-of-line. 1.4854 + if (fp->fInputIdx >= fActiveLimit) { 1.4855 + // At end of input. Match failed. Backtrack out. 1.4856 + fHitEnd = TRUE; 1.4857 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.4858 + break; 1.4859 + } 1.4860 + 1.4861 + // There is input left. Advance over one char, unless we've hit end-of-line 1.4862 + UChar32 c; 1.4863 + U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 1.4864 + if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible 1.4865 + ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) { 1.4866 + // End of line in normal mode. . does not match. 1.4867 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.4868 + break; 1.4869 + } 1.4870 + } 1.4871 + break; 1.4872 + 1.4873 + 1.4874 + case URX_DOTANY_ALL: 1.4875 + { 1.4876 + // . in dot-matches-all (including new lines) mode 1.4877 + if (fp->fInputIdx >= fActiveLimit) { 1.4878 + // At end of input. Match failed. Backtrack out. 1.4879 + fHitEnd = TRUE; 1.4880 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.4881 + break; 1.4882 + } 1.4883 + 1.4884 + // There is input left. 
Advance over one char, except if we are 1.4885 + // at a cr/lf, advance over both of them. 1.4886 + UChar32 c; 1.4887 + U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 1.4888 + if (c==0x0d && fp->fInputIdx < fActiveLimit) { 1.4889 + // In the case of a CR/LF, we need to advance over both. 1.4890 + if (inputBuf[fp->fInputIdx] == 0x0a) { 1.4891 + U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit); 1.4892 + } 1.4893 + } 1.4894 + } 1.4895 + break; 1.4896 + 1.4897 + 1.4898 + case URX_DOTANY_UNIX: 1.4899 + { 1.4900 + // '.' operator, matches all, but stops at end-of-line. 1.4901 + // UNIX_LINES mode, so 0x0a is the only recognized line ending. 1.4902 + if (fp->fInputIdx >= fActiveLimit) { 1.4903 + // At end of input. Match failed. Backtrack out. 1.4904 + fHitEnd = TRUE; 1.4905 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.4906 + break; 1.4907 + } 1.4908 + 1.4909 + // There is input left. Advance over one char, unless we've hit end-of-line 1.4910 + UChar32 c; 1.4911 + U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 1.4912 + if (c == 0x0a) { 1.4913 + // End of line in normal mode. '.' does not match the \n 1.4914 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.4915 + } 1.4916 + } 1.4917 + break; 1.4918 + 1.4919 + 1.4920 + case URX_JMP: 1.4921 + fp->fPatIdx = opValue; 1.4922 + break; 1.4923 + 1.4924 + case URX_FAIL: 1.4925 + isMatch = FALSE; 1.4926 + goto breakFromLoop; 1.4927 + 1.4928 + case URX_JMP_SAV: 1.4929 + U_ASSERT(opValue < fPattern->fCompiledPat->size()); 1.4930 + fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current 1.4931 + fp->fPatIdx = opValue; // Then JMP. 1.4932 + break; 1.4933 + 1.4934 + case URX_JMP_SAV_X: 1.4935 + // This opcode is used with (x)+, when x can match a zero length string. 1.4936 + // Same as JMP_SAV, except conditional on the match having made forward progress. 
1.4937 + // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the 1.4938 + // data address of the input position at the start of the loop. 1.4939 + { 1.4940 + U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size()); 1.4941 + int32_t stoOp = (int32_t)pat[opValue-1]; 1.4942 + U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC); 1.4943 + int32_t frameLoc = URX_VAL(stoOp); 1.4944 + U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize); 1.4945 + int32_t prevInputIdx = (int32_t)fp->fExtra[frameLoc]; 1.4946 + U_ASSERT(prevInputIdx <= fp->fInputIdx); 1.4947 + if (prevInputIdx < fp->fInputIdx) { 1.4948 + // The match did make progress. Repeat the loop. 1.4949 + fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current 1.4950 + fp->fPatIdx = opValue; 1.4951 + fp->fExtra[frameLoc] = fp->fInputIdx; 1.4952 + } 1.4953 + // If the input position did not advance, we do nothing here, 1.4954 + // execution will fall out of the loop. 1.4955 + } 1.4956 + break; 1.4957 + 1.4958 + case URX_CTR_INIT: 1.4959 + { 1.4960 + U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); 1.4961 + fp->fExtra[opValue] = 0; // Set the loop counter variable to zero 1.4962 + 1.4963 + // Pick up the three extra operands that CTR_INIT has, and 1.4964 + // skip the pattern location counter past 1.4965 + int32_t instrOperandLoc = (int32_t)fp->fPatIdx; 1.4966 + fp->fPatIdx += 3; 1.4967 + int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); 1.4968 + int32_t minCount = (int32_t)pat[instrOperandLoc+1]; 1.4969 + int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; 1.4970 + U_ASSERT(minCount>=0); 1.4971 + U_ASSERT(maxCount>=minCount || maxCount==-1); 1.4972 + U_ASSERT(loopLoc>=fp->fPatIdx); 1.4973 + 1.4974 + if (minCount == 0) { 1.4975 + fp = StateSave(fp, loopLoc+1, status); 1.4976 + } 1.4977 + if (maxCount == -1) { 1.4978 + fp->fExtra[opValue+1] = fp->fInputIdx; // For loop breaking. 
1.4979 + } else if (maxCount == 0) { 1.4980 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.4981 + } 1.4982 + } 1.4983 + break; 1.4984 + 1.4985 + case URX_CTR_LOOP: 1.4986 + { 1.4987 + U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); 1.4988 + int32_t initOp = (int32_t)pat[opValue]; 1.4989 + U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT); 1.4990 + int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; 1.4991 + int32_t minCount = (int32_t)pat[opValue+2]; 1.4992 + int32_t maxCount = (int32_t)pat[opValue+3]; 1.4993 + (*pCounter)++; 1.4994 + if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) { 1.4995 + U_ASSERT(*pCounter == maxCount); 1.4996 + break; 1.4997 + } 1.4998 + if (*pCounter >= minCount) { 1.4999 + if (maxCount == -1) { 1.5000 + // Loop has no hard upper bound. 1.5001 + // Check that it is progressing through the input, break if it is not. 1.5002 + int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1]; 1.5003 + if (fp->fInputIdx == *pLastInputIdx) { 1.5004 + break; 1.5005 + } else { 1.5006 + *pLastInputIdx = fp->fInputIdx; 1.5007 + } 1.5008 + } 1.5009 + fp = StateSave(fp, fp->fPatIdx, status); 1.5010 + } 1.5011 + fp->fPatIdx = opValue + 4; // Loop back. 
1.5012 + } 1.5013 + break; 1.5014 + 1.5015 + case URX_CTR_INIT_NG: 1.5016 + { 1.5017 + // Initialize a non-greedy loop 1.5018 + U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); 1.5019 + fp->fExtra[opValue] = 0; // Set the loop counter variable to zero 1.5020 + 1.5021 + // Pick up the three extra operands that CTR_INIT_NG has, and 1.5022 + // skip the pattern location counter past 1.5023 + int32_t instrOperandLoc = (int32_t)fp->fPatIdx; 1.5024 + fp->fPatIdx += 3; 1.5025 + int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); 1.5026 + int32_t minCount = (int32_t)pat[instrOperandLoc+1]; 1.5027 + int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; 1.5028 + U_ASSERT(minCount>=0); 1.5029 + U_ASSERT(maxCount>=minCount || maxCount==-1); 1.5030 + U_ASSERT(loopLoc>fp->fPatIdx); 1.5031 + if (maxCount == -1) { 1.5032 + fp->fExtra[opValue+1] = fp->fInputIdx; // Save initial input index for loop breaking. 1.5033 + } 1.5034 + 1.5035 + if (minCount == 0) { 1.5036 + if (maxCount != 0) { 1.5037 + fp = StateSave(fp, fp->fPatIdx, status); 1.5038 + } 1.5039 + fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block 1.5040 + } 1.5041 + } 1.5042 + break; 1.5043 + 1.5044 + case URX_CTR_LOOP_NG: 1.5045 + { 1.5046 + // Non-greedy {min, max} loops 1.5047 + U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); 1.5048 + int32_t initOp = (int32_t)pat[opValue]; 1.5049 + U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG); 1.5050 + int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; 1.5051 + int32_t minCount = (int32_t)pat[opValue+2]; 1.5052 + int32_t maxCount = (int32_t)pat[opValue+3]; 1.5053 + 1.5054 + (*pCounter)++; 1.5055 + if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) { 1.5056 + // The loop has matched the maximum permitted number of times. 1.5057 + // Break out of here with no action. Matching will 1.5058 + // continue with the following pattern. 
1.5059 + U_ASSERT(*pCounter == maxCount); 1.5060 + break; 1.5061 + } 1.5062 + 1.5063 + if (*pCounter < minCount) { 1.5064 + // We haven't met the minimum number of matches yet. 1.5065 + // Loop back for another one. 1.5066 + fp->fPatIdx = opValue + 4; // Loop back. 1.5067 + } else { 1.5068 + // We do have the minimum number of matches. 1.5069 + 1.5070 + // If there is no upper bound on the loop iterations, check that the input index 1.5071 + // is progressing, and stop the loop if it is not. 1.5072 + if (maxCount == -1) { 1.5073 + int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1]; 1.5074 + if (fp->fInputIdx == *pLastInputIdx) { 1.5075 + break; 1.5076 + } 1.5077 + *pLastInputIdx = fp->fInputIdx; 1.5078 + } 1.5079 + 1.5080 + // Loop Continuation: we will fall into the pattern following the loop 1.5081 + // (non-greedy, don't execute loop body first), but first do 1.5082 + // a state save to the top of the loop, so that a match failure 1.5083 + // in the following pattern will try another iteration of the loop. 
1.5084 + fp = StateSave(fp, opValue + 4, status); 1.5085 + } 1.5086 + } 1.5087 + break; 1.5088 + 1.5089 + case URX_STO_SP: 1.5090 + U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); 1.5091 + fData[opValue] = fStack->size(); 1.5092 + break; 1.5093 + 1.5094 + case URX_LD_SP: 1.5095 + { 1.5096 + U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); 1.5097 + int32_t newStackSize = (int32_t)fData[opValue]; 1.5098 + U_ASSERT(newStackSize <= fStack->size()); 1.5099 + int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize; 1.5100 + if (newFP == (int64_t *)fp) { 1.5101 + break; 1.5102 + } 1.5103 + int32_t i; 1.5104 + for (i=0; i<fFrameSize; i++) { 1.5105 + newFP[i] = ((int64_t *)fp)[i]; 1.5106 + } 1.5107 + fp = (REStackFrame *)newFP; 1.5108 + fStack->setSize(newStackSize); 1.5109 + } 1.5110 + break; 1.5111 + 1.5112 + case URX_BACKREF: 1.5113 + { 1.5114 + U_ASSERT(opValue < fFrameSize); 1.5115 + int64_t groupStartIdx = fp->fExtra[opValue]; 1.5116 + int64_t groupEndIdx = fp->fExtra[opValue+1]; 1.5117 + U_ASSERT(groupStartIdx <= groupEndIdx); 1.5118 + int64_t inputIndex = fp->fInputIdx; 1.5119 + if (groupStartIdx < 0) { 1.5120 + // This capture group has not participated in the match thus far, 1.5121 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match. 
1.5122 + break; 1.5123 + } 1.5124 + UBool success = TRUE; 1.5125 + for (int64_t groupIndex = groupStartIdx; groupIndex < groupEndIdx; ++groupIndex,++inputIndex) { 1.5126 + if (inputIndex >= fActiveLimit) { 1.5127 + success = FALSE; 1.5128 + fHitEnd = TRUE; 1.5129 + break; 1.5130 + } 1.5131 + if (inputBuf[groupIndex] != inputBuf[inputIndex]) { 1.5132 + success = FALSE; 1.5133 + break; 1.5134 + } 1.5135 + } 1.5136 + if (success) { 1.5137 + fp->fInputIdx = inputIndex; 1.5138 + } else { 1.5139 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.5140 + } 1.5141 + } 1.5142 + break; 1.5143 + 1.5144 + case URX_BACKREF_I: 1.5145 + { 1.5146 + U_ASSERT(opValue < fFrameSize); 1.5147 + int64_t groupStartIdx = fp->fExtra[opValue]; 1.5148 + int64_t groupEndIdx = fp->fExtra[opValue+1]; 1.5149 + U_ASSERT(groupStartIdx <= groupEndIdx); 1.5150 + if (groupStartIdx < 0) { 1.5151 + // This capture group has not participated in the match thus far, 1.5152 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match. 1.5153 + break; 1.5154 + } 1.5155 + CaseFoldingUCharIterator captureGroupItr(inputBuf, groupStartIdx, groupEndIdx); 1.5156 + CaseFoldingUCharIterator inputItr(inputBuf, fp->fInputIdx, fActiveLimit); 1.5157 + 1.5158 + // Note: if the capture group match was of an empty string the backref 1.5159 + // match succeeds. Verified by testing: Perl matches succeed 1.5160 + // in this case, so we do too. 
1.5161 + 1.5162 + UBool success = TRUE; 1.5163 + for (;;) { 1.5164 + UChar32 captureGroupChar = captureGroupItr.next(); 1.5165 + if (captureGroupChar == U_SENTINEL) { 1.5166 + success = TRUE; 1.5167 + break; 1.5168 + } 1.5169 + UChar32 inputChar = inputItr.next(); 1.5170 + if (inputChar == U_SENTINEL) { 1.5171 + success = FALSE; 1.5172 + fHitEnd = TRUE; 1.5173 + break; 1.5174 + } 1.5175 + if (inputChar != captureGroupChar) { 1.5176 + success = FALSE; 1.5177 + break; 1.5178 + } 1.5179 + } 1.5180 + 1.5181 + if (success && inputItr.inExpansion()) { 1.5182 + // We otained a match by consuming part of a string obtained from 1.5183 + // case-folding a single code point of the input text. 1.5184 + // This does not count as an overall match. 1.5185 + success = FALSE; 1.5186 + } 1.5187 + 1.5188 + if (success) { 1.5189 + fp->fInputIdx = inputItr.getIndex(); 1.5190 + } else { 1.5191 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.5192 + } 1.5193 + } 1.5194 + break; 1.5195 + 1.5196 + case URX_STO_INP_LOC: 1.5197 + { 1.5198 + U_ASSERT(opValue >= 0 && opValue < fFrameSize); 1.5199 + fp->fExtra[opValue] = fp->fInputIdx; 1.5200 + } 1.5201 + break; 1.5202 + 1.5203 + case URX_JMPX: 1.5204 + { 1.5205 + int32_t instrOperandLoc = (int32_t)fp->fPatIdx; 1.5206 + fp->fPatIdx += 1; 1.5207 + int32_t dataLoc = URX_VAL(pat[instrOperandLoc]); 1.5208 + U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize); 1.5209 + int32_t savedInputIdx = (int32_t)fp->fExtra[dataLoc]; 1.5210 + U_ASSERT(savedInputIdx <= fp->fInputIdx); 1.5211 + if (savedInputIdx < fp->fInputIdx) { 1.5212 + fp->fPatIdx = opValue; // JMP 1.5213 + } else { 1.5214 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop. 1.5215 + } 1.5216 + } 1.5217 + break; 1.5218 + 1.5219 + case URX_LA_START: 1.5220 + { 1.5221 + // Entering a lookahead block. 1.5222 + // Save Stack Ptr, Input Pos. 
1.5223 + U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 1.5224 + fData[opValue] = fStack->size(); 1.5225 + fData[opValue+1] = fp->fInputIdx; 1.5226 + fActiveStart = fLookStart; // Set the match region change for 1.5227 + fActiveLimit = fLookLimit; // transparent bounds. 1.5228 + } 1.5229 + break; 1.5230 + 1.5231 + case URX_LA_END: 1.5232 + { 1.5233 + // Leaving a look-ahead block. 1.5234 + // restore Stack Ptr, Input Pos to positions they had on entry to block. 1.5235 + U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 1.5236 + int32_t stackSize = fStack->size(); 1.5237 + int32_t newStackSize = (int32_t)fData[opValue]; 1.5238 + U_ASSERT(stackSize >= newStackSize); 1.5239 + if (stackSize > newStackSize) { 1.5240 + // Copy the current top frame back to the new (cut back) top frame. 1.5241 + // This makes the capture groups from within the look-ahead 1.5242 + // expression available. 1.5243 + int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize; 1.5244 + int32_t i; 1.5245 + for (i=0; i<fFrameSize; i++) { 1.5246 + newFP[i] = ((int64_t *)fp)[i]; 1.5247 + } 1.5248 + fp = (REStackFrame *)newFP; 1.5249 + fStack->setSize(newStackSize); 1.5250 + } 1.5251 + fp->fInputIdx = fData[opValue+1]; 1.5252 + 1.5253 + // Restore the active region bounds in the input string; they may have 1.5254 + // been changed because of transparent bounds on a Region. 1.5255 + fActiveStart = fRegionStart; 1.5256 + fActiveLimit = fRegionLimit; 1.5257 + } 1.5258 + break; 1.5259 + 1.5260 + case URX_ONECHAR_I: 1.5261 + if (fp->fInputIdx < fActiveLimit) { 1.5262 + UChar32 c; 1.5263 + U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 1.5264 + if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) { 1.5265 + break; 1.5266 + } 1.5267 + } else { 1.5268 + fHitEnd = TRUE; 1.5269 + } 1.5270 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.5271 + break; 1.5272 + 1.5273 + case URX_STRING_I: 1.5274 + // Case-insensitive test input against a literal string. 
1.5275 + // Strings require two slots in the compiled pattern, one for the 1.5276 + // offset to the string text, and one for the length. 1.5277 + // The compiled string has already been case folded. 1.5278 + { 1.5279 + const UChar *patternString = litText + opValue; 1.5280 + 1.5281 + op = (int32_t)pat[fp->fPatIdx]; 1.5282 + fp->fPatIdx++; 1.5283 + opType = URX_TYPE(op); 1.5284 + opValue = URX_VAL(op); 1.5285 + U_ASSERT(opType == URX_STRING_LEN); 1.5286 + int32_t patternStringLen = opValue; // Length of the string from the pattern. 1.5287 + 1.5288 + UChar32 cText; 1.5289 + UChar32 cPattern; 1.5290 + UBool success = TRUE; 1.5291 + int32_t patternStringIdx = 0; 1.5292 + CaseFoldingUCharIterator inputIterator(inputBuf, fp->fInputIdx, fActiveLimit); 1.5293 + while (patternStringIdx < patternStringLen) { 1.5294 + U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern); 1.5295 + cText = inputIterator.next(); 1.5296 + if (cText != cPattern) { 1.5297 + success = FALSE; 1.5298 + if (cText == U_SENTINEL) { 1.5299 + fHitEnd = TRUE; 1.5300 + } 1.5301 + break; 1.5302 + } 1.5303 + } 1.5304 + if (inputIterator.inExpansion()) { 1.5305 + success = FALSE; 1.5306 + } 1.5307 + 1.5308 + if (success) { 1.5309 + fp->fInputIdx = inputIterator.getIndex(); 1.5310 + } else { 1.5311 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.5312 + } 1.5313 + } 1.5314 + break; 1.5315 + 1.5316 + case URX_LB_START: 1.5317 + { 1.5318 + // Entering a look-behind block. 1.5319 + // Save Stack Ptr, Input Pos. 1.5320 + // TODO: implement transparent bounds. Ticket #6067 1.5321 + U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 1.5322 + fData[opValue] = fStack->size(); 1.5323 + fData[opValue+1] = fp->fInputIdx; 1.5324 + // Init the variable containing the start index for attempted matches. 1.5325 + fData[opValue+2] = -1; 1.5326 + // Save input string length, then reset to pin any matches to end at 1.5327 + // the current position. 
1.5328 + fData[opValue+3] = fActiveLimit; 1.5329 + fActiveLimit = fp->fInputIdx; 1.5330 + } 1.5331 + break; 1.5332 + 1.5333 + 1.5334 + case URX_LB_CONT: 1.5335 + { 1.5336 + // Positive Look-Behind, at top of loop checking for matches of LB expression 1.5337 + // at all possible input starting positions. 1.5338 + 1.5339 + // Fetch the min and max possible match lengths. They are the operands 1.5340 + // of this op in the pattern. 1.5341 + int32_t minML = (int32_t)pat[fp->fPatIdx++]; 1.5342 + int32_t maxML = (int32_t)pat[fp->fPatIdx++]; 1.5343 + U_ASSERT(minML <= maxML); 1.5344 + U_ASSERT(minML >= 0); 1.5345 + 1.5346 + // Fetch (from data) the last input index where a match was attempted. 1.5347 + U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 1.5348 + int64_t *lbStartIdx = &fData[opValue+2]; 1.5349 + if (*lbStartIdx < 0) { 1.5350 + // First time through loop. 1.5351 + *lbStartIdx = fp->fInputIdx - minML; 1.5352 + } else { 1.5353 + // 2nd through nth time through the loop. 1.5354 + // Back up start position for match by one. 1.5355 + if (*lbStartIdx == 0) { 1.5356 + (*lbStartIdx)--; 1.5357 + } else { 1.5358 + U16_BACK_1(inputBuf, 0, *lbStartIdx); 1.5359 + } 1.5360 + } 1.5361 + 1.5362 + if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) { 1.5363 + // We have tried all potential match starting points without 1.5364 + // getting a match. Backtrack out, and out of the 1.5365 + // Look Behind altogether. 1.5366 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.5367 + int64_t restoreInputLen = fData[opValue+3]; 1.5368 + U_ASSERT(restoreInputLen >= fActiveLimit); 1.5369 + U_ASSERT(restoreInputLen <= fInputLength); 1.5370 + fActiveLimit = restoreInputLen; 1.5371 + break; 1.5372 + } 1.5373 + 1.5374 + // Save state to this URX_LB_CONT op, so failure to match will repeat the loop. 1.5375 + // (successful match will fall off the end of the loop.) 
1.5376 + fp = StateSave(fp, fp->fPatIdx-3, status); 1.5377 + fp->fInputIdx = *lbStartIdx; 1.5378 + } 1.5379 + break; 1.5380 + 1.5381 + case URX_LB_END: 1.5382 + // End of a look-behind block, after a successful match. 1.5383 + { 1.5384 + U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 1.5385 + if (fp->fInputIdx != fActiveLimit) { 1.5386 + // The look-behind expression matched, but the match did not 1.5387 + // extend all the way to the point that we are looking behind from. 1.5388 + // FAIL out of here, which will take us back to the LB_CONT, which 1.5389 + // will retry the match starting at another position or fail 1.5390 + // the look-behind altogether, whichever is appropriate. 1.5391 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.5392 + break; 1.5393 + } 1.5394 + 1.5395 + // Look-behind match is good. Restore the orignal input string length, 1.5396 + // which had been truncated to pin the end of the lookbehind match to the 1.5397 + // position being looked-behind. 1.5398 + int64_t originalInputLen = fData[opValue+3]; 1.5399 + U_ASSERT(originalInputLen >= fActiveLimit); 1.5400 + U_ASSERT(originalInputLen <= fInputLength); 1.5401 + fActiveLimit = originalInputLen; 1.5402 + } 1.5403 + break; 1.5404 + 1.5405 + 1.5406 + case URX_LBN_CONT: 1.5407 + { 1.5408 + // Negative Look-Behind, at top of loop checking for matches of LB expression 1.5409 + // at all possible input starting positions. 1.5410 + 1.5411 + // Fetch the extra parameters of this op. 1.5412 + int32_t minML = (int32_t)pat[fp->fPatIdx++]; 1.5413 + int32_t maxML = (int32_t)pat[fp->fPatIdx++]; 1.5414 + int32_t continueLoc = (int32_t)pat[fp->fPatIdx++]; 1.5415 + continueLoc = URX_VAL(continueLoc); 1.5416 + U_ASSERT(minML <= maxML); 1.5417 + U_ASSERT(minML >= 0); 1.5418 + U_ASSERT(continueLoc > fp->fPatIdx); 1.5419 + 1.5420 + // Fetch (from data) the last input index where a match was attempted. 
1.5421 + U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 1.5422 + int64_t *lbStartIdx = &fData[opValue+2]; 1.5423 + if (*lbStartIdx < 0) { 1.5424 + // First time through loop. 1.5425 + *lbStartIdx = fp->fInputIdx - minML; 1.5426 + } else { 1.5427 + // 2nd through nth time through the loop. 1.5428 + // Back up start position for match by one. 1.5429 + if (*lbStartIdx == 0) { 1.5430 + (*lbStartIdx)--; // Because U16_BACK is unsafe starting at 0. 1.5431 + } else { 1.5432 + U16_BACK_1(inputBuf, 0, *lbStartIdx); 1.5433 + } 1.5434 + } 1.5435 + 1.5436 + if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) { 1.5437 + // We have tried all potential match starting points without 1.5438 + // getting a match, which means that the negative lookbehind as 1.5439 + // a whole has succeeded. Jump forward to the continue location 1.5440 + int64_t restoreInputLen = fData[opValue+3]; 1.5441 + U_ASSERT(restoreInputLen >= fActiveLimit); 1.5442 + U_ASSERT(restoreInputLen <= fInputLength); 1.5443 + fActiveLimit = restoreInputLen; 1.5444 + fp->fPatIdx = continueLoc; 1.5445 + break; 1.5446 + } 1.5447 + 1.5448 + // Save state to this URX_LB_CONT op, so failure to match will repeat the loop. 1.5449 + // (successful match will cause a FAIL out of the loop altogether.) 1.5450 + fp = StateSave(fp, fp->fPatIdx-4, status); 1.5451 + fp->fInputIdx = *lbStartIdx; 1.5452 + } 1.5453 + break; 1.5454 + 1.5455 + case URX_LBN_END: 1.5456 + // End of a negative look-behind block, after a successful match. 1.5457 + { 1.5458 + U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 1.5459 + if (fp->fInputIdx != fActiveLimit) { 1.5460 + // The look-behind expression matched, but the match did not 1.5461 + // extend all the way to the point that we are looking behind from. 1.5462 + // FAIL out of here, which will take us back to the LB_CONT, which 1.5463 + // will retry the match starting at another position or succeed 1.5464 + // the look-behind altogether, whichever is appropriate. 
1.5465 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.5466 + break; 1.5467 + } 1.5468 + 1.5469 + // Look-behind expression matched, which means look-behind test as 1.5470 + // a whole Fails 1.5471 + 1.5472 + // Restore the orignal input string length, which had been truncated 1.5473 + // inorder to pin the end of the lookbehind match 1.5474 + // to the position being looked-behind. 1.5475 + int64_t originalInputLen = fData[opValue+3]; 1.5476 + U_ASSERT(originalInputLen >= fActiveLimit); 1.5477 + U_ASSERT(originalInputLen <= fInputLength); 1.5478 + fActiveLimit = originalInputLen; 1.5479 + 1.5480 + // Restore original stack position, discarding any state saved 1.5481 + // by the successful pattern match. 1.5482 + U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 1.5483 + int32_t newStackSize = (int32_t)fData[opValue]; 1.5484 + U_ASSERT(fStack->size() > newStackSize); 1.5485 + fStack->setSize(newStackSize); 1.5486 + 1.5487 + // FAIL, which will take control back to someplace 1.5488 + // prior to entering the look-behind test. 1.5489 + fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1.5490 + } 1.5491 + break; 1.5492 + 1.5493 + 1.5494 + case URX_LOOP_SR_I: 1.5495 + // Loop Initialization for the optimized implementation of 1.5496 + // [some character set]* 1.5497 + // This op scans through all matching input. 1.5498 + // The following LOOP_C op emulates stack unwinding if the following pattern fails. 1.5499 + { 1.5500 + U_ASSERT(opValue > 0 && opValue < sets->size()); 1.5501 + Regex8BitSet *s8 = &fPattern->fSets8[opValue]; 1.5502 + UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); 1.5503 + 1.5504 + // Loop through input, until either the input is exhausted or 1.5505 + // we reach a character that is not a member of the set. 
1.5506 + int32_t ix = (int32_t)fp->fInputIdx; 1.5507 + for (;;) { 1.5508 + if (ix >= fActiveLimit) { 1.5509 + fHitEnd = TRUE; 1.5510 + break; 1.5511 + } 1.5512 + UChar32 c; 1.5513 + U16_NEXT(inputBuf, ix, fActiveLimit, c); 1.5514 + if (c<256) { 1.5515 + if (s8->contains(c) == FALSE) { 1.5516 + U16_BACK_1(inputBuf, 0, ix); 1.5517 + break; 1.5518 + } 1.5519 + } else { 1.5520 + if (s->contains(c) == FALSE) { 1.5521 + U16_BACK_1(inputBuf, 0, ix); 1.5522 + break; 1.5523 + } 1.5524 + } 1.5525 + } 1.5526 + 1.5527 + // If there were no matching characters, skip over the loop altogether. 1.5528 + // The loop doesn't run at all, a * op always succeeds. 1.5529 + if (ix == fp->fInputIdx) { 1.5530 + fp->fPatIdx++; // skip the URX_LOOP_C op. 1.5531 + break; 1.5532 + } 1.5533 + 1.5534 + // Peek ahead in the compiled pattern, to the URX_LOOP_C that 1.5535 + // must follow. It's operand is the stack location 1.5536 + // that holds the starting input index for the match of this [set]* 1.5537 + int32_t loopcOp = (int32_t)pat[fp->fPatIdx]; 1.5538 + U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); 1.5539 + int32_t stackLoc = URX_VAL(loopcOp); 1.5540 + U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); 1.5541 + fp->fExtra[stackLoc] = fp->fInputIdx; 1.5542 + fp->fInputIdx = ix; 1.5543 + 1.5544 + // Save State to the URX_LOOP_C op that follows this one, 1.5545 + // so that match failures in the following code will return to there. 1.5546 + // Then bump the pattern idx so the LOOP_C is skipped on the way out of here. 1.5547 + fp = StateSave(fp, fp->fPatIdx, status); 1.5548 + fp->fPatIdx++; 1.5549 + } 1.5550 + break; 1.5551 + 1.5552 + 1.5553 + case URX_LOOP_DOT_I: 1.5554 + // Loop Initialization for the optimized implementation of .* 1.5555 + // This op scans through all remaining input. 1.5556 + // The following LOOP_C op emulates stack unwinding if the following pattern fails. 
1.5557 + { 1.5558 + // Loop through input until the input is exhausted (we reach an end-of-line) 1.5559 + // In DOTALL mode, we can just go straight to the end of the input. 1.5560 + int32_t ix; 1.5561 + if ((opValue & 1) == 1) { 1.5562 + // Dot-matches-All mode. Jump straight to the end of the string. 1.5563 + ix = (int32_t)fActiveLimit; 1.5564 + fHitEnd = TRUE; 1.5565 + } else { 1.5566 + // NOT DOT ALL mode. Line endings do not match '.' 1.5567 + // Scan forward until a line ending or end of input. 1.5568 + ix = (int32_t)fp->fInputIdx; 1.5569 + for (;;) { 1.5570 + if (ix >= fActiveLimit) { 1.5571 + fHitEnd = TRUE; 1.5572 + break; 1.5573 + } 1.5574 + UChar32 c; 1.5575 + U16_NEXT(inputBuf, ix, fActiveLimit, c); // c = inputBuf[ix++] 1.5576 + if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s 1.5577 + if ((c == 0x0a) || // 0x0a is newline in both modes. 1.5578 + (((opValue & 2) == 0) && // IF not UNIX_LINES mode 1.5579 + ((c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029))) { 1.5580 + // char is a line ending. Put the input pos back to the 1.5581 + // line ending char, and exit the scanning loop. 1.5582 + U16_BACK_1(inputBuf, 0, ix); 1.5583 + break; 1.5584 + } 1.5585 + } 1.5586 + } 1.5587 + } 1.5588 + 1.5589 + // If there were no matching characters, skip over the loop altogether. 1.5590 + // The loop doesn't run at all, a * op always succeeds. 1.5591 + if (ix == fp->fInputIdx) { 1.5592 + fp->fPatIdx++; // skip the URX_LOOP_C op. 1.5593 + break; 1.5594 + } 1.5595 + 1.5596 + // Peek ahead in the compiled pattern, to the URX_LOOP_C that 1.5597 + // must follow. 
It's operand is the stack location 1.5598 + // that holds the starting input index for the match of this .* 1.5599 + int32_t loopcOp = (int32_t)pat[fp->fPatIdx]; 1.5600 + U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); 1.5601 + int32_t stackLoc = URX_VAL(loopcOp); 1.5602 + U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); 1.5603 + fp->fExtra[stackLoc] = fp->fInputIdx; 1.5604 + fp->fInputIdx = ix; 1.5605 + 1.5606 + // Save State to the URX_LOOP_C op that follows this one, 1.5607 + // so that match failures in the following code will return to there. 1.5608 + // Then bump the pattern idx so the LOOP_C is skipped on the way out of here. 1.5609 + fp = StateSave(fp, fp->fPatIdx, status); 1.5610 + fp->fPatIdx++; 1.5611 + } 1.5612 + break; 1.5613 + 1.5614 + 1.5615 + case URX_LOOP_C: 1.5616 + { 1.5617 + U_ASSERT(opValue>=0 && opValue<fFrameSize); 1.5618 + backSearchIndex = (int32_t)fp->fExtra[opValue]; 1.5619 + U_ASSERT(backSearchIndex <= fp->fInputIdx); 1.5620 + if (backSearchIndex == fp->fInputIdx) { 1.5621 + // We've backed up the input idx to the point that the loop started. 1.5622 + // The loop is done. Leave here without saving state. 1.5623 + // Subsequent failures won't come back here. 1.5624 + break; 1.5625 + } 1.5626 + // Set up for the next iteration of the loop, with input index 1.5627 + // backed up by one from the last time through, 1.5628 + // and a state save to this instruction in case the following code fails again. 1.5629 + // (We're going backwards because this loop emulates stack unwinding, not 1.5630 + // the initial scan forward.) 1.5631 + U_ASSERT(fp->fInputIdx > 0); 1.5632 + UChar32 prevC; 1.5633 + U16_PREV(inputBuf, 0, fp->fInputIdx, prevC); // !!!: should this 0 be one of f*Limit? 
1.5634 + 1.5635 + if (prevC == 0x0a && 1.5636 + fp->fInputIdx > backSearchIndex && 1.5637 + inputBuf[fp->fInputIdx-1] == 0x0d) { 1.5638 + int32_t prevOp = (int32_t)pat[fp->fPatIdx-2]; 1.5639 + if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) { 1.5640 + // .*, stepping back over CRLF pair. 1.5641 + U16_BACK_1(inputBuf, 0, fp->fInputIdx); 1.5642 + } 1.5643 + } 1.5644 + 1.5645 + 1.5646 + fp = StateSave(fp, fp->fPatIdx-1, status); 1.5647 + } 1.5648 + break; 1.5649 + 1.5650 + 1.5651 + 1.5652 + default: 1.5653 + // Trouble. The compiled pattern contains an entry with an 1.5654 + // unrecognized type tag. 1.5655 + U_ASSERT(FALSE); 1.5656 + } 1.5657 + 1.5658 + if (U_FAILURE(status)) { 1.5659 + isMatch = FALSE; 1.5660 + break; 1.5661 + } 1.5662 + } 1.5663 + 1.5664 +breakFromLoop: 1.5665 + fMatch = isMatch; 1.5666 + if (isMatch) { 1.5667 + fLastMatchEnd = fMatchEnd; 1.5668 + fMatchStart = startIdx; 1.5669 + fMatchEnd = fp->fInputIdx; 1.5670 + if (fTraceDebug) { 1.5671 + REGEX_RUN_DEBUG_PRINTF(("Match. start=%ld end=%ld\n\n", fMatchStart, fMatchEnd)); 1.5672 + } 1.5673 + } 1.5674 + else 1.5675 + { 1.5676 + if (fTraceDebug) { 1.5677 + REGEX_RUN_DEBUG_PRINTF(("No match\n\n")); 1.5678 + } 1.5679 + } 1.5680 + 1.5681 + fFrame = fp; // The active stack frame when the engine stopped. 1.5682 + // Contains the capture group results that we need to 1.5683 + // access later. 1.5684 + 1.5685 + return; 1.5686 +} 1.5687 + 1.5688 + 1.5689 +UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher) 1.5690 + 1.5691 +U_NAMESPACE_END 1.5692 + 1.5693 +#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS