intl/icu/source/i18n/rematch.cpp

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

michael@0 1 /*
michael@0 2 **************************************************************************
michael@0 3 * Copyright (C) 2002-2013 International Business Machines Corporation *
michael@0 4 * and others. All rights reserved. *
michael@0 5 **************************************************************************
michael@0 6 */
michael@0 7 //
michael@0 8 // file: rematch.cpp
michael@0 9 //
michael@0 10 // Contains the implementation of class RegexMatcher,
michael@0 11 // which is one of the main API classes for the ICU regular expression package.
michael@0 12 //
michael@0 13
michael@0 14 #include "unicode/utypes.h"
michael@0 15 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
michael@0 16
michael@0 17 #include "unicode/regex.h"
michael@0 18 #include "unicode/uniset.h"
michael@0 19 #include "unicode/uchar.h"
michael@0 20 #include "unicode/ustring.h"
michael@0 21 #include "unicode/rbbi.h"
michael@0 22 #include "unicode/utf.h"
michael@0 23 #include "unicode/utf16.h"
michael@0 24 #include "uassert.h"
michael@0 25 #include "cmemory.h"
michael@0 26 #include "uvector.h"
michael@0 27 #include "uvectr32.h"
michael@0 28 #include "uvectr64.h"
michael@0 29 #include "regeximp.h"
michael@0 30 #include "regexst.h"
michael@0 31 #include "regextxt.h"
michael@0 32 #include "ucase.h"
michael@0 33
michael@0 34 // #include <malloc.h> // Needed for heapcheck testing
michael@0 35
michael@0 36
michael@0 37 // Find progress callback
michael@0 38 // ----------------------
michael@0 39 // Macro to inline test & call to ReportFindProgress(). Eliminates unnecessary function call.
michael@0 40 //
// Expands to an expression that is true when a find-progress callback is
// installed (fFindProgressCallbackFn is not NULL) and ReportFindProgress(pos, status)
// returns FALSE.  ReportFindProgress() is not called when no callback is installed.
// The whole expansion and both arguments are parenthesized so that the macro can be
// negated or combined with other operators without changing its meaning.
#define REGEXFINDPROGRESS_INTERRUPT(pos, status)     \
    ((fFindProgressCallbackFn != NULL) && (ReportFindProgress((pos), (status)) == FALSE))
michael@0 43
michael@0 44
michael@0 45 // Smart Backtracking
michael@0 46 // ------------------
michael@0 47 // When a failure would go back to a LOOP_C instruction,
michael@0 48 // strings, characters, and setrefs scan backwards for a valid start
michael@0 49 // character themselves, pop the stack, and save state, emulating the
michael@0 50 // LOOP_C's effect but assured that the next character of input is a
michael@0 51 // possible matching character.
michael@0 52 //
michael@0 53 // Good idea in theory; unfortunately it only helps out a few specific
michael@0 54 // cases and slows the engine down a little in the rest.
michael@0 55
michael@0 56 U_NAMESPACE_BEGIN
michael@0 57
michael@0 58 // Default limit for the size of the back track stack, to avoid system
michael@0 59 // failures caused by heap exhaustion. Units are in 32 bit words, not bytes.
michael@0 60 // This value puts ICU's limits higher than most other regexp implementations,
michael@0 61 // which use recursion rather than the heap, and take more storage per
michael@0 62 // backtrack point.
michael@0 63 //
michael@0 64 static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000;
michael@0 65
michael@0 66 // Time limit counter constant.
michael@0 67 // Time limits for expression evaluation are in terms of quanta of work by
michael@0 68 // the engine, each of which is 10,000 state saves.
michael@0 69 // This constant determines the number of state saves per tick.
michael@0 70 static const int32_t TIMER_INITIAL_VALUE = 10000;
michael@0 71
michael@0 72 //-----------------------------------------------------------------------------
michael@0 73 //
michael@0 74 // Constructor and Destructor
michael@0 75 //
michael@0 76 //-----------------------------------------------------------------------------
michael@0 77 RegexMatcher::RegexMatcher(const RegexPattern *pat) {
michael@0 78 fDeferredStatus = U_ZERO_ERROR;
michael@0 79 init(fDeferredStatus);
michael@0 80 if (U_FAILURE(fDeferredStatus)) {
michael@0 81 return;
michael@0 82 }
michael@0 83 if (pat==NULL) {
michael@0 84 fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 85 return;
michael@0 86 }
michael@0 87 fPattern = pat;
michael@0 88 init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus);
michael@0 89 }
michael@0 90
michael@0 91
michael@0 92
michael@0 93 RegexMatcher::RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
michael@0 94 uint32_t flags, UErrorCode &status) {
michael@0 95 init(status);
michael@0 96 if (U_FAILURE(status)) {
michael@0 97 return;
michael@0 98 }
michael@0 99 UParseError pe;
michael@0 100 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
michael@0 101 fPattern = fPatternOwned;
michael@0 102
michael@0 103 UText inputText = UTEXT_INITIALIZER;
michael@0 104 utext_openConstUnicodeString(&inputText, &input, &status);
michael@0 105 init2(&inputText, status);
michael@0 106 utext_close(&inputText);
michael@0 107
michael@0 108 fInputUniStrMaybeMutable = TRUE;
michael@0 109 }
michael@0 110
michael@0 111
michael@0 112 RegexMatcher::RegexMatcher(UText *regexp, UText *input,
michael@0 113 uint32_t flags, UErrorCode &status) {
michael@0 114 init(status);
michael@0 115 if (U_FAILURE(status)) {
michael@0 116 return;
michael@0 117 }
michael@0 118 UParseError pe;
michael@0 119 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
michael@0 120 if (U_FAILURE(status)) {
michael@0 121 return;
michael@0 122 }
michael@0 123
michael@0 124 fPattern = fPatternOwned;
michael@0 125 init2(input, status);
michael@0 126 }
michael@0 127
michael@0 128
michael@0 129 RegexMatcher::RegexMatcher(const UnicodeString &regexp,
michael@0 130 uint32_t flags, UErrorCode &status) {
michael@0 131 init(status);
michael@0 132 if (U_FAILURE(status)) {
michael@0 133 return;
michael@0 134 }
michael@0 135 UParseError pe;
michael@0 136 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
michael@0 137 if (U_FAILURE(status)) {
michael@0 138 return;
michael@0 139 }
michael@0 140 fPattern = fPatternOwned;
michael@0 141 init2(RegexStaticSets::gStaticSets->fEmptyText, status);
michael@0 142 }
michael@0 143
michael@0 144 RegexMatcher::RegexMatcher(UText *regexp,
michael@0 145 uint32_t flags, UErrorCode &status) {
michael@0 146 init(status);
michael@0 147 if (U_FAILURE(status)) {
michael@0 148 return;
michael@0 149 }
michael@0 150 UParseError pe;
michael@0 151 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
michael@0 152 if (U_FAILURE(status)) {
michael@0 153 return;
michael@0 154 }
michael@0 155
michael@0 156 fPattern = fPatternOwned;
michael@0 157 init2(RegexStaticSets::gStaticSets->fEmptyText, status);
michael@0 158 }
michael@0 159
michael@0 160
michael@0 161
michael@0 162
michael@0 163 RegexMatcher::~RegexMatcher() {
michael@0 164 delete fStack;
michael@0 165 if (fData != fSmallData) {
michael@0 166 uprv_free(fData);
michael@0 167 fData = NULL;
michael@0 168 }
michael@0 169 if (fPatternOwned) {
michael@0 170 delete fPatternOwned;
michael@0 171 fPatternOwned = NULL;
michael@0 172 fPattern = NULL;
michael@0 173 }
michael@0 174
michael@0 175 if (fInput) {
michael@0 176 delete fInput;
michael@0 177 }
michael@0 178 if (fInputText) {
michael@0 179 utext_close(fInputText);
michael@0 180 }
michael@0 181 if (fAltInputText) {
michael@0 182 utext_close(fAltInputText);
michael@0 183 }
michael@0 184
michael@0 185 #if UCONFIG_NO_BREAK_ITERATION==0
michael@0 186 delete fWordBreakItr;
michael@0 187 #endif
michael@0 188 }
michael@0 189
michael@0 190 //
michael@0 191 // init() common initialization for use by all constructors.
michael@0 192 // Initialize all fields, get the object into a consistent state.
michael@0 193 // This must be done even when the initial status shows an error,
michael@0 194 // so that the object is initialized sufficiently well for the destructor
michael@0 195 // to run safely.
michael@0 196 //
michael@0 197 void RegexMatcher::init(UErrorCode &status) {
michael@0 198 fPattern = NULL;
michael@0 199 fPatternOwned = NULL;
michael@0 200 fFrameSize = 0;
michael@0 201 fRegionStart = 0;
michael@0 202 fRegionLimit = 0;
michael@0 203 fAnchorStart = 0;
michael@0 204 fAnchorLimit = 0;
michael@0 205 fLookStart = 0;
michael@0 206 fLookLimit = 0;
michael@0 207 fActiveStart = 0;
michael@0 208 fActiveLimit = 0;
michael@0 209 fTransparentBounds = FALSE;
michael@0 210 fAnchoringBounds = TRUE;
michael@0 211 fMatch = FALSE;
michael@0 212 fMatchStart = 0;
michael@0 213 fMatchEnd = 0;
michael@0 214 fLastMatchEnd = -1;
michael@0 215 fAppendPosition = 0;
michael@0 216 fHitEnd = FALSE;
michael@0 217 fRequireEnd = FALSE;
michael@0 218 fStack = NULL;
michael@0 219 fFrame = NULL;
michael@0 220 fTimeLimit = 0;
michael@0 221 fTime = 0;
michael@0 222 fTickCounter = 0;
michael@0 223 fStackLimit = DEFAULT_BACKTRACK_STACK_CAPACITY;
michael@0 224 fCallbackFn = NULL;
michael@0 225 fCallbackContext = NULL;
michael@0 226 fFindProgressCallbackFn = NULL;
michael@0 227 fFindProgressCallbackContext = NULL;
michael@0 228 fTraceDebug = FALSE;
michael@0 229 fDeferredStatus = status;
michael@0 230 fData = fSmallData;
michael@0 231 fWordBreakItr = NULL;
michael@0 232
michael@0 233 fStack = NULL;
michael@0 234 fInputText = NULL;
michael@0 235 fAltInputText = NULL;
michael@0 236 fInput = NULL;
michael@0 237 fInputLength = 0;
michael@0 238 fInputUniStrMaybeMutable = FALSE;
michael@0 239
michael@0 240 if (U_FAILURE(status)) {
michael@0 241 fDeferredStatus = status;
michael@0 242 }
michael@0 243 }
michael@0 244
michael@0 245 //
michael@0 246 // init2() Common initialization for use by RegexMatcher constructors, part 2.
michael@0 247 // This handles the common setup to be done after the Pattern is available.
michael@0 248 //
michael@0 249 void RegexMatcher::init2(UText *input, UErrorCode &status) {
michael@0 250 if (U_FAILURE(status)) {
michael@0 251 fDeferredStatus = status;
michael@0 252 return;
michael@0 253 }
michael@0 254
michael@0 255 if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(fSmallData[0]))) {
michael@0 256 fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t));
michael@0 257 if (fData == NULL) {
michael@0 258 status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
michael@0 259 return;
michael@0 260 }
michael@0 261 }
michael@0 262
michael@0 263 fStack = new UVector64(status);
michael@0 264 if (fStack == NULL) {
michael@0 265 status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
michael@0 266 return;
michael@0 267 }
michael@0 268
michael@0 269 reset(input);
michael@0 270 setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status);
michael@0 271 if (U_FAILURE(status)) {
michael@0 272 fDeferredStatus = status;
michael@0 273 return;
michael@0 274 }
michael@0 275 }
michael@0 276
michael@0 277
michael@0 278 static const UChar BACKSLASH = 0x5c;
michael@0 279 static const UChar DOLLARSIGN = 0x24;
michael@0 280 //--------------------------------------------------------------------------------
michael@0 281 //
michael@0 282 // appendReplacement
michael@0 283 //
michael@0 284 //--------------------------------------------------------------------------------
michael@0 285 RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest,
michael@0 286 const UnicodeString &replacement,
michael@0 287 UErrorCode &status) {
michael@0 288 UText replacementText = UTEXT_INITIALIZER;
michael@0 289
michael@0 290 utext_openConstUnicodeString(&replacementText, &replacement, &status);
michael@0 291 if (U_SUCCESS(status)) {
michael@0 292 UText resultText = UTEXT_INITIALIZER;
michael@0 293 utext_openUnicodeString(&resultText, &dest, &status);
michael@0 294
michael@0 295 if (U_SUCCESS(status)) {
michael@0 296 appendReplacement(&resultText, &replacementText, status);
michael@0 297 utext_close(&resultText);
michael@0 298 }
michael@0 299 utext_close(&replacementText);
michael@0 300 }
michael@0 301
michael@0 302 return *this;
michael@0 303 }
michael@0 304
michael@0 305 //
michael@0 306 // appendReplacement, UText mode
michael@0 307 //
michael@0 308 RegexMatcher &RegexMatcher::appendReplacement(UText *dest,
michael@0 309 UText *replacement,
michael@0 310 UErrorCode &status) {
michael@0 311 if (U_FAILURE(status)) {
michael@0 312 return *this;
michael@0 313 }
michael@0 314 if (U_FAILURE(fDeferredStatus)) {
michael@0 315 status = fDeferredStatus;
michael@0 316 return *this;
michael@0 317 }
michael@0 318 if (fMatch == FALSE) {
michael@0 319 status = U_REGEX_INVALID_STATE;
michael@0 320 return *this;
michael@0 321 }
michael@0 322
michael@0 323 // Copy input string from the end of previous match to start of current match
michael@0 324 int64_t destLen = utext_nativeLength(dest);
michael@0 325 if (fMatchStart > fAppendPosition) {
michael@0 326 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
michael@0 327 destLen += utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
michael@0 328 (int32_t)(fMatchStart-fAppendPosition), &status);
michael@0 329 } else {
michael@0 330 int32_t len16;
michael@0 331 if (UTEXT_USES_U16(fInputText)) {
michael@0 332 len16 = (int32_t)(fMatchStart-fAppendPosition);
michael@0 333 } else {
michael@0 334 UErrorCode lengthStatus = U_ZERO_ERROR;
michael@0 335 len16 = utext_extract(fInputText, fAppendPosition, fMatchStart, NULL, 0, &lengthStatus);
michael@0 336 }
michael@0 337 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
michael@0 338 if (inputChars == NULL) {
michael@0 339 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 340 return *this;
michael@0 341 }
michael@0 342 utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars, len16+1, &status);
michael@0 343 destLen += utext_replace(dest, destLen, destLen, inputChars, len16, &status);
michael@0 344 uprv_free(inputChars);
michael@0 345 }
michael@0 346 }
michael@0 347 fAppendPosition = fMatchEnd;
michael@0 348
michael@0 349
michael@0 350 // scan the replacement text, looking for substitutions ($n) and \escapes.
michael@0 351 // TODO: optimize this loop by efficiently scanning for '$' or '\',
michael@0 352 // move entire ranges not containing substitutions.
michael@0 353 UTEXT_SETNATIVEINDEX(replacement, 0);
michael@0 354 UChar32 c = UTEXT_NEXT32(replacement);
michael@0 355 while (c != U_SENTINEL) {
michael@0 356 if (c == BACKSLASH) {
michael@0 357 // Backslash Escape. Copy the following char out without further checks.
michael@0 358 // Note: Surrogate pairs don't need any special handling
michael@0 359 // The second half wont be a '$' or a '\', and
michael@0 360 // will move to the dest normally on the next
michael@0 361 // loop iteration.
michael@0 362 c = UTEXT_CURRENT32(replacement);
michael@0 363 if (c == U_SENTINEL) {
michael@0 364 break;
michael@0 365 }
michael@0 366
michael@0 367 if (c==0x55/*U*/ || c==0x75/*u*/) {
michael@0 368 // We have a \udddd or \Udddddddd escape sequence.
michael@0 369 int32_t offset = 0;
michael@0 370 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement);
michael@0 371 UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
michael@0 372 if (escapedChar != (UChar32)0xFFFFFFFF) {
michael@0 373 if (U_IS_BMP(escapedChar)) {
michael@0 374 UChar c16 = (UChar)escapedChar;
michael@0 375 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
michael@0 376 } else {
michael@0 377 UChar surrogate[2];
michael@0 378 surrogate[0] = U16_LEAD(escapedChar);
michael@0 379 surrogate[1] = U16_TRAIL(escapedChar);
michael@0 380 if (U_SUCCESS(status)) {
michael@0 381 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
michael@0 382 }
michael@0 383 }
michael@0 384 // TODO: Report errors for mal-formed \u escapes?
michael@0 385 // As this is, the original sequence is output, which may be OK.
michael@0 386 if (context.lastOffset == offset) {
michael@0 387 (void)UTEXT_PREVIOUS32(replacement);
michael@0 388 } else if (context.lastOffset != offset-1) {
michael@0 389 utext_moveIndex32(replacement, offset - context.lastOffset - 1);
michael@0 390 }
michael@0 391 }
michael@0 392 } else {
michael@0 393 (void)UTEXT_NEXT32(replacement);
michael@0 394 // Plain backslash escape. Just put out the escaped character.
michael@0 395 if (U_IS_BMP(c)) {
michael@0 396 UChar c16 = (UChar)c;
michael@0 397 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
michael@0 398 } else {
michael@0 399 UChar surrogate[2];
michael@0 400 surrogate[0] = U16_LEAD(c);
michael@0 401 surrogate[1] = U16_TRAIL(c);
michael@0 402 if (U_SUCCESS(status)) {
michael@0 403 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
michael@0 404 }
michael@0 405 }
michael@0 406 }
michael@0 407 } else if (c != DOLLARSIGN) {
michael@0 408 // Normal char, not a $. Copy it out without further checks.
michael@0 409 if (U_IS_BMP(c)) {
michael@0 410 UChar c16 = (UChar)c;
michael@0 411 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
michael@0 412 } else {
michael@0 413 UChar surrogate[2];
michael@0 414 surrogate[0] = U16_LEAD(c);
michael@0 415 surrogate[1] = U16_TRAIL(c);
michael@0 416 if (U_SUCCESS(status)) {
michael@0 417 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
michael@0 418 }
michael@0 419 }
michael@0 420 } else {
michael@0 421 // We've got a $. Pick up a capture group number if one follows.
michael@0 422 // Consume at most the number of digits necessary for the largest capture
michael@0 423 // number that is valid for this pattern.
michael@0 424
michael@0 425 int32_t numDigits = 0;
michael@0 426 int32_t groupNum = 0;
michael@0 427 UChar32 digitC;
michael@0 428 for (;;) {
michael@0 429 digitC = UTEXT_CURRENT32(replacement);
michael@0 430 if (digitC == U_SENTINEL) {
michael@0 431 break;
michael@0 432 }
michael@0 433 if (u_isdigit(digitC) == FALSE) {
michael@0 434 break;
michael@0 435 }
michael@0 436 (void)UTEXT_NEXT32(replacement);
michael@0 437 groupNum=groupNum*10 + u_charDigitValue(digitC);
michael@0 438 numDigits++;
michael@0 439 if (numDigits >= fPattern->fMaxCaptureDigits) {
michael@0 440 break;
michael@0 441 }
michael@0 442 }
michael@0 443
michael@0 444
michael@0 445 if (numDigits == 0) {
michael@0 446 // The $ didn't introduce a group number at all.
michael@0 447 // Treat it as just part of the substitution text.
michael@0 448 UChar c16 = DOLLARSIGN;
michael@0 449 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
michael@0 450 } else {
michael@0 451 // Finally, append the capture group data to the destination.
michael@0 452 destLen += appendGroup(groupNum, dest, status);
michael@0 453 if (U_FAILURE(status)) {
michael@0 454 // Can fail if group number is out of range.
michael@0 455 break;
michael@0 456 }
michael@0 457 }
michael@0 458 }
michael@0 459
michael@0 460 if (U_FAILURE(status)) {
michael@0 461 break;
michael@0 462 } else {
michael@0 463 c = UTEXT_NEXT32(replacement);
michael@0 464 }
michael@0 465 }
michael@0 466
michael@0 467 return *this;
michael@0 468 }
michael@0 469
michael@0 470
michael@0 471
michael@0 472 //--------------------------------------------------------------------------------
michael@0 473 //
michael@0 474 // appendTail Intended to be used in conjunction with appendReplacement()
michael@0 475 // To the destination string, append everything following
michael@0 476 // the last match position from the input string.
michael@0 477 //
michael@0 478 // Note: Match ranges do not affect appendTail or appendReplacement
michael@0 479 //
michael@0 480 //--------------------------------------------------------------------------------
michael@0 481 UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) {
michael@0 482 UErrorCode status = U_ZERO_ERROR;
michael@0 483 UText resultText = UTEXT_INITIALIZER;
michael@0 484 utext_openUnicodeString(&resultText, &dest, &status);
michael@0 485
michael@0 486 if (U_SUCCESS(status)) {
michael@0 487 appendTail(&resultText, status);
michael@0 488 utext_close(&resultText);
michael@0 489 }
michael@0 490
michael@0 491 return dest;
michael@0 492 }
michael@0 493
michael@0 494 //
michael@0 495 // appendTail, UText mode
michael@0 496 //
michael@0 497 UText *RegexMatcher::appendTail(UText *dest, UErrorCode &status) {
michael@0 498 UBool bailOut = FALSE;
michael@0 499 if (U_FAILURE(status)) {
michael@0 500 bailOut = TRUE;
michael@0 501 }
michael@0 502 if (U_FAILURE(fDeferredStatus)) {
michael@0 503 status = fDeferredStatus;
michael@0 504 bailOut = TRUE;
michael@0 505 }
michael@0 506
michael@0 507 if (bailOut) {
michael@0 508 // dest must not be NULL
michael@0 509 if (dest) {
michael@0 510 utext_replace(dest, utext_nativeLength(dest), utext_nativeLength(dest), NULL, 0, &status);
michael@0 511 return dest;
michael@0 512 }
michael@0 513 }
michael@0 514
michael@0 515 if (fInputLength > fAppendPosition) {
michael@0 516 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
michael@0 517 int64_t destLen = utext_nativeLength(dest);
michael@0 518 utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
michael@0 519 (int32_t)(fInputLength-fAppendPosition), &status);
michael@0 520 } else {
michael@0 521 int32_t len16;
michael@0 522 if (UTEXT_USES_U16(fInputText)) {
michael@0 523 len16 = (int32_t)(fInputLength-fAppendPosition);
michael@0 524 } else {
michael@0 525 len16 = utext_extract(fInputText, fAppendPosition, fInputLength, NULL, 0, &status);
michael@0 526 status = U_ZERO_ERROR; // buffer overflow
michael@0 527 }
michael@0 528
michael@0 529 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16));
michael@0 530 if (inputChars == NULL) {
michael@0 531 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
michael@0 532 } else {
michael@0 533 utext_extract(fInputText, fAppendPosition, fInputLength, inputChars, len16, &status); // unterminated
michael@0 534 int64_t destLen = utext_nativeLength(dest);
michael@0 535 utext_replace(dest, destLen, destLen, inputChars, len16, &status);
michael@0 536 uprv_free(inputChars);
michael@0 537 }
michael@0 538 }
michael@0 539 }
michael@0 540 return dest;
michael@0 541 }
michael@0 542
michael@0 543
michael@0 544
michael@0 545 //--------------------------------------------------------------------------------
michael@0 546 //
michael@0 547 // end
michael@0 548 //
michael@0 549 //--------------------------------------------------------------------------------
michael@0 550 int32_t RegexMatcher::end(UErrorCode &err) const {
michael@0 551 return end(0, err);
michael@0 552 }
michael@0 553
michael@0 554 int64_t RegexMatcher::end64(UErrorCode &err) const {
michael@0 555 return end64(0, err);
michael@0 556 }
michael@0 557
michael@0 558 int64_t RegexMatcher::end64(int32_t group, UErrorCode &err) const {
michael@0 559 if (U_FAILURE(err)) {
michael@0 560 return -1;
michael@0 561 }
michael@0 562 if (fMatch == FALSE) {
michael@0 563 err = U_REGEX_INVALID_STATE;
michael@0 564 return -1;
michael@0 565 }
michael@0 566 if (group < 0 || group > fPattern->fGroupMap->size()) {
michael@0 567 err = U_INDEX_OUTOFBOUNDS_ERROR;
michael@0 568 return -1;
michael@0 569 }
michael@0 570 int64_t e = -1;
michael@0 571 if (group == 0) {
michael@0 572 e = fMatchEnd;
michael@0 573 } else {
michael@0 574 // Get the position within the stack frame of the variables for
michael@0 575 // this capture group.
michael@0 576 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
michael@0 577 U_ASSERT(groupOffset < fPattern->fFrameSize);
michael@0 578 U_ASSERT(groupOffset >= 0);
michael@0 579 e = fFrame->fExtra[groupOffset + 1];
michael@0 580 }
michael@0 581
michael@0 582 return e;
michael@0 583 }
michael@0 584
michael@0 585 int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const {
michael@0 586 return (int32_t)end64(group, err);
michael@0 587 }
michael@0 588
michael@0 589
michael@0 590 //--------------------------------------------------------------------------------
michael@0 591 //
michael@0 592 // find()
michael@0 593 //
michael@0 594 //--------------------------------------------------------------------------------
// find()   Search for the next match, beginning at the end of the previous match
//          (fMatchEnd), or at fActiveStart when fMatchEnd is zero.
//
//          Returns TRUE when MatchAt() reports a match (fMatch is set).
//          Returns FALSE when no match is found, when fDeferredStatus already holds
//          an error or MatchAt() stores one there, or when the
//          REGEXFINDPROGRESS_INTERRUPT check evaluates to true.
//
//          This function has no status parameter: MatchAt() is always given
//          fDeferredStatus, so an error raised during matching stays recorded in
//          the matcher.
//
//          The candidate start positions that are tried depend on the pattern's
//          fStartType; see the individual switch cases below.
michael@0 595 UBool RegexMatcher::find() {
michael@0 596 // Start at the position of the last match end. (Will be zero if the
michael@0 597 // matcher has been reset.)
michael@0 598 //
michael@0 599 if (U_FAILURE(fDeferredStatus)) {
michael@0 600 return FALSE;
michael@0 601 }
michael@0 602
// When the UTEXT_FULL_TEXT_IN_CHUNK test holds for the input, the search is
// delegated entirely to findUsingChunk().
michael@0 603 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
michael@0 604 return findUsingChunk();
michael@0 605 }
michael@0 606
michael@0 607 int64_t startPos = fMatchEnd;
michael@0 608 if (startPos==0) {
michael@0 609 startPos = fActiveStart;
michael@0 610 }
michael@0 611
michael@0 612 if (fMatch) {
michael@0 613 // Save the position of any previous successful match.
michael@0 614 fLastMatchEnd = fMatchEnd;
michael@0 615
michael@0 616 if (fMatchStart == fMatchEnd) {
michael@0 617 // Previous match had zero length. Move start position up one position
michael@0 618 // to avoid sending find() into a loop on zero-length matches.
michael@0 619 if (startPos >= fActiveLimit) {
michael@0 620 fMatch = FALSE;
michael@0 621 fHitEnd = TRUE;
michael@0 622 return FALSE;
michael@0 623 }
// Advance by one code point; the new start is read back as a native index.
michael@0 624 UTEXT_SETNATIVEINDEX(fInputText, startPos);
michael@0 625 (void)UTEXT_NEXT32(fInputText);
michael@0 626 startPos = UTEXT_GETNATIVEINDEX(fInputText);
michael@0 627 }
michael@0 628 } else {
michael@0 629 if (fLastMatchEnd >= 0) {
michael@0 630 // A previous find() failed to match. Don't try again.
michael@0 631 // (without this test, a pattern with a zero-length match
michael@0 632 // could match again at the end of an input string.)
michael@0 633 fHitEnd = TRUE;
michael@0 634 return FALSE;
michael@0 635 }
michael@0 636 }
michael@0 637
michael@0 638
michael@0 639 // Compute the position in the input string beyond which a match can not begin, because
michael@0 640 // the minimum length match would extend past the end of the input.
michael@0 641 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int.
michael@0 642 // Be aware of possible overflows if making changes here.
michael@0 643 int64_t testStartLimit;
michael@0 644 if (UTEXT_USES_U16(fInputText)) {
michael@0 645 testStartLimit = fActiveLimit - fPattern->fMinMatchLen;
michael@0 646 if (startPos > testStartLimit) {
michael@0 647 fMatch = FALSE;
michael@0 648 fHitEnd = TRUE;
michael@0 649 return FALSE;
michael@0 650 }
michael@0 651 } else {
michael@0 652 // For now, let the matcher discover that it can't match on its own
michael@0 653 // We don't know how long the match len is in native characters
michael@0 654 testStartLimit = fActiveLimit;
michael@0 655 }
michael@0 656
michael@0 657 UChar32 c;
michael@0 658 U_ASSERT(startPos >= 0);
michael@0 659
// Every case below either returns from inside its loop or returns directly;
// the U_ASSERT(FALSE) statements after the loops mark code that is not expected
// to be reached.
michael@0 660 switch (fPattern->fStartType) {
michael@0 661 case START_NO_INFO:
michael@0 662 // No optimization was found.
michael@0 663 // Try a match at each input position.
michael@0 664 for (;;) {
michael@0 665 MatchAt(startPos, FALSE, fDeferredStatus);
michael@0 666 if (U_FAILURE(fDeferredStatus)) {
michael@0 667 return FALSE;
michael@0 668 }
michael@0 669 if (fMatch) {
michael@0 670 return TRUE;
michael@0 671 }
michael@0 672 if (startPos >= testStartLimit) {
michael@0 673 fHitEnd = TRUE;
michael@0 674 return FALSE;
michael@0 675 }
michael@0 676 UTEXT_SETNATIVEINDEX(fInputText, startPos);
michael@0 677 (void)UTEXT_NEXT32(fInputText);
michael@0 678 startPos = UTEXT_GETNATIVEINDEX(fInputText);
michael@0 679 // Note that it's perfectly OK for a pattern to have a zero-length
michael@0 680 // match at the end of a string, so we must make sure that the loop
michael@0 681 // runs with startPos == testStartLimit the last time through.
michael@0 682 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
michael@0 683 return FALSE;
michael@0 684 }
michael@0 685 U_ASSERT(FALSE);
michael@0 686
michael@0 687 case START_START:
michael@0 688 // Matches are only possible at the start of the input string
michael@0 689 // (pattern begins with ^ or \A)
michael@0 690 if (startPos > fActiveStart) {
michael@0 691 fMatch = FALSE;
michael@0 692 return FALSE;
michael@0 693 }
michael@0 694 MatchAt(startPos, FALSE, fDeferredStatus);
michael@0 695 if (U_FAILURE(fDeferredStatus)) {
michael@0 696 return FALSE;
michael@0 697 }
michael@0 698 return fMatch;
michael@0 699
michael@0 700
michael@0 701 case START_SET:
michael@0 702 {
michael@0 703 // Match may start on any char from a pre-computed set.
michael@0 704 U_ASSERT(fPattern->fMinMatchLen > 0);
michael@0 705 int64_t pos;
michael@0 706 UTEXT_SETNATIVEINDEX(fInputText, startPos);
michael@0 707 for (;;) {
// c is the code point at startPos; pos is the index just past it.
michael@0 708 c = UTEXT_NEXT32(fInputText);
michael@0 709 pos = UTEXT_GETNATIVEINDEX(fInputText);
michael@0 710 // c will be -1 (U_SENTINEL) at end of text, in which case we
michael@0 711 // skip this next block (so we don't have a negative array index)
michael@0 712 // and handle end of text in the following block.
michael@0 713 if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c)) ||
michael@0 714 (c>=256 && fPattern->fInitialChars->contains(c)))) {
michael@0 715 MatchAt(startPos, FALSE, fDeferredStatus);
michael@0 716 if (U_FAILURE(fDeferredStatus)) {
michael@0 717 return FALSE;
michael@0 718 }
michael@0 719 if (fMatch) {
michael@0 720 return TRUE;
michael@0 721 }
// Put the input iterator back at pos after the failed match attempt.
michael@0 722 UTEXT_SETNATIVEINDEX(fInputText, pos);
michael@0 723 }
michael@0 724 if (startPos >= testStartLimit) {
michael@0 725 fMatch = FALSE;
michael@0 726 fHitEnd = TRUE;
michael@0 727 return FALSE;
michael@0 728 }
michael@0 729 startPos = pos;
michael@0 730 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
michael@0 731 return FALSE;
michael@0 732 }
michael@0 733 }
michael@0 734 U_ASSERT(FALSE);
michael@0 735
michael@0 736 case START_STRING:
michael@0 737 case START_CHAR:
michael@0 738 {
michael@0 739 // Match starts on exactly one char.
michael@0 740 U_ASSERT(fPattern->fMinMatchLen > 0);
michael@0 741 UChar32 theChar = fPattern->fInitialChar;
michael@0 742 int64_t pos;
michael@0 743 UTEXT_SETNATIVEINDEX(fInputText, startPos);
michael@0 744 for (;;) {
// Same scanning scheme as START_SET, but testing against the single
// character fInitialChar.
michael@0 745 c = UTEXT_NEXT32(fInputText);
michael@0 746 pos = UTEXT_GETNATIVEINDEX(fInputText);
michael@0 747 if (c == theChar) {
michael@0 748 MatchAt(startPos, FALSE, fDeferredStatus);
michael@0 749 if (U_FAILURE(fDeferredStatus)) {
michael@0 750 return FALSE;
michael@0 751 }
michael@0 752 if (fMatch) {
michael@0 753 return TRUE;
michael@0 754 }
michael@0 755 UTEXT_SETNATIVEINDEX(fInputText, pos);
michael@0 756 }
michael@0 757 if (startPos >= testStartLimit) {
michael@0 758 fMatch = FALSE;
michael@0 759 fHitEnd = TRUE;
michael@0 760 return FALSE;
michael@0 761 }
michael@0 762 startPos = pos;
michael@0 763 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
michael@0 764 return FALSE;
michael@0 765 }
michael@0 766 }
michael@0 767 U_ASSERT(FALSE);
michael@0 768
michael@0 769 case START_LINE:
michael@0 770 {
// A match is attempted at fAnchorStart and then at each position that follows
// a line terminator.  Note that this local c hides the c declared before the switch.
michael@0 771 UChar32 c;
michael@0 772 if (startPos == fAnchorStart) {
michael@0 773 MatchAt(startPos, FALSE, fDeferredStatus);
michael@0 774 if (U_FAILURE(fDeferredStatus)) {
michael@0 775 return FALSE;
michael@0 776 }
michael@0 777 if (fMatch) {
michael@0 778 return TRUE;
michael@0 779 }
michael@0 780 UTEXT_SETNATIVEINDEX(fInputText, startPos);
michael@0 781 c = UTEXT_NEXT32(fInputText);
michael@0 782 startPos = UTEXT_GETNATIVEINDEX(fInputText);
michael@0 783 } else {
// Not at the anchor start: load the character preceding startPos into c, then
// restore the iterator to startPos.
michael@0 784 UTEXT_SETNATIVEINDEX(fInputText, startPos);
michael@0 785 c = UTEXT_PREVIOUS32(fInputText);
michael@0 786 UTEXT_SETNATIVEINDEX(fInputText, startPos);
michael@0 787 }
michael@0 788
michael@0 789 if (fPattern->fFlags & UREGEX_UNIX_LINES) {
// With UREGEX_UNIX_LINES only 0x0a is treated as a line terminator.
michael@0 790 for (;;) {
michael@0 791 if (c == 0x0a) {
michael@0 792 MatchAt(startPos, FALSE, fDeferredStatus);
michael@0 793 if (U_FAILURE(fDeferredStatus)) {
michael@0 794 return FALSE;
michael@0 795 }
michael@0 796 if (fMatch) {
michael@0 797 return TRUE;
michael@0 798 }
michael@0 799 UTEXT_SETNATIVEINDEX(fInputText, startPos);
michael@0 800 }
michael@0 801 if (startPos >= testStartLimit) {
michael@0 802 fMatch = FALSE;
michael@0 803 fHitEnd = TRUE;
michael@0 804 return FALSE;
michael@0 805 }
michael@0 806 c = UTEXT_NEXT32(fInputText);
michael@0 807 startPos = UTEXT_GETNATIVEINDEX(fInputText);
michael@0 808 // Note that it's perfectly OK for a pattern to have a zero-length
michael@0 809 // match at the end of a string, so we must make sure that the loop
michael@0 810 // runs with startPos == testStartLimit the last time through.
michael@0 811 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
michael@0 812 return FALSE;
michael@0 813 }
michael@0 814 } else {
// Otherwise the terminators tested are 0x0a through 0x0d, 0x85, 0x2028 and 0x2029;
// a 0x0d immediately followed by 0x0a is stepped over as a unit.
michael@0 815 for (;;) {
michael@0 816 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
michael@0 817 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) {
michael@0 818 if (c == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
michael@0 819 (void)UTEXT_NEXT32(fInputText);
michael@0 820 startPos = UTEXT_GETNATIVEINDEX(fInputText);
michael@0 821 }
michael@0 822 MatchAt(startPos, FALSE, fDeferredStatus);
michael@0 823 if (U_FAILURE(fDeferredStatus)) {
michael@0 824 return FALSE;
michael@0 825 }
michael@0 826 if (fMatch) {
michael@0 827 return TRUE;
michael@0 828 }
michael@0 829 UTEXT_SETNATIVEINDEX(fInputText, startPos);
michael@0 830 }
michael@0 831 if (startPos >= testStartLimit) {
michael@0 832 fMatch = FALSE;
michael@0 833 fHitEnd = TRUE;
michael@0 834 return FALSE;
michael@0 835 }
michael@0 836 c = UTEXT_NEXT32(fInputText);
michael@0 837 startPos = UTEXT_GETNATIVEINDEX(fInputText);
michael@0 838 // Note that it's perfectly OK for a pattern to have a zero-length
michael@0 839 // match at the end of a string, so we must make sure that the loop
michael@0 840 // runs with startPos == testStartLimit the last time through.
michael@0 841 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
michael@0 842 return FALSE;
michael@0 843 }
michael@0 844 }
michael@0 845 }
michael@0 846
michael@0 847 default:
michael@0 848 U_ASSERT(FALSE);
michael@0 849 }
michael@0 850
michael@0 851 U_ASSERT(FALSE);
michael@0 852 return FALSE;
michael@0 853 }
michael@0 854
michael@0 855
michael@0 856
michael@0 857 UBool RegexMatcher::find(int64_t start, UErrorCode &status) {
michael@0 858 if (U_FAILURE(status)) {
michael@0 859 return FALSE;
michael@0 860 }
michael@0 861 if (U_FAILURE(fDeferredStatus)) {
michael@0 862 status = fDeferredStatus;
michael@0 863 return FALSE;
michael@0 864 }
michael@0 865 this->reset(); // Note: Reset() is specified by Java Matcher documentation.
michael@0 866 // This will reset the region to be the full input length.
michael@0 867 if (start < 0) {
michael@0 868 status = U_INDEX_OUTOFBOUNDS_ERROR;
michael@0 869 return FALSE;
michael@0 870 }
michael@0 871
michael@0 872 int64_t nativeStart = start;
michael@0 873 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
michael@0 874 status = U_INDEX_OUTOFBOUNDS_ERROR;
michael@0 875 return FALSE;
michael@0 876 }
michael@0 877 fMatchEnd = nativeStart;
michael@0 878 return find();
michael@0 879 }
michael@0 880
michael@0 881
michael@0 882 //--------------------------------------------------------------------------------
michael@0 883 //
michael@0 884 // findUsingChunk() -- like find(), but with the advance knowledge that the
michael@0 885 // entire string is available in the UText's chunk buffer.
michael@0 886 //
michael@0 887 //--------------------------------------------------------------------------------
michael@0 888 UBool RegexMatcher::findUsingChunk() {
michael@0 889 // Start at the position of the last match end. (Will be zero if the
michael@0 890 // matcher has been reset.
michael@0 891 //
michael@0 892
michael@0 893 int32_t startPos = (int32_t)fMatchEnd;
michael@0 894 if (startPos==0) {
michael@0 895 startPos = (int32_t)fActiveStart;
michael@0 896 }
michael@0 897
michael@0 898 const UChar *inputBuf = fInputText->chunkContents;
michael@0 899
michael@0 900 if (fMatch) {
michael@0 901 // Save the position of any previous successful match.
michael@0 902 fLastMatchEnd = fMatchEnd;
michael@0 903
michael@0 904 if (fMatchStart == fMatchEnd) {
michael@0 905 // Previous match had zero length. Move start position up one position
michael@0 906 // to avoid sending find() into a loop on zero-length matches.
michael@0 907 if (startPos >= fActiveLimit) {
michael@0 908 fMatch = FALSE;
michael@0 909 fHitEnd = TRUE;
michael@0 910 return FALSE;
michael@0 911 }
michael@0 912 U16_FWD_1(inputBuf, startPos, fInputLength);
michael@0 913 }
michael@0 914 } else {
michael@0 915 if (fLastMatchEnd >= 0) {
michael@0 916 // A previous find() failed to match. Don't try again.
michael@0 917 // (without this test, a pattern with a zero-length match
michael@0 918 // could match again at the end of an input string.)
michael@0 919 fHitEnd = TRUE;
michael@0 920 return FALSE;
michael@0 921 }
michael@0 922 }
michael@0 923
michael@0 924
michael@0 925 // Compute the position in the input string beyond which a match can not begin, because
michael@0 926 // the minimum length match would extend past the end of the input.
michael@0 927 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int.
michael@0 928 // Be aware of possible overflows if making changes here.
michael@0 929 int32_t testLen = (int32_t)(fActiveLimit - fPattern->fMinMatchLen);
michael@0 930 if (startPos > testLen) {
michael@0 931 fMatch = FALSE;
michael@0 932 fHitEnd = TRUE;
michael@0 933 return FALSE;
michael@0 934 }
michael@0 935
michael@0 936 UChar32 c;
michael@0 937 U_ASSERT(startPos >= 0);
michael@0 938
michael@0 939 switch (fPattern->fStartType) {
michael@0 940 case START_NO_INFO:
michael@0 941 // No optimization was found.
michael@0 942 // Try a match at each input position.
michael@0 943 for (;;) {
michael@0 944 MatchChunkAt(startPos, FALSE, fDeferredStatus);
michael@0 945 if (U_FAILURE(fDeferredStatus)) {
michael@0 946 return FALSE;
michael@0 947 }
michael@0 948 if (fMatch) {
michael@0 949 return TRUE;
michael@0 950 }
michael@0 951 if (startPos >= testLen) {
michael@0 952 fHitEnd = TRUE;
michael@0 953 return FALSE;
michael@0 954 }
michael@0 955 U16_FWD_1(inputBuf, startPos, fActiveLimit);
michael@0 956 // Note that it's perfectly OK for a pattern to have a zero-length
michael@0 957 // match at the end of a string, so we must make sure that the loop
michael@0 958 // runs with startPos == testLen the last time through.
michael@0 959 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
michael@0 960 return FALSE;
michael@0 961 }
michael@0 962 U_ASSERT(FALSE);
michael@0 963
michael@0 964 case START_START:
michael@0 965 // Matches are only possible at the start of the input string
michael@0 966 // (pattern begins with ^ or \A)
michael@0 967 if (startPos > fActiveStart) {
michael@0 968 fMatch = FALSE;
michael@0 969 return FALSE;
michael@0 970 }
michael@0 971 MatchChunkAt(startPos, FALSE, fDeferredStatus);
michael@0 972 if (U_FAILURE(fDeferredStatus)) {
michael@0 973 return FALSE;
michael@0 974 }
michael@0 975 return fMatch;
michael@0 976
michael@0 977
michael@0 978 case START_SET:
michael@0 979 {
michael@0 980 // Match may start on any char from a pre-computed set.
michael@0 981 U_ASSERT(fPattern->fMinMatchLen > 0);
michael@0 982 for (;;) {
michael@0 983 int32_t pos = startPos;
michael@0 984 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++];
michael@0 985 if ((c<256 && fPattern->fInitialChars8->contains(c)) ||
michael@0 986 (c>=256 && fPattern->fInitialChars->contains(c))) {
michael@0 987 MatchChunkAt(pos, FALSE, fDeferredStatus);
michael@0 988 if (U_FAILURE(fDeferredStatus)) {
michael@0 989 return FALSE;
michael@0 990 }
michael@0 991 if (fMatch) {
michael@0 992 return TRUE;
michael@0 993 }
michael@0 994 }
michael@0 995 if (pos >= testLen) {
michael@0 996 fMatch = FALSE;
michael@0 997 fHitEnd = TRUE;
michael@0 998 return FALSE;
michael@0 999 }
michael@0 1000 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
michael@0 1001 return FALSE;
michael@0 1002 }
michael@0 1003 }
michael@0 1004 U_ASSERT(FALSE);
michael@0 1005
michael@0 1006 case START_STRING:
michael@0 1007 case START_CHAR:
michael@0 1008 {
michael@0 1009 // Match starts on exactly one char.
michael@0 1010 U_ASSERT(fPattern->fMinMatchLen > 0);
michael@0 1011 UChar32 theChar = fPattern->fInitialChar;
michael@0 1012 for (;;) {
michael@0 1013 int32_t pos = startPos;
michael@0 1014 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++];
michael@0 1015 if (c == theChar) {
michael@0 1016 MatchChunkAt(pos, FALSE, fDeferredStatus);
michael@0 1017 if (U_FAILURE(fDeferredStatus)) {
michael@0 1018 return FALSE;
michael@0 1019 }
michael@0 1020 if (fMatch) {
michael@0 1021 return TRUE;
michael@0 1022 }
michael@0 1023 }
michael@0 1024 if (pos >= testLen) {
michael@0 1025 fMatch = FALSE;
michael@0 1026 fHitEnd = TRUE;
michael@0 1027 return FALSE;
michael@0 1028 }
michael@0 1029 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
michael@0 1030 return FALSE;
michael@0 1031 }
michael@0 1032 }
michael@0 1033 U_ASSERT(FALSE);
michael@0 1034
michael@0 1035 case START_LINE:
michael@0 1036 {
michael@0 1037 UChar32 c;
michael@0 1038 if (startPos == fAnchorStart) {
michael@0 1039 MatchChunkAt(startPos, FALSE, fDeferredStatus);
michael@0 1040 if (U_FAILURE(fDeferredStatus)) {
michael@0 1041 return FALSE;
michael@0 1042 }
michael@0 1043 if (fMatch) {
michael@0 1044 return TRUE;
michael@0 1045 }
michael@0 1046 U16_FWD_1(inputBuf, startPos, fActiveLimit);
michael@0 1047 }
michael@0 1048
michael@0 1049 if (fPattern->fFlags & UREGEX_UNIX_LINES) {
michael@0 1050 for (;;) {
michael@0 1051 c = inputBuf[startPos-1];
michael@0 1052 if (c == 0x0a) {
michael@0 1053 MatchChunkAt(startPos, FALSE, fDeferredStatus);
michael@0 1054 if (U_FAILURE(fDeferredStatus)) {
michael@0 1055 return FALSE;
michael@0 1056 }
michael@0 1057 if (fMatch) {
michael@0 1058 return TRUE;
michael@0 1059 }
michael@0 1060 }
michael@0 1061 if (startPos >= testLen) {
michael@0 1062 fMatch = FALSE;
michael@0 1063 fHitEnd = TRUE;
michael@0 1064 return FALSE;
michael@0 1065 }
michael@0 1066 U16_FWD_1(inputBuf, startPos, fActiveLimit);
michael@0 1067 // Note that it's perfectly OK for a pattern to have a zero-length
michael@0 1068 // match at the end of a string, so we must make sure that the loop
michael@0 1069 // runs with startPos == testLen the last time through.
michael@0 1070 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
michael@0 1071 return FALSE;
michael@0 1072 }
michael@0 1073 } else {
michael@0 1074 for (;;) {
michael@0 1075 c = inputBuf[startPos-1];
michael@0 1076 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
michael@0 1077 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) {
michael@0 1078 if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) {
michael@0 1079 startPos++;
michael@0 1080 }
michael@0 1081 MatchChunkAt(startPos, FALSE, fDeferredStatus);
michael@0 1082 if (U_FAILURE(fDeferredStatus)) {
michael@0 1083 return FALSE;
michael@0 1084 }
michael@0 1085 if (fMatch) {
michael@0 1086 return TRUE;
michael@0 1087 }
michael@0 1088 }
michael@0 1089 if (startPos >= testLen) {
michael@0 1090 fMatch = FALSE;
michael@0 1091 fHitEnd = TRUE;
michael@0 1092 return FALSE;
michael@0 1093 }
michael@0 1094 U16_FWD_1(inputBuf, startPos, fActiveLimit);
michael@0 1095 // Note that it's perfectly OK for a pattern to have a zero-length
michael@0 1096 // match at the end of a string, so we must make sure that the loop
michael@0 1097 // runs with startPos == testLen the last time through.
michael@0 1098 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
michael@0 1099 return FALSE;
michael@0 1100 }
michael@0 1101 }
michael@0 1102 }
michael@0 1103
michael@0 1104 default:
michael@0 1105 U_ASSERT(FALSE);
michael@0 1106 }
michael@0 1107
michael@0 1108 U_ASSERT(FALSE);
michael@0 1109 return FALSE;
michael@0 1110 }
michael@0 1111
michael@0 1112
michael@0 1113
michael@0 1114 //--------------------------------------------------------------------------------
michael@0 1115 //
michael@0 1116 // group()
michael@0 1117 //
michael@0 1118 //--------------------------------------------------------------------------------
michael@0 1119 UnicodeString RegexMatcher::group(UErrorCode &status) const {
michael@0 1120 return group(0, status);
michael@0 1121 }
michael@0 1122
michael@0 1123 // Return immutable shallow clone
michael@0 1124 UText *RegexMatcher::group(UText *dest, int64_t &group_len, UErrorCode &status) const {
michael@0 1125 return group(0, dest, group_len, status);
michael@0 1126 }
michael@0 1127
michael@0 1128 // Return immutable shallow clone
michael@0 1129 UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const {
michael@0 1130 group_len = 0;
michael@0 1131 UBool bailOut = FALSE;
michael@0 1132 if (U_FAILURE(status)) {
michael@0 1133 return dest;
michael@0 1134 }
michael@0 1135 if (U_FAILURE(fDeferredStatus)) {
michael@0 1136 status = fDeferredStatus;
michael@0 1137 bailOut = TRUE;
michael@0 1138 }
michael@0 1139 if (fMatch == FALSE) {
michael@0 1140 status = U_REGEX_INVALID_STATE;
michael@0 1141 bailOut = TRUE;
michael@0 1142 }
michael@0 1143 if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
michael@0 1144 status = U_INDEX_OUTOFBOUNDS_ERROR;
michael@0 1145 bailOut = TRUE;
michael@0 1146 }
michael@0 1147
michael@0 1148 if (bailOut) {
michael@0 1149 return (dest) ? dest : utext_openUChars(NULL, NULL, 0, &status);
michael@0 1150 }
michael@0 1151
michael@0 1152 int64_t s, e;
michael@0 1153 if (groupNum == 0) {
michael@0 1154 s = fMatchStart;
michael@0 1155 e = fMatchEnd;
michael@0 1156 } else {
michael@0 1157 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
michael@0 1158 U_ASSERT(groupOffset < fPattern->fFrameSize);
michael@0 1159 U_ASSERT(groupOffset >= 0);
michael@0 1160 s = fFrame->fExtra[groupOffset];
michael@0 1161 e = fFrame->fExtra[groupOffset+1];
michael@0 1162 }
michael@0 1163
michael@0 1164 if (s < 0) {
michael@0 1165 // A capture group wasn't part of the match
michael@0 1166 return utext_clone(dest, fInputText, FALSE, TRUE, &status);
michael@0 1167 }
michael@0 1168 U_ASSERT(s <= e);
michael@0 1169 group_len = e - s;
michael@0 1170
michael@0 1171 dest = utext_clone(dest, fInputText, FALSE, TRUE, &status);
michael@0 1172 if (dest)
michael@0 1173 UTEXT_SETNATIVEINDEX(dest, s);
michael@0 1174 return dest;
michael@0 1175 }
michael@0 1176
michael@0 1177 UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
michael@0 1178 UnicodeString result;
michael@0 1179 if (U_FAILURE(status)) {
michael@0 1180 return result;
michael@0 1181 }
michael@0 1182 UText resultText = UTEXT_INITIALIZER;
michael@0 1183 utext_openUnicodeString(&resultText, &result, &status);
michael@0 1184 group(groupNum, &resultText, status);
michael@0 1185 utext_close(&resultText);
michael@0 1186 return result;
michael@0 1187 }
michael@0 1188
michael@0 1189
michael@0 1190 // Return deep (mutable) clone
michael@0 1191 // Technology Preview (as an API), but note that the UnicodeString API is implemented
michael@0 1192 // using this function.
michael@0 1193 UText *RegexMatcher::group(int32_t groupNum, UText *dest, UErrorCode &status) const {
michael@0 1194 UBool bailOut = FALSE;
michael@0 1195 if (U_FAILURE(status)) {
michael@0 1196 return dest;
michael@0 1197 }
michael@0 1198 if (U_FAILURE(fDeferredStatus)) {
michael@0 1199 status = fDeferredStatus;
michael@0 1200 bailOut = TRUE;
michael@0 1201 }
michael@0 1202
michael@0 1203 if (fMatch == FALSE) {
michael@0 1204 status = U_REGEX_INVALID_STATE;
michael@0 1205 bailOut = TRUE;
michael@0 1206 }
michael@0 1207 if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
michael@0 1208 status = U_INDEX_OUTOFBOUNDS_ERROR;
michael@0 1209 bailOut = TRUE;
michael@0 1210 }
michael@0 1211
michael@0 1212 if (bailOut) {
michael@0 1213 if (dest) {
michael@0 1214 utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
michael@0 1215 return dest;
michael@0 1216 } else {
michael@0 1217 return utext_openUChars(NULL, NULL, 0, &status);
michael@0 1218 }
michael@0 1219 }
michael@0 1220
michael@0 1221 int64_t s, e;
michael@0 1222 if (groupNum == 0) {
michael@0 1223 s = fMatchStart;
michael@0 1224 e = fMatchEnd;
michael@0 1225 } else {
michael@0 1226 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
michael@0 1227 U_ASSERT(groupOffset < fPattern->fFrameSize);
michael@0 1228 U_ASSERT(groupOffset >= 0);
michael@0 1229 s = fFrame->fExtra[groupOffset];
michael@0 1230 e = fFrame->fExtra[groupOffset+1];
michael@0 1231 }
michael@0 1232
michael@0 1233 if (s < 0) {
michael@0 1234 // A capture group wasn't part of the match
michael@0 1235 if (dest) {
michael@0 1236 utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
michael@0 1237 return dest;
michael@0 1238 } else {
michael@0 1239 return utext_openUChars(NULL, NULL, 0, &status);
michael@0 1240 }
michael@0 1241 }
michael@0 1242 U_ASSERT(s <= e);
michael@0 1243
michael@0 1244 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
michael@0 1245 U_ASSERT(e <= fInputLength);
michael@0 1246 if (dest) {
michael@0 1247 utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents+s, (int32_t)(e-s), &status);
michael@0 1248 } else {
michael@0 1249 UText groupText = UTEXT_INITIALIZER;
michael@0 1250 utext_openUChars(&groupText, fInputText->chunkContents+s, e-s, &status);
michael@0 1251 dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status);
michael@0 1252 utext_close(&groupText);
michael@0 1253 }
michael@0 1254 } else {
michael@0 1255 int32_t len16;
michael@0 1256 if (UTEXT_USES_U16(fInputText)) {
michael@0 1257 len16 = (int32_t)(e-s);
michael@0 1258 } else {
michael@0 1259 UErrorCode lengthStatus = U_ZERO_ERROR;
michael@0 1260 len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
michael@0 1261 }
michael@0 1262 UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
michael@0 1263 if (groupChars == NULL) {
michael@0 1264 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 1265 return dest;
michael@0 1266 }
michael@0 1267 utext_extract(fInputText, s, e, groupChars, len16+1, &status);
michael@0 1268
michael@0 1269 if (dest) {
michael@0 1270 utext_replace(dest, 0, utext_nativeLength(dest), groupChars, len16, &status);
michael@0 1271 } else {
michael@0 1272 UText groupText = UTEXT_INITIALIZER;
michael@0 1273 utext_openUChars(&groupText, groupChars, len16, &status);
michael@0 1274 dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status);
michael@0 1275 utext_close(&groupText);
michael@0 1276 }
michael@0 1277
michael@0 1278 uprv_free(groupChars);
michael@0 1279 }
michael@0 1280 return dest;
michael@0 1281 }
michael@0 1282
michael@0 1283 //--------------------------------------------------------------------------------
michael@0 1284 //
michael@0 1285 // appendGroup() -- currently internal only, appends a group to a UText rather
michael@0 1286 // than replacing its contents
michael@0 1287 //
michael@0 1288 //--------------------------------------------------------------------------------
michael@0 1289
michael@0 1290 int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const {
michael@0 1291 if (U_FAILURE(status)) {
michael@0 1292 return 0;
michael@0 1293 }
michael@0 1294 if (U_FAILURE(fDeferredStatus)) {
michael@0 1295 status = fDeferredStatus;
michael@0 1296 return 0;
michael@0 1297 }
michael@0 1298 int64_t destLen = utext_nativeLength(dest);
michael@0 1299
michael@0 1300 if (fMatch == FALSE) {
michael@0 1301 status = U_REGEX_INVALID_STATE;
michael@0 1302 return utext_replace(dest, destLen, destLen, NULL, 0, &status);
michael@0 1303 }
michael@0 1304 if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
michael@0 1305 status = U_INDEX_OUTOFBOUNDS_ERROR;
michael@0 1306 return utext_replace(dest, destLen, destLen, NULL, 0, &status);
michael@0 1307 }
michael@0 1308
michael@0 1309 int64_t s, e;
michael@0 1310 if (groupNum == 0) {
michael@0 1311 s = fMatchStart;
michael@0 1312 e = fMatchEnd;
michael@0 1313 } else {
michael@0 1314 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
michael@0 1315 U_ASSERT(groupOffset < fPattern->fFrameSize);
michael@0 1316 U_ASSERT(groupOffset >= 0);
michael@0 1317 s = fFrame->fExtra[groupOffset];
michael@0 1318 e = fFrame->fExtra[groupOffset+1];
michael@0 1319 }
michael@0 1320
michael@0 1321 if (s < 0) {
michael@0 1322 // A capture group wasn't part of the match
michael@0 1323 return utext_replace(dest, destLen, destLen, NULL, 0, &status);
michael@0 1324 }
michael@0 1325 U_ASSERT(s <= e);
michael@0 1326
michael@0 1327 int64_t deltaLen;
michael@0 1328 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
michael@0 1329 U_ASSERT(e <= fInputLength);
michael@0 1330 deltaLen = utext_replace(dest, destLen, destLen, fInputText->chunkContents+s, (int32_t)(e-s), &status);
michael@0 1331 } else {
michael@0 1332 int32_t len16;
michael@0 1333 if (UTEXT_USES_U16(fInputText)) {
michael@0 1334 len16 = (int32_t)(e-s);
michael@0 1335 } else {
michael@0 1336 UErrorCode lengthStatus = U_ZERO_ERROR;
michael@0 1337 len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
michael@0 1338 }
michael@0 1339 UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
michael@0 1340 if (groupChars == NULL) {
michael@0 1341 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 1342 return 0;
michael@0 1343 }
michael@0 1344 utext_extract(fInputText, s, e, groupChars, len16+1, &status);
michael@0 1345
michael@0 1346 deltaLen = utext_replace(dest, destLen, destLen, groupChars, len16, &status);
michael@0 1347 uprv_free(groupChars);
michael@0 1348 }
michael@0 1349 return deltaLen;
michael@0 1350 }
michael@0 1351
michael@0 1352
michael@0 1353
michael@0 1354 //--------------------------------------------------------------------------------
michael@0 1355 //
michael@0 1356 // groupCount()
michael@0 1357 //
michael@0 1358 //--------------------------------------------------------------------------------
michael@0 1359 int32_t RegexMatcher::groupCount() const {
michael@0 1360 return fPattern->fGroupMap->size();
michael@0 1361 }
michael@0 1362
michael@0 1363
michael@0 1364
michael@0 1365 //--------------------------------------------------------------------------------
michael@0 1366 //
michael@0 1367 // hasAnchoringBounds()
michael@0 1368 //
michael@0 1369 //--------------------------------------------------------------------------------
michael@0 1370 UBool RegexMatcher::hasAnchoringBounds() const {
michael@0 1371 return fAnchoringBounds;
michael@0 1372 }
michael@0 1373
michael@0 1374
michael@0 1375 //--------------------------------------------------------------------------------
michael@0 1376 //
michael@0 1377 // hasTransparentBounds()
michael@0 1378 //
michael@0 1379 //--------------------------------------------------------------------------------
michael@0 1380 UBool RegexMatcher::hasTransparentBounds() const {
michael@0 1381 return fTransparentBounds;
michael@0 1382 }
michael@0 1383
michael@0 1384
michael@0 1385
michael@0 1386 //--------------------------------------------------------------------------------
michael@0 1387 //
michael@0 1388 // hitEnd()
michael@0 1389 //
michael@0 1390 //--------------------------------------------------------------------------------
michael@0 1391 UBool RegexMatcher::hitEnd() const {
michael@0 1392 return fHitEnd;
michael@0 1393 }
michael@0 1394
michael@0 1395
michael@0 1396 //--------------------------------------------------------------------------------
michael@0 1397 //
michael@0 1398 // input()
michael@0 1399 //
michael@0 1400 //--------------------------------------------------------------------------------
michael@0 1401 const UnicodeString &RegexMatcher::input() const {
michael@0 1402 if (!fInput) {
michael@0 1403 UErrorCode status = U_ZERO_ERROR;
michael@0 1404 int32_t len16;
michael@0 1405 if (UTEXT_USES_U16(fInputText)) {
michael@0 1406 len16 = (int32_t)fInputLength;
michael@0 1407 } else {
michael@0 1408 len16 = utext_extract(fInputText, 0, fInputLength, NULL, 0, &status);
michael@0 1409 status = U_ZERO_ERROR; // overflow, length status
michael@0 1410 }
michael@0 1411 UnicodeString *result = new UnicodeString(len16, 0, 0);
michael@0 1412
michael@0 1413 UChar *inputChars = result->getBuffer(len16);
michael@0 1414 utext_extract(fInputText, 0, fInputLength, inputChars, len16, &status); // unterminated warning
michael@0 1415 result->releaseBuffer(len16);
michael@0 1416
michael@0 1417 (*(const UnicodeString **)&fInput) = result; // pointer assignment, rather than operator=
michael@0 1418 }
michael@0 1419
michael@0 1420 return *fInput;
michael@0 1421 }
michael@0 1422
michael@0 1423 //--------------------------------------------------------------------------------
michael@0 1424 //
michael@0 1425 // inputText()
michael@0 1426 //
michael@0 1427 //--------------------------------------------------------------------------------
michael@0 1428 UText *RegexMatcher::inputText() const {
michael@0 1429 return fInputText;
michael@0 1430 }
michael@0 1431
michael@0 1432
michael@0 1433 //--------------------------------------------------------------------------------
michael@0 1434 //
michael@0 1435 // getInput() -- like inputText(), but makes a clone or copies into another UText
michael@0 1436 //
michael@0 1437 //--------------------------------------------------------------------------------
michael@0 1438 UText *RegexMatcher::getInput (UText *dest, UErrorCode &status) const {
michael@0 1439 UBool bailOut = FALSE;
michael@0 1440 if (U_FAILURE(status)) {
michael@0 1441 return dest;
michael@0 1442 }
michael@0 1443 if (U_FAILURE(fDeferredStatus)) {
michael@0 1444 status = fDeferredStatus;
michael@0 1445 bailOut = TRUE;
michael@0 1446 }
michael@0 1447
michael@0 1448 if (bailOut) {
michael@0 1449 if (dest) {
michael@0 1450 utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
michael@0 1451 return dest;
michael@0 1452 } else {
michael@0 1453 return utext_clone(NULL, fInputText, FALSE, TRUE, &status);
michael@0 1454 }
michael@0 1455 }
michael@0 1456
michael@0 1457 if (dest) {
michael@0 1458 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
michael@0 1459 utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents, (int32_t)fInputLength, &status);
michael@0 1460 } else {
michael@0 1461 int32_t input16Len;
michael@0 1462 if (UTEXT_USES_U16(fInputText)) {
michael@0 1463 input16Len = (int32_t)fInputLength;
michael@0 1464 } else {
michael@0 1465 UErrorCode lengthStatus = U_ZERO_ERROR;
michael@0 1466 input16Len = utext_extract(fInputText, 0, fInputLength, NULL, 0, &lengthStatus); // buffer overflow error
michael@0 1467 }
michael@0 1468 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(input16Len));
michael@0 1469 if (inputChars == NULL) {
michael@0 1470 return dest;
michael@0 1471 }
michael@0 1472
michael@0 1473 status = U_ZERO_ERROR;
michael@0 1474 utext_extract(fInputText, 0, fInputLength, inputChars, input16Len, &status); // not terminated warning
michael@0 1475 status = U_ZERO_ERROR;
michael@0 1476 utext_replace(dest, 0, utext_nativeLength(dest), inputChars, input16Len, &status);
michael@0 1477
michael@0 1478 uprv_free(inputChars);
michael@0 1479 }
michael@0 1480 return dest;
michael@0 1481 } else {
michael@0 1482 return utext_clone(NULL, fInputText, FALSE, TRUE, &status);
michael@0 1483 }
michael@0 1484 }
michael@0 1485
michael@0 1486
michael@0 1487 static UBool compat_SyncMutableUTextContents(UText *ut);
michael@0 1488 static UBool compat_SyncMutableUTextContents(UText *ut) {
michael@0 1489 UBool retVal = FALSE;
michael@0 1490
michael@0 1491 // In the following test, we're really only interested in whether the UText should switch
michael@0 1492 // between heap and stack allocation. If length hasn't changed, we won't, so the chunkContents
michael@0 1493 // will still point to the correct data.
michael@0 1494 if (utext_nativeLength(ut) != ut->nativeIndexingLimit) {
michael@0 1495 UnicodeString *us=(UnicodeString *)ut->context;
michael@0 1496
michael@0 1497 // Update to the latest length.
michael@0 1498 // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit).
michael@0 1499 int32_t newLength = us->length();
michael@0 1500
michael@0 1501 // Update the chunk description.
michael@0 1502 // The buffer may have switched between stack- and heap-based.
michael@0 1503 ut->chunkContents = us->getBuffer();
michael@0 1504 ut->chunkLength = newLength;
michael@0 1505 ut->chunkNativeLimit = newLength;
michael@0 1506 ut->nativeIndexingLimit = newLength;
michael@0 1507 retVal = TRUE;
michael@0 1508 }
michael@0 1509
michael@0 1510 return retVal;
michael@0 1511 }
michael@0 1512
michael@0 1513 //--------------------------------------------------------------------------------
michael@0 1514 //
michael@0 1515 // lookingAt()
michael@0 1516 //
michael@0 1517 //--------------------------------------------------------------------------------
michael@0 1518 UBool RegexMatcher::lookingAt(UErrorCode &status) {
michael@0 1519 if (U_FAILURE(status)) {
michael@0 1520 return FALSE;
michael@0 1521 }
michael@0 1522 if (U_FAILURE(fDeferredStatus)) {
michael@0 1523 status = fDeferredStatus;
michael@0 1524 return FALSE;
michael@0 1525 }
michael@0 1526
michael@0 1527 if (fInputUniStrMaybeMutable) {
michael@0 1528 if (compat_SyncMutableUTextContents(fInputText)) {
michael@0 1529 fInputLength = utext_nativeLength(fInputText);
michael@0 1530 reset();
michael@0 1531 }
michael@0 1532 }
michael@0 1533 else {
michael@0 1534 resetPreserveRegion();
michael@0 1535 }
michael@0 1536 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
michael@0 1537 MatchChunkAt((int32_t)fActiveStart, FALSE, status);
michael@0 1538 } else {
michael@0 1539 MatchAt(fActiveStart, FALSE, status);
michael@0 1540 }
michael@0 1541 return fMatch;
michael@0 1542 }
michael@0 1543
michael@0 1544
michael@0 1545 UBool RegexMatcher::lookingAt(int64_t start, UErrorCode &status) {
michael@0 1546 if (U_FAILURE(status)) {
michael@0 1547 return FALSE;
michael@0 1548 }
michael@0 1549 if (U_FAILURE(fDeferredStatus)) {
michael@0 1550 status = fDeferredStatus;
michael@0 1551 return FALSE;
michael@0 1552 }
michael@0 1553 reset();
michael@0 1554
michael@0 1555 if (start < 0) {
michael@0 1556 status = U_INDEX_OUTOFBOUNDS_ERROR;
michael@0 1557 return FALSE;
michael@0 1558 }
michael@0 1559
michael@0 1560 if (fInputUniStrMaybeMutable) {
michael@0 1561 if (compat_SyncMutableUTextContents(fInputText)) {
michael@0 1562 fInputLength = utext_nativeLength(fInputText);
michael@0 1563 reset();
michael@0 1564 }
michael@0 1565 }
michael@0 1566
michael@0 1567 int64_t nativeStart;
michael@0 1568 nativeStart = start;
michael@0 1569 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
michael@0 1570 status = U_INDEX_OUTOFBOUNDS_ERROR;
michael@0 1571 return FALSE;
michael@0 1572 }
michael@0 1573
michael@0 1574 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
michael@0 1575 MatchChunkAt((int32_t)nativeStart, FALSE, status);
michael@0 1576 } else {
michael@0 1577 MatchAt(nativeStart, FALSE, status);
michael@0 1578 }
michael@0 1579 return fMatch;
michael@0 1580 }
michael@0 1581
michael@0 1582
michael@0 1583
michael@0 1584 //--------------------------------------------------------------------------------
michael@0 1585 //
michael@0 1586 // matches()
michael@0 1587 //
michael@0 1588 //--------------------------------------------------------------------------------
michael@0 1589 UBool RegexMatcher::matches(UErrorCode &status) {
michael@0 1590 if (U_FAILURE(status)) {
michael@0 1591 return FALSE;
michael@0 1592 }
michael@0 1593 if (U_FAILURE(fDeferredStatus)) {
michael@0 1594 status = fDeferredStatus;
michael@0 1595 return FALSE;
michael@0 1596 }
michael@0 1597
michael@0 1598 if (fInputUniStrMaybeMutable) {
michael@0 1599 if (compat_SyncMutableUTextContents(fInputText)) {
michael@0 1600 fInputLength = utext_nativeLength(fInputText);
michael@0 1601 reset();
michael@0 1602 }
michael@0 1603 }
michael@0 1604 else {
michael@0 1605 resetPreserveRegion();
michael@0 1606 }
michael@0 1607
michael@0 1608 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
michael@0 1609 MatchChunkAt((int32_t)fActiveStart, TRUE, status);
michael@0 1610 } else {
michael@0 1611 MatchAt(fActiveStart, TRUE, status);
michael@0 1612 }
michael@0 1613 return fMatch;
michael@0 1614 }
michael@0 1615
michael@0 1616
michael@0 1617 UBool RegexMatcher::matches(int64_t start, UErrorCode &status) {
michael@0 1618 if (U_FAILURE(status)) {
michael@0 1619 return FALSE;
michael@0 1620 }
michael@0 1621 if (U_FAILURE(fDeferredStatus)) {
michael@0 1622 status = fDeferredStatus;
michael@0 1623 return FALSE;
michael@0 1624 }
michael@0 1625 reset();
michael@0 1626
michael@0 1627 if (start < 0) {
michael@0 1628 status = U_INDEX_OUTOFBOUNDS_ERROR;
michael@0 1629 return FALSE;
michael@0 1630 }
michael@0 1631
michael@0 1632 if (fInputUniStrMaybeMutable) {
michael@0 1633 if (compat_SyncMutableUTextContents(fInputText)) {
michael@0 1634 fInputLength = utext_nativeLength(fInputText);
michael@0 1635 reset();
michael@0 1636 }
michael@0 1637 }
michael@0 1638
michael@0 1639 int64_t nativeStart;
michael@0 1640 nativeStart = start;
michael@0 1641 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
michael@0 1642 status = U_INDEX_OUTOFBOUNDS_ERROR;
michael@0 1643 return FALSE;
michael@0 1644 }
michael@0 1645
michael@0 1646 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
michael@0 1647 MatchChunkAt((int32_t)nativeStart, TRUE, status);
michael@0 1648 } else {
michael@0 1649 MatchAt(nativeStart, TRUE, status);
michael@0 1650 }
michael@0 1651 return fMatch;
michael@0 1652 }
michael@0 1653
michael@0 1654
michael@0 1655
michael@0 1656 //--------------------------------------------------------------------------------
michael@0 1657 //
michael@0 1658 // pattern
michael@0 1659 //
michael@0 1660 //--------------------------------------------------------------------------------
michael@0 1661 const RegexPattern &RegexMatcher::pattern() const {
michael@0 1662 return *fPattern;
michael@0 1663 }
michael@0 1664
michael@0 1665
michael@0 1666
michael@0 1667 //--------------------------------------------------------------------------------
michael@0 1668 //
michael@0 1669 // region
michael@0 1670 //
michael@0 1671 //--------------------------------------------------------------------------------
michael@0 1672 RegexMatcher &RegexMatcher::region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status) {
michael@0 1673 if (U_FAILURE(status)) {
michael@0 1674 return *this;
michael@0 1675 }
michael@0 1676
michael@0 1677 if (regionStart>regionLimit || regionStart<0 || regionLimit<0) {
michael@0 1678 status = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 1679 }
michael@0 1680
michael@0 1681 int64_t nativeStart = regionStart;
michael@0 1682 int64_t nativeLimit = regionLimit;
michael@0 1683 if (nativeStart > fInputLength || nativeLimit > fInputLength) {
michael@0 1684 status = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 1685 }
michael@0 1686
michael@0 1687 if (startIndex == -1)
michael@0 1688 this->reset();
michael@0 1689 else
michael@0 1690 resetPreserveRegion();
michael@0 1691
michael@0 1692 fRegionStart = nativeStart;
michael@0 1693 fRegionLimit = nativeLimit;
michael@0 1694 fActiveStart = nativeStart;
michael@0 1695 fActiveLimit = nativeLimit;
michael@0 1696
michael@0 1697 if (startIndex != -1) {
michael@0 1698 if (startIndex < fActiveStart || startIndex > fActiveLimit) {
michael@0 1699 status = U_INDEX_OUTOFBOUNDS_ERROR;
michael@0 1700 }
michael@0 1701 fMatchEnd = startIndex;
michael@0 1702 }
michael@0 1703
michael@0 1704 if (!fTransparentBounds) {
michael@0 1705 fLookStart = nativeStart;
michael@0 1706 fLookLimit = nativeLimit;
michael@0 1707 }
michael@0 1708 if (fAnchoringBounds) {
michael@0 1709 fAnchorStart = nativeStart;
michael@0 1710 fAnchorLimit = nativeLimit;
michael@0 1711 }
michael@0 1712 return *this;
michael@0 1713 }
michael@0 1714
michael@0 1715 RegexMatcher &RegexMatcher::region(int64_t start, int64_t limit, UErrorCode &status) {
michael@0 1716 return region(start, limit, -1, status);
michael@0 1717 }
michael@0 1718
michael@0 1719 //--------------------------------------------------------------------------------
michael@0 1720 //
michael@0 1721 // regionEnd
michael@0 1722 //
michael@0 1723 //--------------------------------------------------------------------------------
michael@0 1724 int32_t RegexMatcher::regionEnd() const {
michael@0 1725 return (int32_t)fRegionLimit;
michael@0 1726 }
michael@0 1727
michael@0 1728 int64_t RegexMatcher::regionEnd64() const {
michael@0 1729 return fRegionLimit;
michael@0 1730 }
michael@0 1731
michael@0 1732 //--------------------------------------------------------------------------------
michael@0 1733 //
michael@0 1734 // regionStart
michael@0 1735 //
michael@0 1736 //--------------------------------------------------------------------------------
michael@0 1737 int32_t RegexMatcher::regionStart() const {
michael@0 1738 return (int32_t)fRegionStart;
michael@0 1739 }
michael@0 1740
michael@0 1741 int64_t RegexMatcher::regionStart64() const {
michael@0 1742 return fRegionStart;
michael@0 1743 }
michael@0 1744
michael@0 1745
michael@0 1746 //--------------------------------------------------------------------------------
michael@0 1747 //
michael@0 1748 // replaceAll
michael@0 1749 //
michael@0 1750 //--------------------------------------------------------------------------------
michael@0 1751 UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) {
michael@0 1752 UText replacementText = UTEXT_INITIALIZER;
michael@0 1753 UText resultText = UTEXT_INITIALIZER;
michael@0 1754 UnicodeString resultString;
michael@0 1755 if (U_FAILURE(status)) {
michael@0 1756 return resultString;
michael@0 1757 }
michael@0 1758
michael@0 1759 utext_openConstUnicodeString(&replacementText, &replacement, &status);
michael@0 1760 utext_openUnicodeString(&resultText, &resultString, &status);
michael@0 1761
michael@0 1762 replaceAll(&replacementText, &resultText, status);
michael@0 1763
michael@0 1764 utext_close(&resultText);
michael@0 1765 utext_close(&replacementText);
michael@0 1766
michael@0 1767 return resultString;
michael@0 1768 }
michael@0 1769
michael@0 1770
michael@0 1771 //
michael@0 1772 // replaceAll, UText mode
michael@0 1773 //
michael@0 1774 UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &status) {
michael@0 1775 if (U_FAILURE(status)) {
michael@0 1776 return dest;
michael@0 1777 }
michael@0 1778 if (U_FAILURE(fDeferredStatus)) {
michael@0 1779 status = fDeferredStatus;
michael@0 1780 return dest;
michael@0 1781 }
michael@0 1782
michael@0 1783 if (dest == NULL) {
michael@0 1784 UnicodeString emptyString;
michael@0 1785 UText empty = UTEXT_INITIALIZER;
michael@0 1786
michael@0 1787 utext_openUnicodeString(&empty, &emptyString, &status);
michael@0 1788 dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
michael@0 1789 utext_close(&empty);
michael@0 1790 }
michael@0 1791
michael@0 1792 if (U_SUCCESS(status)) {
michael@0 1793 reset();
michael@0 1794 while (find()) {
michael@0 1795 appendReplacement(dest, replacement, status);
michael@0 1796 if (U_FAILURE(status)) {
michael@0 1797 break;
michael@0 1798 }
michael@0 1799 }
michael@0 1800 appendTail(dest, status);
michael@0 1801 }
michael@0 1802
michael@0 1803 return dest;
michael@0 1804 }
michael@0 1805
michael@0 1806
michael@0 1807 //--------------------------------------------------------------------------------
michael@0 1808 //
michael@0 1809 // replaceFirst
michael@0 1810 //
michael@0 1811 //--------------------------------------------------------------------------------
michael@0 1812 UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) {
michael@0 1813 UText replacementText = UTEXT_INITIALIZER;
michael@0 1814 UText resultText = UTEXT_INITIALIZER;
michael@0 1815 UnicodeString resultString;
michael@0 1816
michael@0 1817 utext_openConstUnicodeString(&replacementText, &replacement, &status);
michael@0 1818 utext_openUnicodeString(&resultText, &resultString, &status);
michael@0 1819
michael@0 1820 replaceFirst(&replacementText, &resultText, status);
michael@0 1821
michael@0 1822 utext_close(&resultText);
michael@0 1823 utext_close(&replacementText);
michael@0 1824
michael@0 1825 return resultString;
michael@0 1826 }
michael@0 1827
michael@0 1828 //
michael@0 1829 // replaceFirst, UText mode
michael@0 1830 //
michael@0 1831 UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &status) {
michael@0 1832 if (U_FAILURE(status)) {
michael@0 1833 return dest;
michael@0 1834 }
michael@0 1835 if (U_FAILURE(fDeferredStatus)) {
michael@0 1836 status = fDeferredStatus;
michael@0 1837 return dest;
michael@0 1838 }
michael@0 1839
michael@0 1840 reset();
michael@0 1841 if (!find()) {
michael@0 1842 return getInput(dest, status);
michael@0 1843 }
michael@0 1844
michael@0 1845 if (dest == NULL) {
michael@0 1846 UnicodeString emptyString;
michael@0 1847 UText empty = UTEXT_INITIALIZER;
michael@0 1848
michael@0 1849 utext_openUnicodeString(&empty, &emptyString, &status);
michael@0 1850 dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
michael@0 1851 utext_close(&empty);
michael@0 1852 }
michael@0 1853
michael@0 1854 appendReplacement(dest, replacement, status);
michael@0 1855 appendTail(dest, status);
michael@0 1856
michael@0 1857 return dest;
michael@0 1858 }
michael@0 1859
michael@0 1860
michael@0 1861 //--------------------------------------------------------------------------------
michael@0 1862 //
michael@0 1863 // requireEnd
michael@0 1864 //
michael@0 1865 //--------------------------------------------------------------------------------
michael@0 1866 UBool RegexMatcher::requireEnd() const {
michael@0 1867 return fRequireEnd;
michael@0 1868 }
michael@0 1869
michael@0 1870
michael@0 1871 //--------------------------------------------------------------------------------
michael@0 1872 //
michael@0 1873 // reset
michael@0 1874 //
michael@0 1875 //--------------------------------------------------------------------------------
michael@0 1876 RegexMatcher &RegexMatcher::reset() {
michael@0 1877 fRegionStart = 0;
michael@0 1878 fRegionLimit = fInputLength;
michael@0 1879 fActiveStart = 0;
michael@0 1880 fActiveLimit = fInputLength;
michael@0 1881 fAnchorStart = 0;
michael@0 1882 fAnchorLimit = fInputLength;
michael@0 1883 fLookStart = 0;
michael@0 1884 fLookLimit = fInputLength;
michael@0 1885 resetPreserveRegion();
michael@0 1886 return *this;
michael@0 1887 }
michael@0 1888
michael@0 1889
michael@0 1890
michael@0 1891 void RegexMatcher::resetPreserveRegion() {
michael@0 1892 fMatchStart = 0;
michael@0 1893 fMatchEnd = 0;
michael@0 1894 fLastMatchEnd = -1;
michael@0 1895 fAppendPosition = 0;
michael@0 1896 fMatch = FALSE;
michael@0 1897 fHitEnd = FALSE;
michael@0 1898 fRequireEnd = FALSE;
michael@0 1899 fTime = 0;
michael@0 1900 fTickCounter = TIMER_INITIAL_VALUE;
michael@0 1901 //resetStack(); // more expensive than it looks...
michael@0 1902 }
michael@0 1903
michael@0 1904
michael@0 1905 RegexMatcher &RegexMatcher::reset(const UnicodeString &input) {
michael@0 1906 fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStatus);
michael@0 1907 if (fPattern->fNeedsAltInput) {
michael@0 1908 fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
michael@0 1909 }
michael@0 1910 fInputLength = utext_nativeLength(fInputText);
michael@0 1911
michael@0 1912 reset();
michael@0 1913 delete fInput;
michael@0 1914 fInput = NULL;
michael@0 1915
michael@0 1916 // Do the following for any UnicodeString.
michael@0 1917 // This is for compatibility for those clients who modify the input string "live" during regex operations.
michael@0 1918 fInputUniStrMaybeMutable = TRUE;
michael@0 1919
michael@0 1920 if (fWordBreakItr != NULL) {
michael@0 1921 #if UCONFIG_NO_BREAK_ITERATION==0
michael@0 1922 UErrorCode status = U_ZERO_ERROR;
michael@0 1923 fWordBreakItr->setText(fInputText, status);
michael@0 1924 #endif
michael@0 1925 }
michael@0 1926 return *this;
michael@0 1927 }
michael@0 1928
michael@0 1929
michael@0 1930 RegexMatcher &RegexMatcher::reset(UText *input) {
michael@0 1931 if (fInputText != input) {
michael@0 1932 fInputText = utext_clone(fInputText, input, FALSE, TRUE, &fDeferredStatus);
michael@0 1933 if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
michael@0 1934 fInputLength = utext_nativeLength(fInputText);
michael@0 1935
michael@0 1936 delete fInput;
michael@0 1937 fInput = NULL;
michael@0 1938
michael@0 1939 if (fWordBreakItr != NULL) {
michael@0 1940 #if UCONFIG_NO_BREAK_ITERATION==0
michael@0 1941 UErrorCode status = U_ZERO_ERROR;
michael@0 1942 fWordBreakItr->setText(input, status);
michael@0 1943 #endif
michael@0 1944 }
michael@0 1945 }
michael@0 1946 reset();
michael@0 1947 fInputUniStrMaybeMutable = FALSE;
michael@0 1948
michael@0 1949 return *this;
michael@0 1950 }
michael@0 1951
michael@0 1952 /*RegexMatcher &RegexMatcher::reset(const UChar *) {
michael@0 1953 fDeferredStatus = U_INTERNAL_PROGRAM_ERROR;
michael@0 1954 return *this;
michael@0 1955 }*/
michael@0 1956
michael@0 1957 RegexMatcher &RegexMatcher::reset(int64_t position, UErrorCode &status) {
michael@0 1958 if (U_FAILURE(status)) {
michael@0 1959 return *this;
michael@0 1960 }
michael@0 1961 reset(); // Reset also resets the region to be the entire string.
michael@0 1962
michael@0 1963 if (position < 0 || position > fActiveLimit) {
michael@0 1964 status = U_INDEX_OUTOFBOUNDS_ERROR;
michael@0 1965 return *this;
michael@0 1966 }
michael@0 1967 fMatchEnd = position;
michael@0 1968 return *this;
michael@0 1969 }
michael@0 1970
michael@0 1971
michael@0 1972 //--------------------------------------------------------------------------------
michael@0 1973 //
michael@0 1974 // refresh
michael@0 1975 //
michael@0 1976 //--------------------------------------------------------------------------------
michael@0 1977 RegexMatcher &RegexMatcher::refreshInputText(UText *input, UErrorCode &status) {
michael@0 1978 if (U_FAILURE(status)) {
michael@0 1979 return *this;
michael@0 1980 }
michael@0 1981 if (input == NULL) {
michael@0 1982 status = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 1983 return *this;
michael@0 1984 }
michael@0 1985 if (utext_nativeLength(fInputText) != utext_nativeLength(input)) {
michael@0 1986 status = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 1987 return *this;
michael@0 1988 }
michael@0 1989 int64_t pos = utext_getNativeIndex(fInputText);
michael@0 1990 // Shallow read-only clone of the new UText into the existing input UText
michael@0 1991 fInputText = utext_clone(fInputText, input, FALSE, TRUE, &status);
michael@0 1992 if (U_FAILURE(status)) {
michael@0 1993 return *this;
michael@0 1994 }
michael@0 1995 utext_setNativeIndex(fInputText, pos);
michael@0 1996
michael@0 1997 if (fAltInputText != NULL) {
michael@0 1998 pos = utext_getNativeIndex(fAltInputText);
michael@0 1999 fAltInputText = utext_clone(fAltInputText, input, FALSE, TRUE, &status);
michael@0 2000 if (U_FAILURE(status)) {
michael@0 2001 return *this;
michael@0 2002 }
michael@0 2003 utext_setNativeIndex(fAltInputText, pos);
michael@0 2004 }
michael@0 2005 return *this;
michael@0 2006 }
michael@0 2007
michael@0 2008
michael@0 2009
michael@0 2010 //--------------------------------------------------------------------------------
michael@0 2011 //
michael@0 2012 // setTrace
michael@0 2013 //
michael@0 2014 //--------------------------------------------------------------------------------
michael@0 2015 void RegexMatcher::setTrace(UBool state) {
michael@0 2016 fTraceDebug = state;
michael@0 2017 }
michael@0 2018
michael@0 2019
michael@0 2020
michael@0 2021 //---------------------------------------------------------------------
michael@0 2022 //
michael@0 2023 // split
michael@0 2024 //
michael@0 2025 //---------------------------------------------------------------------
michael@0 2026 int32_t RegexMatcher::split(const UnicodeString &input,
michael@0 2027 UnicodeString dest[],
michael@0 2028 int32_t destCapacity,
michael@0 2029 UErrorCode &status)
michael@0 2030 {
michael@0 2031 UText inputText = UTEXT_INITIALIZER;
michael@0 2032 utext_openConstUnicodeString(&inputText, &input, &status);
michael@0 2033 if (U_FAILURE(status)) {
michael@0 2034 return 0;
michael@0 2035 }
michael@0 2036
michael@0 2037 UText **destText = (UText **)uprv_malloc(sizeof(UText*)*destCapacity);
michael@0 2038 if (destText == NULL) {
michael@0 2039 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 2040 return 0;
michael@0 2041 }
michael@0 2042 int32_t i;
michael@0 2043 for (i = 0; i < destCapacity; i++) {
michael@0 2044 destText[i] = utext_openUnicodeString(NULL, &dest[i], &status);
michael@0 2045 }
michael@0 2046
michael@0 2047 int32_t fieldCount = split(&inputText, destText, destCapacity, status);
michael@0 2048
michael@0 2049 for (i = 0; i < destCapacity; i++) {
michael@0 2050 utext_close(destText[i]);
michael@0 2051 }
michael@0 2052
michael@0 2053 uprv_free(destText);
michael@0 2054 utext_close(&inputText);
michael@0 2055 return fieldCount;
michael@0 2056 }
michael@0 2057
michael@0 2058 //
michael@0 2059 // split, UText mode
michael@0 2060 //
michael@0 2061 int32_t RegexMatcher::split(UText *input,
michael@0 2062 UText *dest[],
michael@0 2063 int32_t destCapacity,
michael@0 2064 UErrorCode &status)
michael@0 2065 {
michael@0 2066 //
michael@0 2067 // Check arguements for validity
michael@0 2068 //
michael@0 2069 if (U_FAILURE(status)) {
michael@0 2070 return 0;
michael@0 2071 };
michael@0 2072
michael@0 2073 if (destCapacity < 1) {
michael@0 2074 status = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 2075 return 0;
michael@0 2076 }
michael@0 2077
michael@0 2078 //
michael@0 2079 // Reset for the input text
michael@0 2080 //
michael@0 2081 reset(input);
michael@0 2082 int64_t nextOutputStringStart = 0;
michael@0 2083 if (fActiveLimit == 0) {
michael@0 2084 return 0;
michael@0 2085 }
michael@0 2086
michael@0 2087 //
michael@0 2088 // Loop through the input text, searching for the delimiter pattern
michael@0 2089 //
michael@0 2090 int32_t i;
michael@0 2091 int32_t numCaptureGroups = fPattern->fGroupMap->size();
michael@0 2092 for (i=0; ; i++) {
michael@0 2093 if (i>=destCapacity-1) {
michael@0 2094 // There is one or zero output string left.
michael@0 2095 // Fill the last output string with whatever is left from the input, then exit the loop.
michael@0 2096 // ( i will be == destCapacity if we filled the output array while processing
michael@0 2097 // capture groups of the delimiter expression, in which case we will discard the
michael@0 2098 // last capture group saved in favor of the unprocessed remainder of the
michael@0 2099 // input string.)
michael@0 2100 i = destCapacity-1;
michael@0 2101 if (fActiveLimit > nextOutputStringStart) {
michael@0 2102 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
michael@0 2103 if (dest[i]) {
michael@0 2104 utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
michael@0 2105 input->chunkContents+nextOutputStringStart,
michael@0 2106 (int32_t)(fActiveLimit-nextOutputStringStart), &status);
michael@0 2107 } else {
michael@0 2108 UText remainingText = UTEXT_INITIALIZER;
michael@0 2109 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
michael@0 2110 fActiveLimit-nextOutputStringStart, &status);
michael@0 2111 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
michael@0 2112 utext_close(&remainingText);
michael@0 2113 }
michael@0 2114 } else {
michael@0 2115 UErrorCode lengthStatus = U_ZERO_ERROR;
michael@0 2116 int32_t remaining16Length =
michael@0 2117 utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
michael@0 2118 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
michael@0 2119 if (remainingChars == NULL) {
michael@0 2120 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 2121 break;
michael@0 2122 }
michael@0 2123
michael@0 2124 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
michael@0 2125 if (dest[i]) {
michael@0 2126 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
michael@0 2127 } else {
michael@0 2128 UText remainingText = UTEXT_INITIALIZER;
michael@0 2129 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
michael@0 2130 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
michael@0 2131 utext_close(&remainingText);
michael@0 2132 }
michael@0 2133
michael@0 2134 uprv_free(remainingChars);
michael@0 2135 }
michael@0 2136 }
michael@0 2137 break;
michael@0 2138 }
michael@0 2139 if (find()) {
michael@0 2140 // We found another delimiter. Move everything from where we started looking
michael@0 2141 // up until the start of the delimiter into the next output string.
michael@0 2142 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
michael@0 2143 if (dest[i]) {
michael@0 2144 utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
michael@0 2145 input->chunkContents+nextOutputStringStart,
michael@0 2146 (int32_t)(fMatchStart-nextOutputStringStart), &status);
michael@0 2147 } else {
michael@0 2148 UText remainingText = UTEXT_INITIALIZER;
michael@0 2149 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
michael@0 2150 fMatchStart-nextOutputStringStart, &status);
michael@0 2151 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
michael@0 2152 utext_close(&remainingText);
michael@0 2153 }
michael@0 2154 } else {
michael@0 2155 UErrorCode lengthStatus = U_ZERO_ERROR;
michael@0 2156 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fMatchStart, NULL, 0, &lengthStatus);
michael@0 2157 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
michael@0 2158 if (remainingChars == NULL) {
michael@0 2159 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 2160 break;
michael@0 2161 }
michael@0 2162 utext_extract(input, nextOutputStringStart, fMatchStart, remainingChars, remaining16Length+1, &status);
michael@0 2163 if (dest[i]) {
michael@0 2164 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
michael@0 2165 } else {
michael@0 2166 UText remainingText = UTEXT_INITIALIZER;
michael@0 2167 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
michael@0 2168 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
michael@0 2169 utext_close(&remainingText);
michael@0 2170 }
michael@0 2171
michael@0 2172 uprv_free(remainingChars);
michael@0 2173 }
michael@0 2174 nextOutputStringStart = fMatchEnd;
michael@0 2175
michael@0 2176 // If the delimiter pattern has capturing parentheses, the captured
michael@0 2177 // text goes out into the next n destination strings.
michael@0 2178 int32_t groupNum;
michael@0 2179 for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
michael@0 2180 if (i >= destCapacity-2) {
michael@0 2181 // Never fill the last available output string with capture group text.
michael@0 2182 // It will filled with the last field, the remainder of the
michael@0 2183 // unsplit input text.
michael@0 2184 break;
michael@0 2185 }
michael@0 2186 i++;
michael@0 2187 dest[i] = group(groupNum, dest[i], status);
michael@0 2188 }
michael@0 2189
michael@0 2190 if (nextOutputStringStart == fActiveLimit) {
michael@0 2191 // The delimiter was at the end of the string. We're done, but first
michael@0 2192 // we output one last empty string, for the empty field following
michael@0 2193 // the delimiter at the end of input.
michael@0 2194 if (i+1 < destCapacity) {
michael@0 2195 ++i;
michael@0 2196 if (dest[i] == NULL) {
michael@0 2197 dest[i] = utext_openUChars(NULL, NULL, 0, &status);
michael@0 2198 } else {
michael@0 2199 static UChar emptyString[] = {(UChar)0};
michael@0 2200 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), emptyString, 0, &status);
michael@0 2201 }
michael@0 2202 }
michael@0 2203 break;
michael@0 2204
michael@0 2205 }
michael@0 2206 }
michael@0 2207 else
michael@0 2208 {
michael@0 2209 // We ran off the end of the input while looking for the next delimiter.
michael@0 2210 // All the remaining text goes into the current output string.
michael@0 2211 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
michael@0 2212 if (dest[i]) {
michael@0 2213 utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
michael@0 2214 input->chunkContents+nextOutputStringStart,
michael@0 2215 (int32_t)(fActiveLimit-nextOutputStringStart), &status);
michael@0 2216 } else {
michael@0 2217 UText remainingText = UTEXT_INITIALIZER;
michael@0 2218 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
michael@0 2219 fActiveLimit-nextOutputStringStart, &status);
michael@0 2220 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
michael@0 2221 utext_close(&remainingText);
michael@0 2222 }
michael@0 2223 } else {
michael@0 2224 UErrorCode lengthStatus = U_ZERO_ERROR;
michael@0 2225 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
michael@0 2226 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
michael@0 2227 if (remainingChars == NULL) {
michael@0 2228 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 2229 break;
michael@0 2230 }
michael@0 2231
michael@0 2232 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
michael@0 2233 if (dest[i]) {
michael@0 2234 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
michael@0 2235 } else {
michael@0 2236 UText remainingText = UTEXT_INITIALIZER;
michael@0 2237 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
michael@0 2238 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
michael@0 2239 utext_close(&remainingText);
michael@0 2240 }
michael@0 2241
michael@0 2242 uprv_free(remainingChars);
michael@0 2243 }
michael@0 2244 break;
michael@0 2245 }
michael@0 2246 if (U_FAILURE(status)) {
michael@0 2247 break;
michael@0 2248 }
michael@0 2249 } // end of for loop
michael@0 2250 return i+1;
michael@0 2251 }
michael@0 2252
michael@0 2253
michael@0 2254 //--------------------------------------------------------------------------------
michael@0 2255 //
michael@0 2256 // start
michael@0 2257 //
michael@0 2258 //--------------------------------------------------------------------------------
michael@0 2259 int32_t RegexMatcher::start(UErrorCode &status) const {
michael@0 2260 return start(0, status);
michael@0 2261 }
michael@0 2262
michael@0 2263 int64_t RegexMatcher::start64(UErrorCode &status) const {
michael@0 2264 return start64(0, status);
michael@0 2265 }
michael@0 2266
michael@0 2267 //--------------------------------------------------------------------------------
michael@0 2268 //
michael@0 2269 // start(int32_t group, UErrorCode &status)
michael@0 2270 //
michael@0 2271 //--------------------------------------------------------------------------------
michael@0 2272
michael@0 2273 int64_t RegexMatcher::start64(int32_t group, UErrorCode &status) const {
michael@0 2274 if (U_FAILURE(status)) {
michael@0 2275 return -1;
michael@0 2276 }
michael@0 2277 if (U_FAILURE(fDeferredStatus)) {
michael@0 2278 status = fDeferredStatus;
michael@0 2279 return -1;
michael@0 2280 }
michael@0 2281 if (fMatch == FALSE) {
michael@0 2282 status = U_REGEX_INVALID_STATE;
michael@0 2283 return -1;
michael@0 2284 }
michael@0 2285 if (group < 0 || group > fPattern->fGroupMap->size()) {
michael@0 2286 status = U_INDEX_OUTOFBOUNDS_ERROR;
michael@0 2287 return -1;
michael@0 2288 }
michael@0 2289 int64_t s;
michael@0 2290 if (group == 0) {
michael@0 2291 s = fMatchStart;
michael@0 2292 } else {
michael@0 2293 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
michael@0 2294 U_ASSERT(groupOffset < fPattern->fFrameSize);
michael@0 2295 U_ASSERT(groupOffset >= 0);
michael@0 2296 s = fFrame->fExtra[groupOffset];
michael@0 2297 }
michael@0 2298
michael@0 2299 return s;
michael@0 2300 }
michael@0 2301
michael@0 2302
michael@0 2303 int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const {
michael@0 2304 return (int32_t)start64(group, status);
michael@0 2305 }
michael@0 2306
michael@0 2307 //--------------------------------------------------------------------------------
michael@0 2308 //
michael@0 2309 // useAnchoringBounds
michael@0 2310 //
michael@0 2311 //--------------------------------------------------------------------------------
michael@0 2312 RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) {
michael@0 2313 fAnchoringBounds = b;
michael@0 2314 fAnchorStart = (fAnchoringBounds ? fRegionStart : 0);
michael@0 2315 fAnchorLimit = (fAnchoringBounds ? fRegionLimit : fInputLength);
michael@0 2316 return *this;
michael@0 2317 }
michael@0 2318
michael@0 2319
michael@0 2320 //--------------------------------------------------------------------------------
michael@0 2321 //
michael@0 2322 // useTransparentBounds
michael@0 2323 //
michael@0 2324 //--------------------------------------------------------------------------------
michael@0 2325 RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) {
michael@0 2326 fTransparentBounds = b;
michael@0 2327 fLookStart = (fTransparentBounds ? 0 : fRegionStart);
michael@0 2328 fLookLimit = (fTransparentBounds ? fInputLength : fRegionLimit);
michael@0 2329 return *this;
michael@0 2330 }
michael@0 2331
michael@0 2332 //--------------------------------------------------------------------------------
michael@0 2333 //
michael@0 2334 // setTimeLimit
michael@0 2335 //
michael@0 2336 //--------------------------------------------------------------------------------
michael@0 2337 void RegexMatcher::setTimeLimit(int32_t limit, UErrorCode &status) {
michael@0 2338 if (U_FAILURE(status)) {
michael@0 2339 return;
michael@0 2340 }
michael@0 2341 if (U_FAILURE(fDeferredStatus)) {
michael@0 2342 status = fDeferredStatus;
michael@0 2343 return;
michael@0 2344 }
michael@0 2345 if (limit < 0) {
michael@0 2346 status = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 2347 return;
michael@0 2348 }
michael@0 2349 fTimeLimit = limit;
michael@0 2350 }
michael@0 2351
michael@0 2352
michael@0 2353 //--------------------------------------------------------------------------------
michael@0 2354 //
michael@0 2355 // getTimeLimit
michael@0 2356 //
michael@0 2357 //--------------------------------------------------------------------------------
michael@0 2358 int32_t RegexMatcher::getTimeLimit() const {
michael@0 2359 return fTimeLimit;
michael@0 2360 }
michael@0 2361
michael@0 2362
michael@0 2363 //--------------------------------------------------------------------------------
michael@0 2364 //
michael@0 2365 // setStackLimit
michael@0 2366 //
michael@0 2367 //--------------------------------------------------------------------------------
michael@0 2368 void RegexMatcher::setStackLimit(int32_t limit, UErrorCode &status) {
michael@0 2369 if (U_FAILURE(status)) {
michael@0 2370 return;
michael@0 2371 }
michael@0 2372 if (U_FAILURE(fDeferredStatus)) {
michael@0 2373 status = fDeferredStatus;
michael@0 2374 return;
michael@0 2375 }
michael@0 2376 if (limit < 0) {
michael@0 2377 status = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 2378 return;
michael@0 2379 }
michael@0 2380
michael@0 2381 // Reset the matcher. This is needed here in case there is a current match
michael@0 2382 // whose final stack frame (containing the match results, pointed to by fFrame)
michael@0 2383 // would be lost by resizing to a smaller stack size.
michael@0 2384 reset();
michael@0 2385
michael@0 2386 if (limit == 0) {
michael@0 2387 // Unlimited stack expansion
michael@0 2388 fStack->setMaxCapacity(0);
michael@0 2389 } else {
michael@0 2390 // Change the units of the limit from bytes to ints, and bump the size up
michael@0 2391 // to be big enough to hold at least one stack frame for the pattern,
michael@0 2392 // if it isn't there already.
michael@0 2393 int32_t adjustedLimit = limit / sizeof(int32_t);
michael@0 2394 if (adjustedLimit < fPattern->fFrameSize) {
michael@0 2395 adjustedLimit = fPattern->fFrameSize;
michael@0 2396 }
michael@0 2397 fStack->setMaxCapacity(adjustedLimit);
michael@0 2398 }
michael@0 2399 fStackLimit = limit;
michael@0 2400 }
michael@0 2401
michael@0 2402
michael@0 2403 //--------------------------------------------------------------------------------
michael@0 2404 //
michael@0 2405 // getStackLimit
michael@0 2406 //
michael@0 2407 //--------------------------------------------------------------------------------
michael@0 2408 int32_t RegexMatcher::getStackLimit() const {
michael@0 2409 return fStackLimit;
michael@0 2410 }
michael@0 2411
michael@0 2412
michael@0 2413 //--------------------------------------------------------------------------------
michael@0 2414 //
michael@0 2415 // setMatchCallback
michael@0 2416 //
michael@0 2417 //--------------------------------------------------------------------------------
michael@0 2418 void RegexMatcher::setMatchCallback(URegexMatchCallback *callback,
michael@0 2419 const void *context,
michael@0 2420 UErrorCode &status) {
michael@0 2421 if (U_FAILURE(status)) {
michael@0 2422 return;
michael@0 2423 }
michael@0 2424 fCallbackFn = callback;
michael@0 2425 fCallbackContext = context;
michael@0 2426 }
michael@0 2427
michael@0 2428
michael@0 2429 //--------------------------------------------------------------------------------
michael@0 2430 //
michael@0 2431 // getMatchCallback
michael@0 2432 //
michael@0 2433 //--------------------------------------------------------------------------------
michael@0 2434 void RegexMatcher::getMatchCallback(URegexMatchCallback *&callback,
michael@0 2435 const void *&context,
michael@0 2436 UErrorCode &status) {
michael@0 2437 if (U_FAILURE(status)) {
michael@0 2438 return;
michael@0 2439 }
michael@0 2440 callback = fCallbackFn;
michael@0 2441 context = fCallbackContext;
michael@0 2442 }
michael@0 2443
michael@0 2444
michael@0 2445 //--------------------------------------------------------------------------------
michael@0 2446 //
michael@0 2447 // setMatchCallback
michael@0 2448 //
michael@0 2449 //--------------------------------------------------------------------------------
michael@0 2450 void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback *callback,
michael@0 2451 const void *context,
michael@0 2452 UErrorCode &status) {
michael@0 2453 if (U_FAILURE(status)) {
michael@0 2454 return;
michael@0 2455 }
michael@0 2456 fFindProgressCallbackFn = callback;
michael@0 2457 fFindProgressCallbackContext = context;
michael@0 2458 }
michael@0 2459
michael@0 2460
michael@0 2461 //--------------------------------------------------------------------------------
michael@0 2462 //
michael@0 2463 // getMatchCallback
michael@0 2464 //
michael@0 2465 //--------------------------------------------------------------------------------
michael@0 2466 void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback *&callback,
michael@0 2467 const void *&context,
michael@0 2468 UErrorCode &status) {
michael@0 2469 if (U_FAILURE(status)) {
michael@0 2470 return;
michael@0 2471 }
michael@0 2472 callback = fFindProgressCallbackFn;
michael@0 2473 context = fFindProgressCallbackContext;
michael@0 2474 }
michael@0 2475
michael@0 2476
michael@0 2477 //================================================================================
michael@0 2478 //
michael@0 2479 // Code following this point in this file is the internal
michael@0 2480 // Match Engine Implementation.
michael@0 2481 //
michael@0 2482 //================================================================================
michael@0 2483
michael@0 2484
michael@0 2485 //--------------------------------------------------------------------------------
michael@0 2486 //
michael@0 2487 // resetStack
michael@0 2488 // Discard any previous contents of the state save stack, and initialize a
michael@0 2489 // new stack frame to all -1. The -1s are needed for capture group limits,
michael@0 2490 // where they indicate that a group has not yet matched anything.
michael@0 2491 //--------------------------------------------------------------------------------
michael@0 2492 REStackFrame *RegexMatcher::resetStack() {
michael@0 2493 // Discard any previous contents of the state save stack, and initialize a
michael@0 2494 // new stack frame with all -1 data. The -1s are needed for capture group limits,
michael@0 2495 // where they indicate that a group has not yet matched anything.
michael@0 2496 fStack->removeAllElements();
michael@0 2497
michael@0 2498 REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus);
michael@0 2499 int32_t i;
michael@0 2500 for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) {
michael@0 2501 iFrame->fExtra[i] = -1;
michael@0 2502 }
michael@0 2503 return iFrame;
michael@0 2504 }
michael@0 2505
michael@0 2506
michael@0 2507
michael@0 2508 //--------------------------------------------------------------------------------
michael@0 2509 //
michael@0 2510 // isWordBoundary
michael@0 2511 // in perl, "xab..cd..", \b is true at positions 0,3,5,7
michael@0 2512 // For us,
michael@0 2513 // If the current char is a combining mark,
michael@0 2514 // \b is FALSE.
michael@0 2515 // Else Scan backwards to the first non-combining char.
michael@0 2516 // We are at a boundary if the this char and the original chars are
michael@0 2517 // opposite in membership in \w set
michael@0 2518 //
michael@0 2519 // parameters: pos - the current position in the input buffer
michael@0 2520 //
michael@0 2521 // TODO: double-check edge cases at region boundaries.
michael@0 2522 //
michael@0 2523 //--------------------------------------------------------------------------------
michael@0 2524 UBool RegexMatcher::isWordBoundary(int64_t pos) {
michael@0 2525 UBool isBoundary = FALSE;
michael@0 2526 UBool cIsWord = FALSE;
michael@0 2527
michael@0 2528 if (pos >= fLookLimit) {
michael@0 2529 fHitEnd = TRUE;
michael@0 2530 } else {
michael@0 2531 // Determine whether char c at current position is a member of the word set of chars.
michael@0 2532 // If we're off the end of the string, behave as though we're not at a word char.
michael@0 2533 UTEXT_SETNATIVEINDEX(fInputText, pos);
michael@0 2534 UChar32 c = UTEXT_CURRENT32(fInputText);
michael@0 2535 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
michael@0 2536 // Current char is a combining one. Not a boundary.
michael@0 2537 return FALSE;
michael@0 2538 }
michael@0 2539 cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
michael@0 2540 }
michael@0 2541
michael@0 2542 // Back up until we come to a non-combining char, determine whether
michael@0 2543 // that char is a word char.
michael@0 2544 UBool prevCIsWord = FALSE;
michael@0 2545 for (;;) {
michael@0 2546 if (UTEXT_GETNATIVEINDEX(fInputText) <= fLookStart) {
michael@0 2547 break;
michael@0 2548 }
michael@0 2549 UChar32 prevChar = UTEXT_PREVIOUS32(fInputText);
michael@0 2550 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
michael@0 2551 || u_charType(prevChar) == U_FORMAT_CHAR)) {
michael@0 2552 prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
michael@0 2553 break;
michael@0 2554 }
michael@0 2555 }
michael@0 2556 isBoundary = cIsWord ^ prevCIsWord;
michael@0 2557 return isBoundary;
michael@0 2558 }
michael@0 2559
michael@0 2560 UBool RegexMatcher::isChunkWordBoundary(int32_t pos) {
michael@0 2561 UBool isBoundary = FALSE;
michael@0 2562 UBool cIsWord = FALSE;
michael@0 2563
michael@0 2564 const UChar *inputBuf = fInputText->chunkContents;
michael@0 2565
michael@0 2566 if (pos >= fLookLimit) {
michael@0 2567 fHitEnd = TRUE;
michael@0 2568 } else {
michael@0 2569 // Determine whether char c at current position is a member of the word set of chars.
michael@0 2570 // If we're off the end of the string, behave as though we're not at a word char.
michael@0 2571 UChar32 c;
michael@0 2572 U16_GET(inputBuf, fLookStart, pos, fLookLimit, c);
michael@0 2573 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
michael@0 2574 // Current char is a combining one. Not a boundary.
michael@0 2575 return FALSE;
michael@0 2576 }
michael@0 2577 cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
michael@0 2578 }
michael@0 2579
michael@0 2580 // Back up until we come to a non-combining char, determine whether
michael@0 2581 // that char is a word char.
michael@0 2582 UBool prevCIsWord = FALSE;
michael@0 2583 for (;;) {
michael@0 2584 if (pos <= fLookStart) {
michael@0 2585 break;
michael@0 2586 }
michael@0 2587 UChar32 prevChar;
michael@0 2588 U16_PREV(inputBuf, fLookStart, pos, prevChar);
michael@0 2589 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
michael@0 2590 || u_charType(prevChar) == U_FORMAT_CHAR)) {
michael@0 2591 prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
michael@0 2592 break;
michael@0 2593 }
michael@0 2594 }
michael@0 2595 isBoundary = cIsWord ^ prevCIsWord;
michael@0 2596 return isBoundary;
michael@0 2597 }
michael@0 2598
michael@0 2599 //--------------------------------------------------------------------------------
michael@0 2600 //
michael@0 2601 // isUWordBoundary
michael@0 2602 //
michael@0 2603 // Test for a word boundary using RBBI word break.
michael@0 2604 //
michael@0 2605 // parameters: pos - the current position in the input buffer
michael@0 2606 //
michael@0 2607 //--------------------------------------------------------------------------------
michael@0 2608 UBool RegexMatcher::isUWordBoundary(int64_t pos) {
michael@0 2609 UBool returnVal = FALSE;
michael@0 2610 #if UCONFIG_NO_BREAK_ITERATION==0
michael@0 2611
michael@0 2612 // If we haven't yet created a break iterator for this matcher, do it now.
michael@0 2613 if (fWordBreakItr == NULL) {
michael@0 2614 fWordBreakItr =
michael@0 2615 (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), fDeferredStatus);
michael@0 2616 if (U_FAILURE(fDeferredStatus)) {
michael@0 2617 return FALSE;
michael@0 2618 }
michael@0 2619 fWordBreakItr->setText(fInputText, fDeferredStatus);
michael@0 2620 }
michael@0 2621
michael@0 2622 if (pos >= fLookLimit) {
michael@0 2623 fHitEnd = TRUE;
michael@0 2624 returnVal = TRUE; // With Unicode word rules, only positions within the interior of "real"
michael@0 2625 // words are not boundaries. All non-word chars stand by themselves,
michael@0 2626 // with word boundaries on both sides.
michael@0 2627 } else {
michael@0 2628 if (!UTEXT_USES_U16(fInputText)) {
michael@0 2629 // !!!: Would like a better way to do this!
michael@0 2630 UErrorCode status = U_ZERO_ERROR;
michael@0 2631 pos = utext_extract(fInputText, 0, pos, NULL, 0, &status);
michael@0 2632 }
michael@0 2633 returnVal = fWordBreakItr->isBoundary((int32_t)pos);
michael@0 2634 }
michael@0 2635 #endif
michael@0 2636 return returnVal;
michael@0 2637 }
michael@0 2638
michael@0 2639 //--------------------------------------------------------------------------------
michael@0 2640 //
michael@0 2641 // IncrementTime This function is called once each TIMER_INITIAL_VALUE state
michael@0 2642 // saves. Increment the "time" counter, and call the
michael@0 2643 // user callback function if there is one installed.
michael@0 2644 //
michael@0 2645 // If the match operation needs to be aborted, either for a time-out
michael@0 2646 // or because the user callback asked for it, just set an error status.
michael@0 2647 // The engine will pick that up and stop in its outer loop.
michael@0 2648 //
michael@0 2649 //--------------------------------------------------------------------------------
michael@0 2650 void RegexMatcher::IncrementTime(UErrorCode &status) {
michael@0 2651 fTickCounter = TIMER_INITIAL_VALUE;
michael@0 2652 fTime++;
michael@0 2653 if (fCallbackFn != NULL) {
michael@0 2654 if ((*fCallbackFn)(fCallbackContext, fTime) == FALSE) {
michael@0 2655 status = U_REGEX_STOPPED_BY_CALLER;
michael@0 2656 return;
michael@0 2657 }
michael@0 2658 }
michael@0 2659 if (fTimeLimit > 0 && fTime >= fTimeLimit) {
michael@0 2660 status = U_REGEX_TIME_OUT;
michael@0 2661 }
michael@0 2662 }
michael@0 2663
michael@0 2664 //--------------------------------------------------------------------------------
michael@0 2665 //
michael@0 2666 // ReportFindProgress This function is called once for each advance in the target
michael@0 2667 // string from the find() function, and calls the user progress callback
michael@0 2668 // function if there is one installed.
michael@0 2669 //
michael@0 2670 // NOTE:
michael@0 2671 //
michael@0 2672 // If the match operation needs to be aborted because the user
michael@0 2673 // callback asked for it, just set an error status.
michael@0 2674 // The engine will pick that up and stop in its outer loop.
michael@0 2675 //
michael@0 2676 //--------------------------------------------------------------------------------
michael@0 2677 UBool RegexMatcher::ReportFindProgress(int64_t matchIndex, UErrorCode &status) {
michael@0 2678 if (fFindProgressCallbackFn != NULL) {
michael@0 2679 if ((*fFindProgressCallbackFn)(fFindProgressCallbackContext, matchIndex) == FALSE) {
michael@0 2680 status = U_ZERO_ERROR /*U_REGEX_STOPPED_BY_CALLER*/;
michael@0 2681 return FALSE;
michael@0 2682 }
michael@0 2683 }
michael@0 2684 return TRUE;
michael@0 2685 }
michael@0 2686
michael@0 2687 //--------------------------------------------------------------------------------
michael@0 2688 //
michael@0 2689 // StateSave
michael@0 2690 // Make a new stack frame, initialized as a copy of the current stack frame.
michael@0 2691 // Set the pattern index in the original stack frame from the operand value
michael@0 2692 // in the opcode. Execution of the engine continues with the state in
michael@0 2693 // the newly created stack frame
michael@0 2694 //
michael@0 2695 // Note that reserveBlock() may grow the stack, resulting in the
michael@0 2696 // whole thing being relocated in memory.
michael@0 2697 //
michael@0 2698 // Parameters:
michael@0 2699 // fp The top frame pointer when called. At return, a new
michael@0 2700 // fame will be present
michael@0 2701 // savePatIdx An index into the compiled pattern. Goes into the original
michael@0 2702 // (not new) frame. If execution ever back-tracks out of the
michael@0 2703 // new frame, this will be where we continue from in the pattern.
michael@0 2704 // Return
michael@0 2705 // The new frame pointer.
michael@0 2706 //
michael@0 2707 //--------------------------------------------------------------------------------
michael@0 2708 inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status) {
michael@0 2709 // push storage for a new frame.
michael@0 2710 int64_t *newFP = fStack->reserveBlock(fFrameSize, status);
michael@0 2711 if (newFP == NULL) {
michael@0 2712 // Failure on attempted stack expansion.
michael@0 2713 // Stack function set some other error code, change it to a more
michael@0 2714 // specific one for regular expressions.
michael@0 2715 status = U_REGEX_STACK_OVERFLOW;
michael@0 2716 // We need to return a writable stack frame, so just return the
michael@0 2717 // previous frame. The match operation will stop quickly
michael@0 2718 // because of the error status, after which the frame will never
michael@0 2719 // be looked at again.
michael@0 2720 return fp;
michael@0 2721 }
michael@0 2722 fp = (REStackFrame *)(newFP - fFrameSize); // in case of realloc of stack.
michael@0 2723
michael@0 2724 // New stack frame = copy of old top frame.
michael@0 2725 int64_t *source = (int64_t *)fp;
michael@0 2726 int64_t *dest = newFP;
michael@0 2727 for (;;) {
michael@0 2728 *dest++ = *source++;
michael@0 2729 if (source == newFP) {
michael@0 2730 break;
michael@0 2731 }
michael@0 2732 }
michael@0 2733
michael@0 2734 fTickCounter--;
michael@0 2735 if (fTickCounter <= 0) {
michael@0 2736 IncrementTime(status); // Re-initializes fTickCounter
michael@0 2737 }
michael@0 2738 fp->fPatIdx = savePatIdx;
michael@0 2739 return (REStackFrame *)newFP;
michael@0 2740 }
michael@0 2741
michael@0 2742
michael@0 2743 //--------------------------------------------------------------------------------
michael@0 2744 //
michael@0 2745 // MatchAt This is the actual matching engine.
michael@0 2746 //
michael@0 2747 // startIdx: begin matching at this index.
michael@0 2748 // toEnd: if true, match must extend to end of the input region
michael@0 2749 //
michael@0 2750 //--------------------------------------------------------------------------------
michael@0 2751 void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
michael@0 2752 UBool isMatch = FALSE; // True if the we have a match.
michael@0 2753
michael@0 2754 int64_t backSearchIndex = U_INT64_MAX; // used after greedy single-character matches for searching backwards
michael@0 2755
michael@0 2756 int32_t op; // Operation from the compiled pattern, split into
michael@0 2757 int32_t opType; // the opcode
michael@0 2758 int32_t opValue; // and the operand value.
michael@0 2759
michael@0 2760 #ifdef REGEX_RUN_DEBUG
michael@0 2761 if (fTraceDebug)
michael@0 2762 {
michael@0 2763 printf("MatchAt(startIdx=%ld)\n", startIdx);
michael@0 2764 printf("Original Pattern: ");
michael@0 2765 UChar32 c = utext_next32From(fPattern->fPattern, 0);
michael@0 2766 while (c != U_SENTINEL) {
michael@0 2767 if (c<32 || c>256) {
michael@0 2768 c = '.';
michael@0 2769 }
michael@0 2770 REGEX_DUMP_DEBUG_PRINTF(("%c", c));
michael@0 2771
michael@0 2772 c = UTEXT_NEXT32(fPattern->fPattern);
michael@0 2773 }
michael@0 2774 printf("\n");
michael@0 2775 printf("Input String: ");
michael@0 2776 c = utext_next32From(fInputText, 0);
michael@0 2777 while (c != U_SENTINEL) {
michael@0 2778 if (c<32 || c>256) {
michael@0 2779 c = '.';
michael@0 2780 }
michael@0 2781 printf("%c", c);
michael@0 2782
michael@0 2783 c = UTEXT_NEXT32(fInputText);
michael@0 2784 }
michael@0 2785 printf("\n");
michael@0 2786 printf("\n");
michael@0 2787 }
michael@0 2788 #endif
michael@0 2789
michael@0 2790 if (U_FAILURE(status)) {
michael@0 2791 return;
michael@0 2792 }
michael@0 2793
michael@0 2794 // Cache frequently referenced items from the compiled pattern
michael@0 2795 //
michael@0 2796 int64_t *pat = fPattern->fCompiledPat->getBuffer();
michael@0 2797
michael@0 2798 const UChar *litText = fPattern->fLiteralText.getBuffer();
michael@0 2799 UVector *sets = fPattern->fSets;
michael@0 2800
michael@0 2801 fFrameSize = fPattern->fFrameSize;
michael@0 2802 REStackFrame *fp = resetStack();
michael@0 2803
michael@0 2804 fp->fPatIdx = 0;
michael@0 2805 fp->fInputIdx = startIdx;
michael@0 2806
michael@0 2807 // Zero out the pattern's static data
michael@0 2808 int32_t i;
michael@0 2809 for (i = 0; i<fPattern->fDataSize; i++) {
michael@0 2810 fData[i] = 0;
michael@0 2811 }
michael@0 2812
michael@0 2813 //
michael@0 2814 // Main loop for interpreting the compiled pattern.
michael@0 2815 // One iteration of the loop per pattern operation performed.
michael@0 2816 //
michael@0 2817 for (;;) {
michael@0 2818 #if 0
michael@0 2819 if (_heapchk() != _HEAPOK) {
michael@0 2820 fprintf(stderr, "Heap Trouble\n");
michael@0 2821 }
michael@0 2822 #endif
michael@0 2823
michael@0 2824 op = (int32_t)pat[fp->fPatIdx];
michael@0 2825 opType = URX_TYPE(op);
michael@0 2826 opValue = URX_VAL(op);
michael@0 2827 #ifdef REGEX_RUN_DEBUG
michael@0 2828 if (fTraceDebug) {
michael@0 2829 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
michael@0 2830 printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp->fInputIdx,
michael@0 2831 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
michael@0 2832 fPattern->dumpOp(fp->fPatIdx);
michael@0 2833 }
michael@0 2834 #endif
michael@0 2835 fp->fPatIdx++;
michael@0 2836
michael@0 2837 switch (opType) {
michael@0 2838
michael@0 2839
michael@0 2840 case URX_NOP:
michael@0 2841 break;
michael@0 2842
michael@0 2843
michael@0 2844 case URX_BACKTRACK:
michael@0 2845 // Force a backtrack. In some circumstances, the pattern compiler
michael@0 2846 // will notice that the pattern can't possibly match anything, and will
michael@0 2847 // emit one of these at that point.
michael@0 2848 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 2849 break;
michael@0 2850
michael@0 2851
michael@0 2852 case URX_ONECHAR:
michael@0 2853 if (fp->fInputIdx < fActiveLimit) {
michael@0 2854 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
michael@0 2855 UChar32 c = UTEXT_NEXT32(fInputText);
michael@0 2856 if (c == opValue) {
michael@0 2857 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
michael@0 2858 break;
michael@0 2859 }
michael@0 2860 } else {
michael@0 2861 fHitEnd = TRUE;
michael@0 2862 }
michael@0 2863 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 2864 break;
michael@0 2865
michael@0 2866
michael@0 2867 case URX_STRING:
michael@0 2868 {
michael@0 2869 // Test input against a literal string.
michael@0 2870 // Strings require two slots in the compiled pattern, one for the
michael@0 2871 // offset to the string text, and one for the length.
michael@0 2872
michael@0 2873 int32_t stringStartIdx = opValue;
michael@0 2874 op = (int32_t)pat[fp->fPatIdx]; // Fetch the second operand
michael@0 2875 fp->fPatIdx++;
michael@0 2876 opType = URX_TYPE(op);
michael@0 2877 int32_t stringLen = URX_VAL(op);
michael@0 2878 U_ASSERT(opType == URX_STRING_LEN);
michael@0 2879 U_ASSERT(stringLen >= 2);
michael@0 2880
michael@0 2881 const UChar *patternString = litText+stringStartIdx;
michael@0 2882 int32_t patternStringIndex = 0;
michael@0 2883 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
michael@0 2884 UChar32 inputChar;
michael@0 2885 UChar32 patternChar;
michael@0 2886 UBool success = TRUE;
michael@0 2887 while (patternStringIndex < stringLen) {
michael@0 2888 if (UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
michael@0 2889 success = FALSE;
michael@0 2890 fHitEnd = TRUE;
michael@0 2891 break;
michael@0 2892 }
michael@0 2893 inputChar = UTEXT_NEXT32(fInputText);
michael@0 2894 U16_NEXT(patternString, patternStringIndex, stringLen, patternChar);
michael@0 2895 if (patternChar != inputChar) {
michael@0 2896 success = FALSE;
michael@0 2897 break;
michael@0 2898 }
michael@0 2899 }
michael@0 2900
michael@0 2901 if (success) {
michael@0 2902 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
michael@0 2903 } else {
michael@0 2904 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 2905 }
michael@0 2906 }
michael@0 2907 break;
michael@0 2908
michael@0 2909
michael@0 2910 case URX_STATE_SAVE:
michael@0 2911 fp = StateSave(fp, opValue, status);
michael@0 2912 break;
michael@0 2913
michael@0 2914
michael@0 2915 case URX_END:
michael@0 2916 // The match loop will exit via this path on a successful match,
michael@0 2917 // when we reach the end of the pattern.
michael@0 2918 if (toEnd && fp->fInputIdx != fActiveLimit) {
michael@0 2919 // The pattern matched, but not to the end of input. Try some more.
michael@0 2920 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 2921 break;
michael@0 2922 }
michael@0 2923 isMatch = TRUE;
michael@0 2924 goto breakFromLoop;
michael@0 2925
michael@0 2926 // Start and End Capture stack frame variables are laid out out like this:
michael@0 2927 // fp->fExtra[opValue] - The start of a completed capture group
michael@0 2928 // opValue+1 - The end of a completed capture group
michael@0 2929 // opValue+2 - the start of a capture group whose end
michael@0 2930 // has not yet been reached (and might not ever be).
michael@0 2931 case URX_START_CAPTURE:
michael@0 2932 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
michael@0 2933 fp->fExtra[opValue+2] = fp->fInputIdx;
michael@0 2934 break;
michael@0 2935
michael@0 2936
michael@0 2937 case URX_END_CAPTURE:
michael@0 2938 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
michael@0 2939 U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set.
michael@0 2940 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real.
michael@0 2941 fp->fExtra[opValue+1] = fp->fInputIdx; // End position
michael@0 2942 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
michael@0 2943 break;
michael@0 2944
michael@0 2945
michael@0 2946 case URX_DOLLAR: // $, test for End of line
michael@0 2947 // or for position before new line at end of input
michael@0 2948 {
michael@0 2949 if (fp->fInputIdx >= fAnchorLimit) {
michael@0 2950 // We really are at the end of input. Success.
michael@0 2951 fHitEnd = TRUE;
michael@0 2952 fRequireEnd = TRUE;
michael@0 2953 break;
michael@0 2954 }
michael@0 2955
michael@0 2956 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
michael@0 2957
michael@0 2958 // If we are positioned just before a new-line that is located at the
michael@0 2959 // end of input, succeed.
michael@0 2960 UChar32 c = UTEXT_NEXT32(fInputText);
michael@0 2961 if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
michael@0 2962 if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) {
michael@0 2963 // If not in the middle of a CR/LF sequence
michael@0 2964 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) {
michael@0 2965 // At new-line at end of input. Success
michael@0 2966 fHitEnd = TRUE;
michael@0 2967 fRequireEnd = TRUE;
michael@0 2968
michael@0 2969 break;
michael@0 2970 }
michael@0 2971 }
michael@0 2972 } else {
michael@0 2973 UChar32 nextC = UTEXT_NEXT32(fInputText);
michael@0 2974 if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
michael@0 2975 fHitEnd = TRUE;
michael@0 2976 fRequireEnd = TRUE;
michael@0 2977 break; // At CR/LF at end of input. Success
michael@0 2978 }
michael@0 2979 }
michael@0 2980
michael@0 2981 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 2982 }
michael@0 2983 break;
michael@0 2984
michael@0 2985
michael@0 2986 case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode.
michael@0 2987 if (fp->fInputIdx >= fAnchorLimit) {
michael@0 2988 // Off the end of input. Success.
michael@0 2989 fHitEnd = TRUE;
michael@0 2990 fRequireEnd = TRUE;
michael@0 2991 break;
michael@0 2992 } else {
michael@0 2993 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
michael@0 2994 UChar32 c = UTEXT_NEXT32(fInputText);
michael@0 2995 // Either at the last character of input, or off the end.
michael@0 2996 if (c == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) == fAnchorLimit) {
michael@0 2997 fHitEnd = TRUE;
michael@0 2998 fRequireEnd = TRUE;
michael@0 2999 break;
michael@0 3000 }
michael@0 3001 }
michael@0 3002
michael@0 3003 // Not at end of input. Back-track out.
michael@0 3004 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 3005 break;
michael@0 3006
michael@0 3007
michael@0 3008 case URX_DOLLAR_M: // $, test for End of line in multi-line mode
michael@0 3009 {
michael@0 3010 if (fp->fInputIdx >= fAnchorLimit) {
michael@0 3011 // We really are at the end of input. Success.
michael@0 3012 fHitEnd = TRUE;
michael@0 3013 fRequireEnd = TRUE;
michael@0 3014 break;
michael@0 3015 }
michael@0 3016 // If we are positioned just before a new-line, succeed.
michael@0 3017 // It makes no difference where the new-line is within the input.
michael@0 3018 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
michael@0 3019 UChar32 c = UTEXT_CURRENT32(fInputText);
michael@0 3020 if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) {
michael@0 3021 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence
michael@0 3022 // In multi-line mode, hitting a new-line just before the end of input does not
michael@0 3023 // set the hitEnd or requireEnd flags
michael@0 3024 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && UTEXT_PREVIOUS32(fInputText)==0x0d)) {
michael@0 3025 break;
michael@0 3026 }
michael@0 3027 }
michael@0 3028 // not at a new line. Fail.
michael@0 3029 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 3030 }
michael@0 3031 break;
michael@0 3032
michael@0 3033
michael@0 3034 case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode
michael@0 3035 {
michael@0 3036 if (fp->fInputIdx >= fAnchorLimit) {
michael@0 3037 // We really are at the end of input. Success.
michael@0 3038 fHitEnd = TRUE;
michael@0 3039 fRequireEnd = TRUE; // Java set requireEnd in this case, even though
michael@0 3040 break; // adding a new-line would not lose the match.
michael@0 3041 }
michael@0 3042 // If we are not positioned just before a new-line, the test fails; backtrack out.
michael@0 3043 // It makes no difference where the new-line is within the input.
michael@0 3044 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
michael@0 3045 if (UTEXT_CURRENT32(fInputText) != 0x0a) {
michael@0 3046 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 3047 }
michael@0 3048 }
michael@0 3049 break;
michael@0 3050
michael@0 3051
michael@0 3052 case URX_CARET: // ^, test for start of line
michael@0 3053 if (fp->fInputIdx != fAnchorStart) {
michael@0 3054 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 3055 }
michael@0 3056 break;
michael@0 3057
michael@0 3058
michael@0 3059 case URX_CARET_M: // ^, test for start of line in mulit-line mode
michael@0 3060 {
michael@0 3061 if (fp->fInputIdx == fAnchorStart) {
michael@0 3062 // We are at the start input. Success.
michael@0 3063 break;
michael@0 3064 }
michael@0 3065 // Check whether character just before the current pos is a new-line
michael@0 3066 // unless we are at the end of input
michael@0 3067 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
michael@0 3068 UChar32 c = UTEXT_PREVIOUS32(fInputText);
michael@0 3069 if ((fp->fInputIdx < fAnchorLimit) &&
michael@0 3070 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
michael@0 3071 // It's a new-line. ^ is true. Success.
michael@0 3072 // TODO: what should be done with positions between a CR and LF?
michael@0 3073 break;
michael@0 3074 }
michael@0 3075 // Not at the start of a line. Fail.
michael@0 3076 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 3077 }
michael@0 3078 break;
michael@0 3079
michael@0 3080
michael@0 3081 case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode
michael@0 3082 {
michael@0 3083 U_ASSERT(fp->fInputIdx >= fAnchorStart);
michael@0 3084 if (fp->fInputIdx <= fAnchorStart) {
michael@0 3085 // We are at the start input. Success.
michael@0 3086 break;
michael@0 3087 }
michael@0 3088 // Check whether character just before the current pos is a new-line
michael@0 3089 U_ASSERT(fp->fInputIdx <= fAnchorLimit);
michael@0 3090 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
michael@0 3091 UChar32 c = UTEXT_PREVIOUS32(fInputText);
michael@0 3092 if (c != 0x0a) {
michael@0 3093 // Not at the start of a line. Back-track out.
michael@0 3094 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 3095 }
michael@0 3096 }
michael@0 3097 break;
michael@0 3098
michael@0 3099 case URX_BACKSLASH_B: // Test for word boundaries
michael@0 3100 {
michael@0 3101 UBool success = isWordBoundary(fp->fInputIdx);
michael@0 3102 success ^= (UBool)(opValue != 0); // flip sense for \B
michael@0 3103 if (!success) {
michael@0 3104 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 3105 }
michael@0 3106 }
michael@0 3107 break;
michael@0 3108
michael@0 3109
michael@0 3110 case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style
michael@0 3111 {
michael@0 3112 UBool success = isUWordBoundary(fp->fInputIdx);
michael@0 3113 success ^= (UBool)(opValue != 0); // flip sense for \B
michael@0 3114 if (!success) {
michael@0 3115 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 3116 }
michael@0 3117 }
michael@0 3118 break;
michael@0 3119
michael@0 3120
michael@0 3121 case URX_BACKSLASH_D: // Test for decimal digit
michael@0 3122 {
michael@0 3123 if (fp->fInputIdx >= fActiveLimit) {
michael@0 3124 fHitEnd = TRUE;
michael@0 3125 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 3126 break;
michael@0 3127 }
michael@0 3128
michael@0 3129 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
michael@0 3130
michael@0 3131 UChar32 c = UTEXT_NEXT32(fInputText);
michael@0 3132 int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster.
michael@0 3133 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
michael@0 3134 success ^= (UBool)(opValue != 0); // flip sense for \D
michael@0 3135 if (success) {
michael@0 3136 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
michael@0 3137 } else {
michael@0 3138 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 3139 }
michael@0 3140 }
michael@0 3141 break;
michael@0 3142
michael@0 3143
michael@0 3144 case URX_BACKSLASH_G: // Test for position at end of previous match
michael@0 3145 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
michael@0 3146 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 3147 }
michael@0 3148 break;
michael@0 3149
michael@0 3150
michael@0 3151 case URX_BACKSLASH_X:
michael@0 3152 // Match a Grapheme, as defined by Unicode TR 29.
michael@0 3153 // Differs slightly from Perl, which consumes combining marks independently
michael@0 3154 // of context.
michael@0 3155 {
michael@0 3156
michael@0 3157 // Fail if at end of input
michael@0 3158 if (fp->fInputIdx >= fActiveLimit) {
michael@0 3159 fHitEnd = TRUE;
michael@0 3160 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 3161 break;
michael@0 3162 }
michael@0 3163
michael@0 3164 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
michael@0 3165
michael@0 3166 // Examine (and consume) the current char.
michael@0 3167 // Dispatch into a little state machine, based on the char.
michael@0 3168 UChar32 c;
michael@0 3169 c = UTEXT_NEXT32(fInputText);
michael@0 3170 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
michael@0 3171 UnicodeSet **sets = fPattern->fStaticSets;
michael@0 3172 if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend;
michael@0 3173 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
michael@0 3174 if (sets[URX_GC_L]->contains(c)) goto GC_L;
michael@0 3175 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
michael@0 3176 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
michael@0 3177 if (sets[URX_GC_V]->contains(c)) goto GC_V;
michael@0 3178 if (sets[URX_GC_T]->contains(c)) goto GC_T;
michael@0 3179 goto GC_Extend;
michael@0 3180
michael@0 3181
michael@0 3182
michael@0 3183 GC_L:
michael@0 3184 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
michael@0 3185 c = UTEXT_NEXT32(fInputText);
michael@0 3186 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
michael@0 3187 if (sets[URX_GC_L]->contains(c)) goto GC_L;
michael@0 3188 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
michael@0 3189 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
michael@0 3190 if (sets[URX_GC_V]->contains(c)) goto GC_V;
michael@0 3191 (void)UTEXT_PREVIOUS32(fInputText);
michael@0 3192 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
michael@0 3193 goto GC_Extend;
michael@0 3194
michael@0 3195 GC_V:
michael@0 3196 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
michael@0 3197 c = UTEXT_NEXT32(fInputText);
michael@0 3198 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
michael@0 3199 if (sets[URX_GC_V]->contains(c)) goto GC_V;
michael@0 3200 if (sets[URX_GC_T]->contains(c)) goto GC_T;
michael@0 3201 (void)UTEXT_PREVIOUS32(fInputText);
michael@0 3202 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
michael@0 3203 goto GC_Extend;
michael@0 3204
michael@0 3205 GC_T:
michael@0 3206 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
michael@0 3207 c = UTEXT_NEXT32(fInputText);
michael@0 3208 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
michael@0 3209 if (sets[URX_GC_T]->contains(c)) goto GC_T;
michael@0 3210 (void)UTEXT_PREVIOUS32(fInputText);
michael@0 3211 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
michael@0 3212 goto GC_Extend;
michael@0 3213
michael@0 3214 GC_Extend:
michael@0 3215 // Combining characters are consumed here
michael@0 3216 for (;;) {
michael@0 3217 if (fp->fInputIdx >= fActiveLimit) {
michael@0 3218 break;
michael@0 3219 }
michael@0 3220 c = UTEXT_CURRENT32(fInputText);
michael@0 3221 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
michael@0 3222 break;
michael@0 3223 }
michael@0 3224 (void)UTEXT_NEXT32(fInputText);
michael@0 3225 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
michael@0 3226 }
michael@0 3227 goto GC_Done;
michael@0 3228
michael@0 3229 GC_Control:
michael@0 3230 // Most control chars stand alone (don't combine with combining chars),
michael@0 3231 // except for that CR/LF sequence is a single grapheme cluster.
michael@0 3232 if (c == 0x0d && fp->fInputIdx < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
michael@0 3233 c = UTEXT_NEXT32(fInputText);
michael@0 3234 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
michael@0 3235 }
michael@0 3236
michael@0 3237 GC_Done:
michael@0 3238 if (fp->fInputIdx >= fActiveLimit) {
michael@0 3239 fHitEnd = TRUE;
michael@0 3240 }
michael@0 3241 break;
michael@0 3242 }
michael@0 3243
michael@0 3244
michael@0 3245
michael@0 3246
michael@0 3247 case URX_BACKSLASH_Z: // Test for end of Input
michael@0 3248 if (fp->fInputIdx < fAnchorLimit) {
michael@0 3249 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 3250 } else {
michael@0 3251 fHitEnd = TRUE;
michael@0 3252 fRequireEnd = TRUE;
michael@0 3253 }
michael@0 3254 break;
michael@0 3255
michael@0 3256
michael@0 3257
michael@0 3258 case URX_STATIC_SETREF:
michael@0 3259 {
michael@0 3260 // Test input character against one of the predefined sets
michael@0 3261 // (Word Characters, for example)
michael@0 3262 // The high bit of the op value is a flag for the match polarity.
michael@0 3263 // 0: success if input char is in set.
michael@0 3264 // 1: success if input char is not in set.
michael@0 3265 if (fp->fInputIdx >= fActiveLimit) {
michael@0 3266 fHitEnd = TRUE;
michael@0 3267 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 3268 break;
michael@0 3269 }
michael@0 3270
michael@0 3271 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
michael@0 3272 opValue &= ~URX_NEG_SET;
michael@0 3273 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
michael@0 3274
michael@0 3275 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
michael@0 3276 UChar32 c = UTEXT_NEXT32(fInputText);
michael@0 3277 if (c < 256) {
michael@0 3278 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
michael@0 3279 if (s8->contains(c)) {
michael@0 3280 success = !success;
michael@0 3281 }
michael@0 3282 } else {
michael@0 3283 const UnicodeSet *s = fPattern->fStaticSets[opValue];
michael@0 3284 if (s->contains(c)) {
michael@0 3285 success = !success;
michael@0 3286 }
michael@0 3287 }
michael@0 3288 if (success) {
michael@0 3289 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
michael@0 3290 } else {
michael@0 3291 // the character wasn't in the set.
michael@0 3292 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 3293 }
michael@0 3294 }
michael@0 3295 break;
michael@0 3296
michael@0 3297
michael@0 3298 case URX_STAT_SETREF_N:
michael@0 3299 {
michael@0 3300 // Test input character for NOT being a member of one of
michael@0 3301 // the predefined sets (Word Characters, for example)
michael@0 3302 if (fp->fInputIdx >= fActiveLimit) {
michael@0 3303 fHitEnd = TRUE;
michael@0 3304 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 3305 break;
michael@0 3306 }
michael@0 3307
michael@0 3308 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
michael@0 3309
michael@0 3310 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
michael@0 3311
michael@0 3312 UChar32 c = UTEXT_NEXT32(fInputText);
michael@0 3313 if (c < 256) {
michael@0 3314 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
michael@0 3315 if (s8->contains(c) == FALSE) {
michael@0 3316 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
michael@0 3317 break;
michael@0 3318 }
michael@0 3319 } else {
michael@0 3320 const UnicodeSet *s = fPattern->fStaticSets[opValue];
michael@0 3321 if (s->contains(c) == FALSE) {
michael@0 3322 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
michael@0 3323 break;
michael@0 3324 }
michael@0 3325 }
michael@0 3326 // the character was in the set, so this negated set reference fails.
michael@0 3327 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 3328 }
michael@0 3329 break;
michael@0 3330
michael@0 3331
michael@0 3332 case URX_SETREF:
michael@0 3333 if (fp->fInputIdx >= fActiveLimit) {
michael@0 3334 fHitEnd = TRUE;
michael@0 3335 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 3336 break;
michael@0 3337 } else {
michael@0 3338 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
michael@0 3339
michael@0 3340 // There is input left. Pick up one char and test it for set membership.
michael@0 3341 UChar32 c = UTEXT_NEXT32(fInputText);
michael@0 3342 U_ASSERT(opValue > 0 && opValue < sets->size());
michael@0 3343 if (c<256) {
michael@0 3344 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
michael@0 3345 if (s8->contains(c)) {
michael@0 3346 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
michael@0 3347 break;
michael@0 3348 }
michael@0 3349 } else {
michael@0 3350 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
michael@0 3351 if (s->contains(c)) {
michael@0 3352 // The character is in the set. A Match.
michael@0 3353 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
michael@0 3354 break;
michael@0 3355 }
michael@0 3356 }
michael@0 3357
michael@0 3358 // the character wasn't in the set.
michael@0 3359 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 3360 }
michael@0 3361 break;
michael@0 3362
michael@0 3363
michael@0 3364 case URX_DOTANY:
michael@0 3365 {
michael@0 3366 // . matches anything, but stops at end-of-line.
michael@0 3367 if (fp->fInputIdx >= fActiveLimit) {
michael@0 3368 // At end of input. Match failed. Backtrack out.
michael@0 3369 fHitEnd = TRUE;
michael@0 3370 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 3371 break;
michael@0 3372 }
michael@0 3373
michael@0 3374 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
michael@0 3375
michael@0 3376 // There is input left. Advance over one char, unless we've hit end-of-line
michael@0 3377 UChar32 c = UTEXT_NEXT32(fInputText);
michael@0 3378 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
michael@0 3379 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
michael@0 3380 // End of line in normal mode. . does not match.
michael@0 3381 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 3382 break;
michael@0 3383 }
michael@0 3384 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
michael@0 3385 }
michael@0 3386 break;
michael@0 3387
michael@0 3388
michael@0 3389 case URX_DOTANY_ALL:
michael@0 3390 {
michael@0 3391 // ., in dot-matches-all (including new lines) mode
michael@0 3392 if (fp->fInputIdx >= fActiveLimit) {
michael@0 3393 // At end of input. Match failed. Backtrack out.
michael@0 3394 fHitEnd = TRUE;
michael@0 3395 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 3396 break;
michael@0 3397 }
michael@0 3398
michael@0 3399 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
michael@0 3400
michael@0 3401 // There is input left. Advance over one char, except if we are
michael@0 3402 // at a cr/lf, advance over both of them.
michael@0 3403 UChar32 c;
michael@0 3404 c = UTEXT_NEXT32(fInputText);
michael@0 3405 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
michael@0 3406 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
michael@0 3407 // In the case of a CR/LF, we need to advance over both.
michael@0 3408 UChar32 nextc = UTEXT_CURRENT32(fInputText);
michael@0 3409 if (nextc == 0x0a) {
michael@0 3410 (void)UTEXT_NEXT32(fInputText);
michael@0 3411 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
michael@0 3412 }
michael@0 3413 }
michael@0 3414 }
michael@0 3415 break;
michael@0 3416
michael@0 3417
michael@0 3418 case URX_DOTANY_UNIX:
michael@0 3419 {
michael@0 3420 // '.' operator, matches all, but stops at end-of-line.
michael@0 3421 // UNIX_LINES mode, so 0x0a is the only recognized line ending.
michael@0 3422 if (fp->fInputIdx >= fActiveLimit) {
michael@0 3423 // At end of input. Match failed. Backtrack out.
michael@0 3424 fHitEnd = TRUE;
michael@0 3425 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 3426 break;
michael@0 3427 }
michael@0 3428
michael@0 3429 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
michael@0 3430
michael@0 3431 // There is input left. Advance over one char, unless we've hit end-of-line
michael@0 3432 UChar32 c = UTEXT_NEXT32(fInputText);
michael@0 3433 if (c == 0x0a) {
michael@0 3434 // End of line in UNIX_LINES mode. '.' does not match the \n
michael@0 3435 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 3436 } else {
michael@0 3437 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
michael@0 3438 }
michael@0 3439 }
michael@0 3440 break;
michael@0 3441
michael@0 3442
michael@0 3443 case URX_JMP:
michael@0 3444 fp->fPatIdx = opValue;
michael@0 3445 break;
michael@0 3446
michael@0 3447 case URX_FAIL:
michael@0 3448 isMatch = FALSE;
michael@0 3449 goto breakFromLoop;
michael@0 3450
michael@0 3451 case URX_JMP_SAV:
michael@0 3452 U_ASSERT(opValue < fPattern->fCompiledPat->size());
michael@0 3453 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
michael@0 3454 fp->fPatIdx = opValue; // Then JMP.
michael@0 3455 break;
michael@0 3456
michael@0 3457 case URX_JMP_SAV_X:
michael@0 3458 // This opcode is used with (x)+, when x can match a zero length string.
michael@0 3459 // Same as JMP_SAV, except conditional on the match having made forward progress.
michael@0 3460 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
michael@0 3461 // data address of the input position at the start of the loop.
michael@0 3462 {
michael@0 3463 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
michael@0 3464 int32_t stoOp = (int32_t)pat[opValue-1];
michael@0 3465 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
michael@0 3466 int32_t frameLoc = URX_VAL(stoOp);
michael@0 3467 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
michael@0 3468 int64_t prevInputIdx = fp->fExtra[frameLoc];
michael@0 3469 U_ASSERT(prevInputIdx <= fp->fInputIdx);
michael@0 3470 if (prevInputIdx < fp->fInputIdx) {
michael@0 3471 // The match did make progress. Repeat the loop.
michael@0 3472 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
michael@0 3473 fp->fPatIdx = opValue;
michael@0 3474 fp->fExtra[frameLoc] = fp->fInputIdx;
michael@0 3475 }
michael@0 3476 // If the input position did not advance, we do nothing here,
michael@0 3477 // execution will fall out of the loop.
michael@0 3478 }
michael@0 3479 break;
michael@0 3480
michael@0 3481 case URX_CTR_INIT:
michael@0 3482 {
michael@0 3483 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
michael@0 3484 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
michael@0 3485
michael@0 3486 // Pick up the three extra operands that CTR_INIT has, and
michael@0 3487 // skip the pattern location counter past
michael@0 3488 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
michael@0 3489 fp->fPatIdx += 3;
michael@0 3490 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
michael@0 3491 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
michael@0 3492 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
michael@0 3493 U_ASSERT(minCount>=0);
michael@0 3494 U_ASSERT(maxCount>=minCount || maxCount==-1);
michael@0 3495 U_ASSERT(loopLoc>=fp->fPatIdx);
michael@0 3496
michael@0 3497 if (minCount == 0) {
michael@0 3498 fp = StateSave(fp, loopLoc+1, status);
michael@0 3499 }
michael@0 3500 if (maxCount == -1) {
michael@0 3501 fp->fExtra[opValue+1] = fp->fInputIdx; // For loop breaking.
michael@0 3502 } else if (maxCount == 0) {
michael@0 3503 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 3504 }
michael@0 3505 }
michael@0 3506 break;
michael@0 3507
michael@0 3508 case URX_CTR_LOOP:
michael@0 3509 {
michael@0 3510 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
michael@0 3511 int32_t initOp = (int32_t)pat[opValue];
michael@0 3512 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
michael@0 3513 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
michael@0 3514 int32_t minCount = (int32_t)pat[opValue+2];
michael@0 3515 int32_t maxCount = (int32_t)pat[opValue+3];
michael@0 3516 (*pCounter)++;
michael@0 3517 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
michael@0 3518 U_ASSERT(*pCounter == maxCount);
michael@0 3519 break;
michael@0 3520 }
michael@0 3521 if (*pCounter >= minCount) {
michael@0 3522 if (maxCount == -1) {
michael@0 3523 // Loop has no hard upper bound.
michael@0 3524 // Check that it is progressing through the input, break if it is not.
michael@0 3525 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1];
michael@0 3526 if (fp->fInputIdx == *pLastInputIdx) {
michael@0 3527 break;
michael@0 3528 } else {
michael@0 3529 *pLastInputIdx = fp->fInputIdx;
michael@0 3530 }
michael@0 3531 }
michael@0 3532 fp = StateSave(fp, fp->fPatIdx, status);
michael@0 3533 }
michael@0 3534 fp->fPatIdx = opValue + 4; // Loop back.
michael@0 3535 }
michael@0 3536 break;
michael@0 3537
michael@0 3538 case URX_CTR_INIT_NG:
michael@0 3539 {
michael@0 3540 // Initialize a non-greedy loop
michael@0 3541 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
michael@0 3542 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
michael@0 3543
michael@0 3544 // Pick up the three extra operands that CTR_INIT_NG has, and
michael@0 3545 // skip the pattern location counter past
michael@0 3546 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
michael@0 3547 fp->fPatIdx += 3;
michael@0 3548 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
michael@0 3549 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
michael@0 3550 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
michael@0 3551 U_ASSERT(minCount>=0);
michael@0 3552 U_ASSERT(maxCount>=minCount || maxCount==-1);
michael@0 3553 U_ASSERT(loopLoc>fp->fPatIdx);
michael@0 3554 if (maxCount == -1) {
michael@0 3555 fp->fExtra[opValue+1] = fp->fInputIdx; // Save initial input index for loop breaking.
michael@0 3556 }
michael@0 3557
michael@0 3558 if (minCount == 0) {
michael@0 3559 if (maxCount != 0) {
michael@0 3560 fp = StateSave(fp, fp->fPatIdx, status);
michael@0 3561 }
michael@0 3562 fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block
michael@0 3563 }
michael@0 3564 }
michael@0 3565 break;
michael@0 3566
michael@0 3567 case URX_CTR_LOOP_NG:
michael@0 3568 {
michael@0 3569 // Non-greedy {min, max} loops
michael@0 3570 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
michael@0 3571 int32_t initOp = (int32_t)pat[opValue];
michael@0 3572 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
michael@0 3573 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
michael@0 3574 int32_t minCount = (int32_t)pat[opValue+2];
michael@0 3575 int32_t maxCount = (int32_t)pat[opValue+3];
michael@0 3576
michael@0 3577 (*pCounter)++;
michael@0 3578 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
michael@0 3579 // The loop has matched the maximum permitted number of times.
michael@0 3580 // Break out of here with no action. Matching will
michael@0 3581 // continue with the following pattern.
michael@0 3582 U_ASSERT(*pCounter == maxCount);
michael@0 3583 break;
michael@0 3584 }
michael@0 3585
michael@0 3586 if (*pCounter < minCount) {
michael@0 3587 // We haven't met the minimum number of matches yet.
michael@0 3588 // Loop back for another one.
michael@0 3589 fp->fPatIdx = opValue + 4; // Loop back.
michael@0 3590 } else {
michael@0 3591 // We do have the minimum number of matches.
michael@0 3592
michael@0 3593 // If there is no upper bound on the loop iterations, check that the input index
michael@0 3594 // is progressing, and stop the loop if it is not.
michael@0 3595 if (maxCount == -1) {
michael@0 3596 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1];
michael@0 3597 if (fp->fInputIdx == *pLastInputIdx) {
michael@0 3598 break;
michael@0 3599 }
michael@0 3600 *pLastInputIdx = fp->fInputIdx;
michael@0 3601 }
michael@0 3602
michael@0 3603 // Loop Continuation: we will fall into the pattern following the loop
michael@0 3604 // (non-greedy, don't execute loop body first), but first do
michael@0 3605 // a state save to the top of the loop, so that a match failure
michael@0 3606 // in the following pattern will try another iteration of the loop.
michael@0 3607 fp = StateSave(fp, opValue + 4, status);
michael@0 3608 }
michael@0 3609 }
michael@0 3610 break;
michael@0 3611
michael@0 3612 case URX_STO_SP:
michael@0 3613 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
michael@0 3614 fData[opValue] = fStack->size();
michael@0 3615 break;
michael@0 3616
michael@0 3617 case URX_LD_SP:
michael@0 3618 {
michael@0 3619 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
michael@0 3620 int32_t newStackSize = (int32_t)fData[opValue];
michael@0 3621 U_ASSERT(newStackSize <= fStack->size());
michael@0 3622 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
michael@0 3623 if (newFP == (int64_t *)fp) {
michael@0 3624 break;
michael@0 3625 }
michael@0 3626 int32_t i;
michael@0 3627 for (i=0; i<fFrameSize; i++) {
michael@0 3628 newFP[i] = ((int64_t *)fp)[i];
michael@0 3629 }
michael@0 3630 fp = (REStackFrame *)newFP;
michael@0 3631 fStack->setSize(newStackSize);
michael@0 3632 }
michael@0 3633 break;
michael@0 3634
michael@0 3635 case URX_BACKREF:
michael@0 3636 {
michael@0 3637 U_ASSERT(opValue < fFrameSize);
michael@0 3638 int64_t groupStartIdx = fp->fExtra[opValue];
michael@0 3639 int64_t groupEndIdx = fp->fExtra[opValue+1];
michael@0 3640 U_ASSERT(groupStartIdx <= groupEndIdx);
michael@0 3641 if (groupStartIdx < 0) {
michael@0 3642 // This capture group has not participated in the match thus far,
michael@0 3643 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
michael@0 3644 break;
michael@0 3645 }
michael@0 3646 UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx);
michael@0 3647 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
michael@0 3648
michael@0 3649 // Note: if the capture group match was of an empty string the backref
michael@0 3650 // match succeeds. Verified by testing: Perl matches succeed
michael@0 3651 // in this case, so we do too.
michael@0 3652
michael@0 3653 UBool success = TRUE;
michael@0 3654 for (;;) {
michael@0 3655 if (utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
michael@0 3656 success = TRUE;
michael@0 3657 break;
michael@0 3658 }
michael@0 3659 if (utext_getNativeIndex(fInputText) >= fActiveLimit) {
michael@0 3660 success = FALSE;
michael@0 3661 fHitEnd = TRUE;
michael@0 3662 break;
michael@0 3663 }
michael@0 3664 UChar32 captureGroupChar = utext_next32(fAltInputText);
michael@0 3665 UChar32 inputChar = utext_next32(fInputText);
michael@0 3666 if (inputChar != captureGroupChar) {
michael@0 3667 success = FALSE;
michael@0 3668 break;
michael@0 3669 }
michael@0 3670 }
michael@0 3671
michael@0 3672 if (success) {
michael@0 3673 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
michael@0 3674 } else {
michael@0 3675 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 3676 }
michael@0 3677 }
michael@0 3678 break;
michael@0 3679
michael@0 3680
michael@0 3681
michael@0 3682 case URX_BACKREF_I:
michael@0 3683 {
michael@0 3684 U_ASSERT(opValue < fFrameSize);
michael@0 3685 int64_t groupStartIdx = fp->fExtra[opValue];
michael@0 3686 int64_t groupEndIdx = fp->fExtra[opValue+1];
michael@0 3687 U_ASSERT(groupStartIdx <= groupEndIdx);
michael@0 3688 if (groupStartIdx < 0) {
michael@0 3689 // This capture group has not participated in the match thus far,
michael@0 3690 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
michael@0 3691 break;
michael@0 3692 }
michael@0 3693 utext_setNativeIndex(fAltInputText, groupStartIdx);
michael@0 3694 utext_setNativeIndex(fInputText, fp->fInputIdx);
michael@0 3695 CaseFoldingUTextIterator captureGroupItr(*fAltInputText);
michael@0 3696 CaseFoldingUTextIterator inputItr(*fInputText);
michael@0 3697
michael@0 3698 // Note: if the capture group match was of an empty string the backref
michael@0 3699 // match succeeds. Verified by testing: Perl matches succeed
michael@0 3700 // in this case, so we do too.
michael@0 3701
michael@0 3702 UBool success = TRUE;
michael@0 3703 for (;;) {
michael@0 3704 if (!captureGroupItr.inExpansion() && utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
michael@0 3705 success = TRUE;
michael@0 3706 break;
michael@0 3707 }
michael@0 3708 if (!inputItr.inExpansion() && utext_getNativeIndex(fInputText) >= fActiveLimit) {
michael@0 3709 success = FALSE;
michael@0 3710 fHitEnd = TRUE;
michael@0 3711 break;
michael@0 3712 }
michael@0 3713 UChar32 captureGroupChar = captureGroupItr.next();
michael@0 3714 UChar32 inputChar = inputItr.next();
michael@0 3715 if (inputChar != captureGroupChar) {
michael@0 3716 success = FALSE;
michael@0 3717 break;
michael@0 3718 }
michael@0 3719 }
michael@0 3720
michael@0 3721 if (success && inputItr.inExpansion()) {
michael@0 3722 // We obtained a match by consuming part of a string obtained from
michael@0 3723 // case-folding a single code point of the input text.
michael@0 3724 // This does not count as an overall match.
michael@0 3725 success = FALSE;
michael@0 3726 }
michael@0 3727
michael@0 3728 if (success) {
michael@0 3729 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
michael@0 3730 } else {
michael@0 3731 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 3732 }
michael@0 3733
michael@0 3734 }
michael@0 3735 break;
michael@0 3736
michael@0 3737 case URX_STO_INP_LOC:
michael@0 3738 {
michael@0 3739 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
michael@0 3740 fp->fExtra[opValue] = fp->fInputIdx;
michael@0 3741 }
michael@0 3742 break;
michael@0 3743
michael@0 3744 case URX_JMPX:
michael@0 3745 {
michael@0 3746 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
michael@0 3747 fp->fPatIdx += 1;
michael@0 3748 int32_t dataLoc = URX_VAL(pat[instrOperandLoc]);
michael@0 3749 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
michael@0 3750 int64_t savedInputIdx = fp->fExtra[dataLoc];
michael@0 3751 U_ASSERT(savedInputIdx <= fp->fInputIdx);
michael@0 3752 if (savedInputIdx < fp->fInputIdx) {
michael@0 3753 fp->fPatIdx = opValue; // JMP
michael@0 3754 } else {
michael@0 3755 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop.
michael@0 3756 }
michael@0 3757 }
michael@0 3758 break;
michael@0 3759
michael@0 3760 case URX_LA_START:
michael@0 3761 {
michael@0 3762 // Entering a lookahead block.
michael@0 3763 // Save Stack Ptr, Input Pos.
michael@0 3764 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
michael@0 3765 fData[opValue] = fStack->size();
michael@0 3766 fData[opValue+1] = fp->fInputIdx;
michael@0 3767 fActiveStart = fLookStart; // Set the match region change for
michael@0 3768 fActiveLimit = fLookLimit; // transparent bounds.
michael@0 3769 }
michael@0 3770 break;
michael@0 3771
michael@0 3772 case URX_LA_END:
michael@0 3773 {
michael@0 3774 // Leaving a look-ahead block.
michael@0 3775 // restore Stack Ptr, Input Pos to positions they had on entry to block.
michael@0 3776 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
michael@0 3777 int32_t stackSize = fStack->size();
michael@0 3778 int32_t newStackSize =(int32_t)fData[opValue];
michael@0 3779 U_ASSERT(stackSize >= newStackSize);
michael@0 3780 if (stackSize > newStackSize) {
michael@0 3781 // Copy the current top frame back to the new (cut back) top frame.
michael@0 3782 // This makes the capture groups from within the look-ahead
michael@0 3783 // expression available.
michael@0 3784 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
michael@0 3785 int32_t i;
michael@0 3786 for (i=0; i<fFrameSize; i++) {
michael@0 3787 newFP[i] = ((int64_t *)fp)[i];
michael@0 3788 }
michael@0 3789 fp = (REStackFrame *)newFP;
michael@0 3790 fStack->setSize(newStackSize);
michael@0 3791 }
michael@0 3792 fp->fInputIdx = fData[opValue+1];
michael@0 3793
michael@0 3794 // Restore the active region bounds in the input string; they may have
michael@0 3795 // been changed because of transparent bounds on a Region.
michael@0 3796 fActiveStart = fRegionStart;
michael@0 3797 fActiveLimit = fRegionLimit;
michael@0 3798 }
michael@0 3799 break;
michael@0 3800
michael@0 3801 case URX_ONECHAR_I:
michael@0 3802 // Case insensitive one char. The char from the pattern is already case folded.
michael@0 3803 // Input text is not, but case folding the input can not reduce two or more code
michael@0 3804 // points to one.
michael@0 3805 if (fp->fInputIdx < fActiveLimit) {
michael@0 3806 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
michael@0 3807
michael@0 3808 UChar32 c = UTEXT_NEXT32(fInputText);
michael@0 3809 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
michael@0 3810 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
michael@0 3811 break;
michael@0 3812 }
michael@0 3813 } else {
michael@0 3814 fHitEnd = TRUE;
michael@0 3815 }
michael@0 3816
michael@0 3817 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 3818 break;
michael@0 3819
michael@0 3820 case URX_STRING_I:
michael@0 3821 {
michael@0 3822 // Case-insensitive test input against a literal string.
michael@0 3823 // Strings require two slots in the compiled pattern, one for the
michael@0 3824 // offset to the string text, and one for the length.
michael@0 3825 // The compiled string has already been case folded.
michael@0 3826 {
michael@0 3827 const UChar *patternString = litText + opValue;
michael@0 3828 int32_t patternStringIdx = 0;
michael@0 3829
michael@0 3830 op = (int32_t)pat[fp->fPatIdx];
michael@0 3831 fp->fPatIdx++;
michael@0 3832 opType = URX_TYPE(op);
michael@0 3833 opValue = URX_VAL(op);
michael@0 3834 U_ASSERT(opType == URX_STRING_LEN);
michael@0 3835 int32_t patternStringLen = opValue; // Length of the string from the pattern.
michael@0 3836
michael@0 3837
michael@0 3838 UChar32 cPattern;
michael@0 3839 UChar32 cText;
michael@0 3840 UBool success = TRUE;
michael@0 3841
michael@0 3842 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
michael@0 3843 CaseFoldingUTextIterator inputIterator(*fInputText);
michael@0 3844 while (patternStringIdx < patternStringLen) {
michael@0 3845 if (!inputIterator.inExpansion() && UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
michael@0 3846 success = FALSE;
michael@0 3847 fHitEnd = TRUE;
michael@0 3848 break;
michael@0 3849 }
michael@0 3850 U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
michael@0 3851 cText = inputIterator.next();
michael@0 3852 if (cText != cPattern) {
michael@0 3853 success = FALSE;
michael@0 3854 break;
michael@0 3855 }
michael@0 3856 }
michael@0 3857 if (inputIterator.inExpansion()) {
michael@0 3858 success = FALSE;
michael@0 3859 }
michael@0 3860
michael@0 3861 if (success) {
michael@0 3862 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
michael@0 3863 } else {
michael@0 3864 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 3865 }
michael@0 3866 }
michael@0 3867 }
michael@0 3868 break;
michael@0 3869
michael@0 3870 case URX_LB_START:
michael@0 3871 {
michael@0 3872 // Entering a look-behind block.
michael@0 3873 // Save Stack Ptr, Input Pos.
michael@0 3874 // TODO: implement transparent bounds. Ticket #6067
michael@0 3875 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
michael@0 3876 fData[opValue] = fStack->size();
michael@0 3877 fData[opValue+1] = fp->fInputIdx;
michael@0 3878 // Init the variable containing the start index for attempted matches.
michael@0 3879 fData[opValue+2] = -1;
michael@0 3880 // Save input string length, then reset to pin any matches to end at
michael@0 3881 // the current position.
michael@0 3882 fData[opValue+3] = fActiveLimit;
michael@0 3883 fActiveLimit = fp->fInputIdx;
michael@0 3884 }
michael@0 3885 break;
michael@0 3886
michael@0 3887
michael@0 3888 case URX_LB_CONT:
michael@0 3889 {
michael@0 3890 // Positive Look-Behind, at top of loop checking for matches of LB expression
michael@0 3891 // at all possible input starting positions.
michael@0 3892
michael@0 3893 // Fetch the min and max possible match lengths. They are the operands
michael@0 3894 // of this op in the pattern.
michael@0 3895 int32_t minML = (int32_t)pat[fp->fPatIdx++];
michael@0 3896 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
michael@0 3897 U_ASSERT(minML <= maxML);
michael@0 3898 U_ASSERT(minML >= 0);
michael@0 3899
michael@0 3900 // Fetch (from data) the last input index where a match was attempted.
michael@0 3901 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
michael@0 3902 int64_t *lbStartIdx = &fData[opValue+2];
michael@0 3903 if (*lbStartIdx < 0) {
michael@0 3904 // First time through loop.
michael@0 3905 *lbStartIdx = fp->fInputIdx - minML;
michael@0 3906 } else {
michael@0 3907 // 2nd through nth time through the loop.
michael@0 3908 // Back up start position for match by one.
michael@0 3909 if (*lbStartIdx == 0) {
michael@0 3910 (*lbStartIdx)--;
michael@0 3911 } else {
michael@0 3912 UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx);
michael@0 3913 (void)UTEXT_PREVIOUS32(fInputText);
michael@0 3914 *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
michael@0 3915 }
michael@0 3916 }
michael@0 3917
michael@0 3918 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
michael@0 3919 // We have tried all potential match starting points without
michael@0 3920 // getting a match. Backtrack out, and out of the
michael@0 3921 // Look Behind altogether.
michael@0 3922 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 3923 int64_t restoreInputLen = fData[opValue+3];
michael@0 3924 U_ASSERT(restoreInputLen >= fActiveLimit);
michael@0 3925 U_ASSERT(restoreInputLen <= fInputLength);
michael@0 3926 fActiveLimit = restoreInputLen;
michael@0 3927 break;
michael@0 3928 }
michael@0 3929
michael@0 3930 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
michael@0 3931 // (successful match will fall off the end of the loop.)
michael@0 3932 fp = StateSave(fp, fp->fPatIdx-3, status);
michael@0 3933 fp->fInputIdx = *lbStartIdx;
michael@0 3934 }
michael@0 3935 break;
michael@0 3936
michael@0 3937 case URX_LB_END:
michael@0 3938 // End of a look-behind block, after a successful match.
michael@0 3939 {
michael@0 3940 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
michael@0 3941 if (fp->fInputIdx != fActiveLimit) {
michael@0 3942 // The look-behind expression matched, but the match did not
michael@0 3943 // extend all the way to the point that we are looking behind from.
michael@0 3944 // FAIL out of here, which will take us back to the LB_CONT, which
michael@0 3945 // will retry the match starting at another position or fail
michael@0 3946 // the look-behind altogether, whichever is appropriate.
michael@0 3947 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 3948 break;
michael@0 3949 }
michael@0 3950
michael@0 3951 // Look-behind match is good. Restore the orignal input string length,
michael@0 3952 // which had been truncated to pin the end of the lookbehind match to the
michael@0 3953 // position being looked-behind.
michael@0 3954 int64_t originalInputLen = fData[opValue+3];
michael@0 3955 U_ASSERT(originalInputLen >= fActiveLimit);
michael@0 3956 U_ASSERT(originalInputLen <= fInputLength);
michael@0 3957 fActiveLimit = originalInputLen;
michael@0 3958 }
michael@0 3959 break;
michael@0 3960
michael@0 3961
michael@0 3962 case URX_LBN_CONT:
michael@0 3963 {
michael@0 3964 // Negative Look-Behind, at top of loop checking for matches of LB expression
michael@0 3965 // at all possible input starting positions.
michael@0 3966
michael@0 3967 // Fetch the extra parameters of this op.
michael@0 3968 int32_t minML = (int32_t)pat[fp->fPatIdx++];
michael@0 3969 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
michael@0 3970 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
michael@0 3971 continueLoc = URX_VAL(continueLoc);
michael@0 3972 U_ASSERT(minML <= maxML);
michael@0 3973 U_ASSERT(minML >= 0);
michael@0 3974 U_ASSERT(continueLoc > fp->fPatIdx);
michael@0 3975
michael@0 3976 // Fetch (from data) the last input index where a match was attempted.
michael@0 3977 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
michael@0 3978 int64_t *lbStartIdx = &fData[opValue+2];
michael@0 3979 if (*lbStartIdx < 0) {
michael@0 3980 // First time through loop.
michael@0 3981 *lbStartIdx = fp->fInputIdx - minML;
michael@0 3982 } else {
michael@0 3983 // 2nd through nth time through the loop.
michael@0 3984 // Back up start position for match by one.
michael@0 3985 if (*lbStartIdx == 0) {
michael@0 3986 (*lbStartIdx)--;
michael@0 3987 } else {
michael@0 3988 UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx);
michael@0 3989 (void)UTEXT_PREVIOUS32(fInputText);
michael@0 3990 *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
michael@0 3991 }
michael@0 3992 }
michael@0 3993
michael@0 3994 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
michael@0 3995 // We have tried all potential match starting points without
michael@0 3996 // getting a match, which means that the negative lookbehind as
michael@0 3997 // a whole has succeeded. Jump forward to the continue location
michael@0 3998 int64_t restoreInputLen = fData[opValue+3];
michael@0 3999 U_ASSERT(restoreInputLen >= fActiveLimit);
michael@0 4000 U_ASSERT(restoreInputLen <= fInputLength);
michael@0 4001 fActiveLimit = restoreInputLen;
michael@0 4002 fp->fPatIdx = continueLoc;
michael@0 4003 break;
michael@0 4004 }
michael@0 4005
michael@0 4006 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
michael@0 4007 // (successful match will cause a FAIL out of the loop altogether.)
michael@0 4008 fp = StateSave(fp, fp->fPatIdx-4, status);
michael@0 4009 fp->fInputIdx = *lbStartIdx;
michael@0 4010 }
michael@0 4011 break;
michael@0 4012
michael@0 4013 case URX_LBN_END:
michael@0 4014 // End of a negative look-behind block, after a successful match.
michael@0 4015 {
michael@0 4016 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
michael@0 4017 if (fp->fInputIdx != fActiveLimit) {
michael@0 4018 // The look-behind expression matched, but the match did not
michael@0 4019 // extend all the way to the point that we are looking behind from.
michael@0 4020 // FAIL out of here, which will take us back to the LB_CONT, which
michael@0 4021 // will retry the match starting at another position or succeed
michael@0 4022 // the look-behind altogether, whichever is appropriate.
michael@0 4023 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 4024 break;
michael@0 4025 }
michael@0 4026
michael@0 4027 // Look-behind expression matched, which means look-behind test as
michael@0 4028 // a whole Fails
michael@0 4029
michael@0 4030 // Restore the orignal input string length, which had been truncated
michael@0 4031 // inorder to pin the end of the lookbehind match
michael@0 4032 // to the position being looked-behind.
michael@0 4033 int64_t originalInputLen = fData[opValue+3];
michael@0 4034 U_ASSERT(originalInputLen >= fActiveLimit);
michael@0 4035 U_ASSERT(originalInputLen <= fInputLength);
michael@0 4036 fActiveLimit = originalInputLen;
michael@0 4037
michael@0 4038 // Restore original stack position, discarding any state saved
michael@0 4039 // by the successful pattern match.
michael@0 4040 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
michael@0 4041 int32_t newStackSize = (int32_t)fData[opValue];
michael@0 4042 U_ASSERT(fStack->size() > newStackSize);
michael@0 4043 fStack->setSize(newStackSize);
michael@0 4044
michael@0 4045 // FAIL, which will take control back to someplace
michael@0 4046 // prior to entering the look-behind test.
michael@0 4047 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 4048 }
michael@0 4049 break;
michael@0 4050
michael@0 4051
michael@0 4052 case URX_LOOP_SR_I:
michael@0 4053 // Loop Initialization for the optimized implementation of
michael@0 4054 // [some character set]*
michael@0 4055 // This op scans through all matching input.
michael@0 4056 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
michael@0 4057 {
michael@0 4058 U_ASSERT(opValue > 0 && opValue < sets->size());
michael@0 4059 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
michael@0 4060 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
michael@0 4061
michael@0 4062 // Loop through input, until either the input is exhausted or
michael@0 4063 // we reach a character that is not a member of the set.
michael@0 4064 int64_t ix = fp->fInputIdx;
michael@0 4065 UTEXT_SETNATIVEINDEX(fInputText, ix);
michael@0 4066 for (;;) {
michael@0 4067 if (ix >= fActiveLimit) {
michael@0 4068 fHitEnd = TRUE;
michael@0 4069 break;
michael@0 4070 }
michael@0 4071 UChar32 c = UTEXT_NEXT32(fInputText);
michael@0 4072 if (c<256) {
michael@0 4073 if (s8->contains(c) == FALSE) {
michael@0 4074 break;
michael@0 4075 }
michael@0 4076 } else {
michael@0 4077 if (s->contains(c) == FALSE) {
michael@0 4078 break;
michael@0 4079 }
michael@0 4080 }
michael@0 4081 ix = UTEXT_GETNATIVEINDEX(fInputText);
michael@0 4082 }
michael@0 4083
michael@0 4084 // If there were no matching characters, skip over the loop altogether.
michael@0 4085 // The loop doesn't run at all, a * op always succeeds.
michael@0 4086 if (ix == fp->fInputIdx) {
michael@0 4087 fp->fPatIdx++; // skip the URX_LOOP_C op.
michael@0 4088 break;
michael@0 4089 }
michael@0 4090
michael@0 4091 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
michael@0 4092 // must follow. It's operand is the stack location
michael@0 4093 // that holds the starting input index for the match of this [set]*
michael@0 4094 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
michael@0 4095 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
michael@0 4096 int32_t stackLoc = URX_VAL(loopcOp);
michael@0 4097 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
michael@0 4098 fp->fExtra[stackLoc] = fp->fInputIdx;
michael@0 4099 fp->fInputIdx = ix;
michael@0 4100
michael@0 4101 // Save State to the URX_LOOP_C op that follows this one,
michael@0 4102 // so that match failures in the following code will return to there.
michael@0 4103 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
michael@0 4104 fp = StateSave(fp, fp->fPatIdx, status);
michael@0 4105 fp->fPatIdx++;
michael@0 4106 }
michael@0 4107 break;
michael@0 4108
michael@0 4109
michael@0 4110 case URX_LOOP_DOT_I:
michael@0 4111 // Loop Initialization for the optimized implementation of .*
michael@0 4112 // This op scans through all remaining input.
michael@0 4113 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
michael@0 4114 {
michael@0 4115 // Loop through input until the input is exhausted (we reach an end-of-line)
michael@0 4116 // In DOTALL mode, we can just go straight to the end of the input.
michael@0 4117 int64_t ix;
michael@0 4118 if ((opValue & 1) == 1) {
michael@0 4119 // Dot-matches-All mode. Jump straight to the end of the string.
michael@0 4120 ix = fActiveLimit;
michael@0 4121 fHitEnd = TRUE;
michael@0 4122 } else {
michael@0 4123 // NOT DOT ALL mode. Line endings do not match '.'
michael@0 4124 // Scan forward until a line ending or end of input.
michael@0 4125 ix = fp->fInputIdx;
michael@0 4126 UTEXT_SETNATIVEINDEX(fInputText, ix);
michael@0 4127 for (;;) {
michael@0 4128 if (ix >= fActiveLimit) {
michael@0 4129 fHitEnd = TRUE;
michael@0 4130 break;
michael@0 4131 }
michael@0 4132 UChar32 c = UTEXT_NEXT32(fInputText);
michael@0 4133 if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s
michael@0 4134 if ((c == 0x0a) || // 0x0a is newline in both modes.
michael@0 4135 (((opValue & 2) == 0) && // IF not UNIX_LINES mode
michael@0 4136 (c<=0x0d && c>=0x0a)) || c==0x85 ||c==0x2028 || c==0x2029) {
michael@0 4137 // char is a line ending. Exit the scanning loop.
michael@0 4138 break;
michael@0 4139 }
michael@0 4140 }
michael@0 4141 ix = UTEXT_GETNATIVEINDEX(fInputText);
michael@0 4142 }
michael@0 4143 }
michael@0 4144
michael@0 4145 // If there were no matching characters, skip over the loop altogether.
michael@0 4146 // The loop doesn't run at all, a * op always succeeds.
michael@0 4147 if (ix == fp->fInputIdx) {
michael@0 4148 fp->fPatIdx++; // skip the URX_LOOP_C op.
michael@0 4149 break;
michael@0 4150 }
michael@0 4151
michael@0 4152 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
michael@0 4153 // must follow. It's operand is the stack location
michael@0 4154 // that holds the starting input index for the match of this .*
michael@0 4155 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
michael@0 4156 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
michael@0 4157 int32_t stackLoc = URX_VAL(loopcOp);
michael@0 4158 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
michael@0 4159 fp->fExtra[stackLoc] = fp->fInputIdx;
michael@0 4160 fp->fInputIdx = ix;
michael@0 4161
michael@0 4162 // Save State to the URX_LOOP_C op that follows this one,
michael@0 4163 // so that match failures in the following code will return to there.
michael@0 4164 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
michael@0 4165 fp = StateSave(fp, fp->fPatIdx, status);
michael@0 4166 fp->fPatIdx++;
michael@0 4167 }
michael@0 4168 break;
michael@0 4169
michael@0 4170
michael@0 4171 case URX_LOOP_C:
michael@0 4172 {
michael@0 4173 U_ASSERT(opValue>=0 && opValue<fFrameSize);
michael@0 4174 backSearchIndex = fp->fExtra[opValue];
michael@0 4175 U_ASSERT(backSearchIndex <= fp->fInputIdx);
michael@0 4176 if (backSearchIndex == fp->fInputIdx) {
michael@0 4177 // We've backed up the input idx to the point that the loop started.
michael@0 4178 // The loop is done. Leave here without saving state.
michael@0 4179 // Subsequent failures won't come back here.
michael@0 4180 break;
michael@0 4181 }
michael@0 4182 // Set up for the next iteration of the loop, with input index
michael@0 4183 // backed up by one from the last time through,
michael@0 4184 // and a state save to this instruction in case the following code fails again.
michael@0 4185 // (We're going backwards because this loop emulates stack unwinding, not
michael@0 4186 // the initial scan forward.)
michael@0 4187 U_ASSERT(fp->fInputIdx > 0);
michael@0 4188 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
michael@0 4189 UChar32 prevC = UTEXT_PREVIOUS32(fInputText);
michael@0 4190 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
michael@0 4191
michael@0 4192 UChar32 twoPrevC = UTEXT_PREVIOUS32(fInputText);
michael@0 4193 if (prevC == 0x0a &&
michael@0 4194 fp->fInputIdx > backSearchIndex &&
michael@0 4195 twoPrevC == 0x0d) {
michael@0 4196 int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
michael@0 4197 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
michael@0 4198 // .*, stepping back over CRLF pair.
michael@0 4199 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
michael@0 4200 }
michael@0 4201 }
michael@0 4202
michael@0 4203
michael@0 4204 fp = StateSave(fp, fp->fPatIdx-1, status);
michael@0 4205 }
michael@0 4206 break;
michael@0 4207
michael@0 4208
michael@0 4209
michael@0 4210 default:
michael@0 4211 // Trouble. The compiled pattern contains an entry with an
michael@0 4212 // unrecognized type tag.
michael@0 4213 U_ASSERT(FALSE);
michael@0 4214 }
michael@0 4215
michael@0 4216 if (U_FAILURE(status)) {
michael@0 4217 isMatch = FALSE;
michael@0 4218 break;
michael@0 4219 }
michael@0 4220 }
michael@0 4221
michael@0 4222 breakFromLoop:
michael@0 4223 fMatch = isMatch;
michael@0 4224 if (isMatch) {
michael@0 4225 fLastMatchEnd = fMatchEnd;
michael@0 4226 fMatchStart = startIdx;
michael@0 4227 fMatchEnd = fp->fInputIdx;
michael@0 4228 if (fTraceDebug) {
michael@0 4229 REGEX_RUN_DEBUG_PRINTF(("Match. start=%ld end=%ld\n\n", fMatchStart, fMatchEnd));
michael@0 4230 }
michael@0 4231 }
michael@0 4232 else
michael@0 4233 {
michael@0 4234 if (fTraceDebug) {
michael@0 4235 REGEX_RUN_DEBUG_PRINTF(("No match\n\n"));
michael@0 4236 }
michael@0 4237 }
michael@0 4238
michael@0 4239 fFrame = fp; // The active stack frame when the engine stopped.
michael@0 4240 // Contains the capture group results that we need to
michael@0 4241 // access later.
michael@0 4242 return;
michael@0 4243 }
michael@0 4244
michael@0 4245
michael@0 4246 //--------------------------------------------------------------------------------
michael@0 4247 //
michael@0 4248 // MatchChunkAt This is the actual matching engine. Like MatchAt, but with the
michael@0 4249 // assumption that the entire string is available in the UText's
michael@0 4250 // chunk buffer. For now, that means we can use int32_t indexes,
michael@0 4251 // except for anything that needs to be saved (like group starts
michael@0 4252 // and ends).
michael@0 4253 //
michael@0 4254 // startIdx: begin matching at this index.
michael@0 4255 // toEnd: if true, match must extend to end of the input region
michael@0 4256 //
michael@0 4257 //--------------------------------------------------------------------------------
michael@0 4258 void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
michael@0 4259 UBool isMatch = FALSE; // True if the we have a match.
michael@0 4260
michael@0 4261 int32_t backSearchIndex = INT32_MAX; // used after greedy single-character matches for searching backwards
michael@0 4262
michael@0 4263 int32_t op; // Operation from the compiled pattern, split into
michael@0 4264 int32_t opType; // the opcode
michael@0 4265 int32_t opValue; // and the operand value.
michael@0 4266
michael@0 4267 #ifdef REGEX_RUN_DEBUG
michael@0 4268 if (fTraceDebug)
michael@0 4269 {
michael@0 4270 printf("MatchAt(startIdx=%d)\n", startIdx);
michael@0 4271 printf("Original Pattern: ");
michael@0 4272 UChar32 c = utext_next32From(fPattern->fPattern, 0);
michael@0 4273 while (c != U_SENTINEL) {
michael@0 4274 if (c<32 || c>256) {
michael@0 4275 c = '.';
michael@0 4276 }
michael@0 4277 REGEX_DUMP_DEBUG_PRINTF(("%c", c));
michael@0 4278
michael@0 4279 c = UTEXT_NEXT32(fPattern->fPattern);
michael@0 4280 }
michael@0 4281 printf("\n");
michael@0 4282 printf("Input String: ");
michael@0 4283 c = utext_next32From(fInputText, 0);
michael@0 4284 while (c != U_SENTINEL) {
michael@0 4285 if (c<32 || c>256) {
michael@0 4286 c = '.';
michael@0 4287 }
michael@0 4288 printf("%c", c);
michael@0 4289
michael@0 4290 c = UTEXT_NEXT32(fInputText);
michael@0 4291 }
michael@0 4292 printf("\n");
michael@0 4293 printf("\n");
michael@0 4294 }
michael@0 4295 #endif
michael@0 4296
michael@0 4297 if (U_FAILURE(status)) {
michael@0 4298 return;
michael@0 4299 }
michael@0 4300
michael@0 4301 // Cache frequently referenced items from the compiled pattern
michael@0 4302 //
michael@0 4303 int64_t *pat = fPattern->fCompiledPat->getBuffer();
michael@0 4304
michael@0 4305 const UChar *litText = fPattern->fLiteralText.getBuffer();
michael@0 4306 UVector *sets = fPattern->fSets;
michael@0 4307
michael@0 4308 const UChar *inputBuf = fInputText->chunkContents;
michael@0 4309
michael@0 4310 fFrameSize = fPattern->fFrameSize;
michael@0 4311 REStackFrame *fp = resetStack();
michael@0 4312
michael@0 4313 fp->fPatIdx = 0;
michael@0 4314 fp->fInputIdx = startIdx;
michael@0 4315
michael@0 4316 // Zero out the pattern's static data
michael@0 4317 int32_t i;
michael@0 4318 for (i = 0; i<fPattern->fDataSize; i++) {
michael@0 4319 fData[i] = 0;
michael@0 4320 }
michael@0 4321
michael@0 4322 //
michael@0 4323 // Main loop for interpreting the compiled pattern.
michael@0 4324 // One iteration of the loop per pattern operation performed.
michael@0 4325 //
michael@0 4326 for (;;) {
michael@0 4327 #if 0
michael@0 4328 if (_heapchk() != _HEAPOK) {
michael@0 4329 fprintf(stderr, "Heap Trouble\n");
michael@0 4330 }
michael@0 4331 #endif
michael@0 4332
michael@0 4333 op = (int32_t)pat[fp->fPatIdx];
michael@0 4334 opType = URX_TYPE(op);
michael@0 4335 opValue = URX_VAL(op);
michael@0 4336 #ifdef REGEX_RUN_DEBUG
michael@0 4337 if (fTraceDebug) {
michael@0 4338 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
michael@0 4339 printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp->fInputIdx,
michael@0 4340 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
michael@0 4341 fPattern->dumpOp(fp->fPatIdx);
michael@0 4342 }
michael@0 4343 #endif
michael@0 4344 fp->fPatIdx++;
michael@0 4345
michael@0 4346 switch (opType) {
michael@0 4347
michael@0 4348
michael@0 4349 case URX_NOP:
michael@0 4350 break;
michael@0 4351
michael@0 4352
michael@0 4353 case URX_BACKTRACK:
michael@0 4354 // Force a backtrack. In some circumstances, the pattern compiler
michael@0 4355 // will notice that the pattern can't possibly match anything, and will
michael@0 4356 // emit one of these at that point.
michael@0 4357 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 4358 break;
michael@0 4359
michael@0 4360
michael@0 4361 case URX_ONECHAR:
michael@0 4362 if (fp->fInputIdx < fActiveLimit) {
michael@0 4363 UChar32 c;
michael@0 4364 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
michael@0 4365 if (c == opValue) {
michael@0 4366 break;
michael@0 4367 }
michael@0 4368 } else {
michael@0 4369 fHitEnd = TRUE;
michael@0 4370 }
michael@0 4371 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 4372 break;
michael@0 4373
michael@0 4374
michael@0 4375 case URX_STRING:
michael@0 4376 {
michael@0 4377 // Test input against a literal string.
michael@0 4378 // Strings require two slots in the compiled pattern, one for the
michael@0 4379 // offset to the string text, and one for the length.
michael@0 4380 int32_t stringStartIdx = opValue;
michael@0 4381 int32_t stringLen;
michael@0 4382
michael@0 4383 op = (int32_t)pat[fp->fPatIdx]; // Fetch the second operand
michael@0 4384 fp->fPatIdx++;
michael@0 4385 opType = URX_TYPE(op);
michael@0 4386 stringLen = URX_VAL(op);
michael@0 4387 U_ASSERT(opType == URX_STRING_LEN);
michael@0 4388 U_ASSERT(stringLen >= 2);
michael@0 4389
michael@0 4390 const UChar * pInp = inputBuf + fp->fInputIdx;
michael@0 4391 const UChar * pInpLimit = inputBuf + fActiveLimit;
michael@0 4392 const UChar * pPat = litText+stringStartIdx;
michael@0 4393 const UChar * pEnd = pInp + stringLen;
michael@0 4394 UBool success = TRUE;
michael@0 4395 while (pInp < pEnd) {
michael@0 4396 if (pInp >= pInpLimit) {
michael@0 4397 fHitEnd = TRUE;
michael@0 4398 success = FALSE;
michael@0 4399 break;
michael@0 4400 }
michael@0 4401 if (*pInp++ != *pPat++) {
michael@0 4402 success = FALSE;
michael@0 4403 break;
michael@0 4404 }
michael@0 4405 }
michael@0 4406
michael@0 4407 if (success) {
michael@0 4408 fp->fInputIdx += stringLen;
michael@0 4409 } else {
michael@0 4410 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 4411 }
michael@0 4412 }
michael@0 4413 break;
michael@0 4414
michael@0 4415
michael@0 4416 case URX_STATE_SAVE:
michael@0 4417 fp = StateSave(fp, opValue, status);
michael@0 4418 break;
michael@0 4419
michael@0 4420
michael@0 4421 case URX_END:
michael@0 4422 // The match loop will exit via this path on a successful match,
michael@0 4423 // when we reach the end of the pattern.
michael@0 4424 if (toEnd && fp->fInputIdx != fActiveLimit) {
michael@0 4425 // The pattern matched, but not to the end of input. Try some more.
michael@0 4426 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 4427 break;
michael@0 4428 }
michael@0 4429 isMatch = TRUE;
michael@0 4430 goto breakFromLoop;
michael@0 4431
michael@0 4432 // Start and End Capture stack frame variables are laid out out like this:
michael@0 4433 // fp->fExtra[opValue] - The start of a completed capture group
michael@0 4434 // opValue+1 - The end of a completed capture group
michael@0 4435 // opValue+2 - the start of a capture group whose end
michael@0 4436 // has not yet been reached (and might not ever be).
michael@0 4437 case URX_START_CAPTURE:
michael@0 4438 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
michael@0 4439 fp->fExtra[opValue+2] = fp->fInputIdx;
michael@0 4440 break;
michael@0 4441
michael@0 4442
michael@0 4443 case URX_END_CAPTURE:
michael@0 4444 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
michael@0 4445 U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set.
michael@0 4446 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real.
michael@0 4447 fp->fExtra[opValue+1] = fp->fInputIdx; // End position
michael@0 4448 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
michael@0 4449 break;
michael@0 4450
michael@0 4451
michael@0 4452 case URX_DOLLAR: // $, test for End of line
michael@0 4453 // or for position before new line at end of input
michael@0 4454 if (fp->fInputIdx < fAnchorLimit-2) {
michael@0 4455 // We are no where near the end of input. Fail.
michael@0 4456 // This is the common case. Keep it first.
michael@0 4457 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 4458 break;
michael@0 4459 }
michael@0 4460 if (fp->fInputIdx >= fAnchorLimit) {
michael@0 4461 // We really are at the end of input. Success.
michael@0 4462 fHitEnd = TRUE;
michael@0 4463 fRequireEnd = TRUE;
michael@0 4464 break;
michael@0 4465 }
michael@0 4466
michael@0 4467 // If we are positioned just before a new-line that is located at the
michael@0 4468 // end of input, succeed.
michael@0 4469 if (fp->fInputIdx == fAnchorLimit-1) {
michael@0 4470 UChar32 c;
michael@0 4471 U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c);
michael@0 4472
michael@0 4473 if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) {
michael@0 4474 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
michael@0 4475 // At new-line at end of input. Success
michael@0 4476 fHitEnd = TRUE;
michael@0 4477 fRequireEnd = TRUE;
michael@0 4478 break;
michael@0 4479 }
michael@0 4480 }
michael@0 4481 } else if (fp->fInputIdx == fAnchorLimit-2 &&
michael@0 4482 inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a) {
michael@0 4483 fHitEnd = TRUE;
michael@0 4484 fRequireEnd = TRUE;
michael@0 4485 break; // At CR/LF at end of input. Success
michael@0 4486 }
michael@0 4487
michael@0 4488 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 4489
michael@0 4490 break;
michael@0 4491
michael@0 4492
michael@0 4493 case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode.
michael@0 4494 if (fp->fInputIdx >= fAnchorLimit-1) {
michael@0 4495 // Either at the last character of input, or off the end.
michael@0 4496 if (fp->fInputIdx == fAnchorLimit-1) {
michael@0 4497 // At last char of input. Success if it's a new line.
michael@0 4498 if (inputBuf[fp->fInputIdx] == 0x0a) {
michael@0 4499 fHitEnd = TRUE;
michael@0 4500 fRequireEnd = TRUE;
michael@0 4501 break;
michael@0 4502 }
michael@0 4503 } else {
michael@0 4504 // Off the end of input. Success.
michael@0 4505 fHitEnd = TRUE;
michael@0 4506 fRequireEnd = TRUE;
michael@0 4507 break;
michael@0 4508 }
michael@0 4509 }
michael@0 4510
michael@0 4511 // Not at end of input. Back-track out.
michael@0 4512 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 4513 break;
michael@0 4514
michael@0 4515
michael@0 4516 case URX_DOLLAR_M: // $, test for End of line in multi-line mode
michael@0 4517 {
michael@0 4518 if (fp->fInputIdx >= fAnchorLimit) {
michael@0 4519 // We really are at the end of input. Success.
michael@0 4520 fHitEnd = TRUE;
michael@0 4521 fRequireEnd = TRUE;
michael@0 4522 break;
michael@0 4523 }
michael@0 4524 // If we are positioned just before a new-line, succeed.
michael@0 4525 // It makes no difference where the new-line is within the input.
michael@0 4526 UChar32 c = inputBuf[fp->fInputIdx];
michael@0 4527 if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) {
michael@0 4528 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence
michael@0 4529 // In multi-line mode, hitting a new-line just before the end of input does not
michael@0 4530 // set the hitEnd or requireEnd flags
michael@0 4531 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
michael@0 4532 break;
michael@0 4533 }
michael@0 4534 }
michael@0 4535 // not at a new line. Fail.
michael@0 4536 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 4537 }
michael@0 4538 break;
michael@0 4539
michael@0 4540
michael@0 4541 case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode
michael@0 4542 {
michael@0 4543 if (fp->fInputIdx >= fAnchorLimit) {
michael@0 4544 // We really are at the end of input. Success.
michael@0 4545 fHitEnd = TRUE;
michael@0 4546 fRequireEnd = TRUE; // Java set requireEnd in this case, even though
michael@0 4547 break; // adding a new-line would not lose the match.
michael@0 4548 }
michael@0 4549 // If we are not positioned just before a new-line, the test fails; backtrack out.
michael@0 4550 // It makes no difference where the new-line is within the input.
michael@0 4551 if (inputBuf[fp->fInputIdx] != 0x0a) {
michael@0 4552 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 4553 }
michael@0 4554 }
michael@0 4555 break;
michael@0 4556
michael@0 4557
michael@0 4558 case URX_CARET: // ^, test for start of line
michael@0 4559 if (fp->fInputIdx != fAnchorStart) {
michael@0 4560 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 4561 }
michael@0 4562 break;
michael@0 4563
michael@0 4564
michael@0 4565 case URX_CARET_M: // ^, test for start of line in mulit-line mode
michael@0 4566 {
michael@0 4567 if (fp->fInputIdx == fAnchorStart) {
michael@0 4568 // We are at the start input. Success.
michael@0 4569 break;
michael@0 4570 }
michael@0 4571 // Check whether character just before the current pos is a new-line
michael@0 4572 // unless we are at the end of input
michael@0 4573 UChar c = inputBuf[fp->fInputIdx - 1];
michael@0 4574 if ((fp->fInputIdx < fAnchorLimit) &&
michael@0 4575 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
michael@0 4576 // It's a new-line. ^ is true. Success.
michael@0 4577 // TODO: what should be done with positions between a CR and LF?
michael@0 4578 break;
michael@0 4579 }
michael@0 4580 // Not at the start of a line. Fail.
michael@0 4581 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 4582 }
michael@0 4583 break;
michael@0 4584
michael@0 4585
michael@0 4586 case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode
michael@0 4587 {
michael@0 4588 U_ASSERT(fp->fInputIdx >= fAnchorStart);
michael@0 4589 if (fp->fInputIdx <= fAnchorStart) {
michael@0 4590 // We are at the start input. Success.
michael@0 4591 break;
michael@0 4592 }
michael@0 4593 // Check whether character just before the current pos is a new-line
michael@0 4594 U_ASSERT(fp->fInputIdx <= fAnchorLimit);
michael@0 4595 UChar c = inputBuf[fp->fInputIdx - 1];
michael@0 4596 if (c != 0x0a) {
michael@0 4597 // Not at the start of a line. Back-track out.
michael@0 4598 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 4599 }
michael@0 4600 }
michael@0 4601 break;
michael@0 4602
michael@0 4603 case URX_BACKSLASH_B: // Test for word boundaries
michael@0 4604 {
michael@0 4605 UBool success = isChunkWordBoundary((int32_t)fp->fInputIdx);
michael@0 4606 success ^= (UBool)(opValue != 0); // flip sense for \B
michael@0 4607 if (!success) {
michael@0 4608 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 4609 }
michael@0 4610 }
michael@0 4611 break;
michael@0 4612
michael@0 4613
michael@0 4614 case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style
michael@0 4615 {
michael@0 4616 UBool success = isUWordBoundary(fp->fInputIdx);
michael@0 4617 success ^= (UBool)(opValue != 0); // flip sense for \B
michael@0 4618 if (!success) {
michael@0 4619 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 4620 }
michael@0 4621 }
michael@0 4622 break;
michael@0 4623
michael@0 4624
michael@0 4625 case URX_BACKSLASH_D: // Test for decimal digit
michael@0 4626 {
michael@0 4627 if (fp->fInputIdx >= fActiveLimit) {
michael@0 4628 fHitEnd = TRUE;
michael@0 4629 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 4630 break;
michael@0 4631 }
michael@0 4632
michael@0 4633 UChar32 c;
michael@0 4634 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
michael@0 4635 int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster.
michael@0 4636 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
michael@0 4637 success ^= (UBool)(opValue != 0); // flip sense for \D
michael@0 4638 if (!success) {
michael@0 4639 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 4640 }
michael@0 4641 }
michael@0 4642 break;
michael@0 4643
michael@0 4644
michael@0 4645 case URX_BACKSLASH_G: // Test for position at end of previous match
michael@0 4646 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
michael@0 4647 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 4648 }
michael@0 4649 break;
michael@0 4650
michael@0 4651
michael@0 4652 case URX_BACKSLASH_X:
michael@0 4653 // Match a Grapheme, as defined by Unicode TR 29.
michael@0 4654 // Differs slightly from Perl, which consumes combining marks independently
michael@0 4655 // of context.
michael@0 4656 {
michael@0 4657
michael@0 4658 // Fail if at end of input
michael@0 4659 if (fp->fInputIdx >= fActiveLimit) {
michael@0 4660 fHitEnd = TRUE;
michael@0 4661 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 4662 break;
michael@0 4663 }
michael@0 4664
michael@0 4665 // Examine (and consume) the current char.
michael@0 4666 // Dispatch into a little state machine, based on the char.
michael@0 4667 UChar32 c;
michael@0 4668 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
michael@0 4669 UnicodeSet **sets = fPattern->fStaticSets;
michael@0 4670 if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend;
michael@0 4671 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
michael@0 4672 if (sets[URX_GC_L]->contains(c)) goto GC_L;
michael@0 4673 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
michael@0 4674 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
michael@0 4675 if (sets[URX_GC_V]->contains(c)) goto GC_V;
michael@0 4676 if (sets[URX_GC_T]->contains(c)) goto GC_T;
michael@0 4677 goto GC_Extend;
michael@0 4678
michael@0 4679
michael@0 4680
michael@0 4681 GC_L:
michael@0 4682 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
michael@0 4683 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
michael@0 4684 if (sets[URX_GC_L]->contains(c)) goto GC_L;
michael@0 4685 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
michael@0 4686 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
michael@0 4687 if (sets[URX_GC_V]->contains(c)) goto GC_V;
michael@0 4688 U16_PREV(inputBuf, 0, fp->fInputIdx, c);
michael@0 4689 goto GC_Extend;
michael@0 4690
michael@0 4691 GC_V:
michael@0 4692 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
michael@0 4693 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
michael@0 4694 if (sets[URX_GC_V]->contains(c)) goto GC_V;
michael@0 4695 if (sets[URX_GC_T]->contains(c)) goto GC_T;
michael@0 4696 U16_PREV(inputBuf, 0, fp->fInputIdx, c);
michael@0 4697 goto GC_Extend;
michael@0 4698
michael@0 4699 GC_T:
michael@0 4700 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
michael@0 4701 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
michael@0 4702 if (sets[URX_GC_T]->contains(c)) goto GC_T;
michael@0 4703 U16_PREV(inputBuf, 0, fp->fInputIdx, c);
michael@0 4704 goto GC_Extend;
michael@0 4705
michael@0 4706 GC_Extend:
michael@0 4707 // Combining characters are consumed here
michael@0 4708 for (;;) {
michael@0 4709 if (fp->fInputIdx >= fActiveLimit) {
michael@0 4710 break;
michael@0 4711 }
michael@0 4712 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
michael@0 4713 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
michael@0 4714 U16_BACK_1(inputBuf, 0, fp->fInputIdx);
michael@0 4715 break;
michael@0 4716 }
michael@0 4717 }
michael@0 4718 goto GC_Done;
michael@0 4719
michael@0 4720 GC_Control:
michael@0 4721 // Most control chars stand alone (don't combine with combining chars),
michael@0 4722 // except that a CR/LF sequence is a single grapheme cluster.
michael@0 4723 if (c == 0x0d && fp->fInputIdx < fActiveLimit && inputBuf[fp->fInputIdx] == 0x0a) {
michael@0 4724 fp->fInputIdx++;
michael@0 4725 }
michael@0 4726
michael@0 4727 GC_Done:
michael@0 4728 if (fp->fInputIdx >= fActiveLimit) {
michael@0 4729 fHitEnd = TRUE;
michael@0 4730 }
michael@0 4731 break;
michael@0 4732 }
michael@0 4733
michael@0 4734
michael@0 4735
michael@0 4736
michael@0 4737 case URX_BACKSLASH_Z: // Test for end of Input
michael@0 4738 if (fp->fInputIdx < fAnchorLimit) {
michael@0 4739 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 4740 } else {
michael@0 4741 fHitEnd = TRUE;
michael@0 4742 fRequireEnd = TRUE;
michael@0 4743 }
michael@0 4744 break;
michael@0 4745
michael@0 4746
michael@0 4747
michael@0 4748 case URX_STATIC_SETREF:
michael@0 4749 {
michael@0 4750 // Test input character against one of the predefined sets
michael@0 4751 // (Word Characters, for example)
michael@0 4752 // The high bit of the op value is a flag for the match polarity.
michael@0 4753 // 0: success if input char is in set.
michael@0 4754 // 1: success if input char is not in set.
michael@0 4755 if (fp->fInputIdx >= fActiveLimit) {
michael@0 4756 fHitEnd = TRUE;
michael@0 4757 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 4758 break;
michael@0 4759 }
michael@0 4760
michael@0 4761 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
michael@0 4762 opValue &= ~URX_NEG_SET;
michael@0 4763 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
michael@0 4764
michael@0 4765 UChar32 c;
michael@0 4766 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
michael@0 4767 if (c < 256) {
michael@0 4768 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
michael@0 4769 if (s8->contains(c)) {
michael@0 4770 success = !success;
michael@0 4771 }
michael@0 4772 } else {
michael@0 4773 const UnicodeSet *s = fPattern->fStaticSets[opValue];
michael@0 4774 if (s->contains(c)) {
michael@0 4775 success = !success;
michael@0 4776 }
michael@0 4777 }
michael@0 4778 if (!success) {
michael@0 4779 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 4780 }
michael@0 4781 }
michael@0 4782 break;
michael@0 4783
michael@0 4784
michael@0 4785 case URX_STAT_SETREF_N:
michael@0 4786 {
michael@0 4787 // Test input character for NOT being a member of one of
michael@0 4788 // the predefined sets (Word Characters, for example)
michael@0 4789 if (fp->fInputIdx >= fActiveLimit) {
michael@0 4790 fHitEnd = TRUE;
michael@0 4791 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 4792 break;
michael@0 4793 }
michael@0 4794
michael@0 4795 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
michael@0 4796
michael@0 4797 UChar32 c;
michael@0 4798 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
michael@0 4799 if (c < 256) {
michael@0 4800 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
michael@0 4801 if (s8->contains(c) == FALSE) {
michael@0 4802 break;
michael@0 4803 }
michael@0 4804 } else {
michael@0 4805 const UnicodeSet *s = fPattern->fStaticSets[opValue];
michael@0 4806 if (s->contains(c) == FALSE) {
michael@0 4807 break;
michael@0 4808 }
michael@0 4809 }
michael@0 4810 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 4811 }
michael@0 4812 break;
michael@0 4813
michael@0 4814
michael@0 4815 case URX_SETREF:
michael@0 4816 {
michael@0 4817 if (fp->fInputIdx >= fActiveLimit) {
michael@0 4818 fHitEnd = TRUE;
michael@0 4819 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 4820 break;
michael@0 4821 }
michael@0 4822
michael@0 4823 U_ASSERT(opValue > 0 && opValue < sets->size());
michael@0 4824
michael@0 4825 // There is input left. Pick up one char and test it for set membership.
michael@0 4826 UChar32 c;
michael@0 4827 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
michael@0 4828 if (c<256) {
michael@0 4829 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
michael@0 4830 if (s8->contains(c)) {
michael@0 4831 // The character is in the set. A Match.
michael@0 4832 break;
michael@0 4833 }
michael@0 4834 } else {
michael@0 4835 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
michael@0 4836 if (s->contains(c)) {
michael@0 4837 // The character is in the set. A Match.
michael@0 4838 break;
michael@0 4839 }
michael@0 4840 }
michael@0 4841
michael@0 4842 // the character wasn't in the set.
michael@0 4843 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 4844 }
michael@0 4845 break;
michael@0 4846
michael@0 4847
michael@0 4848 case URX_DOTANY:
michael@0 4849 {
michael@0 4850 // . matches anything, but stops at end-of-line.
michael@0 4851 if (fp->fInputIdx >= fActiveLimit) {
michael@0 4852 // At end of input. Match failed. Backtrack out.
michael@0 4853 fHitEnd = TRUE;
michael@0 4854 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 4855 break;
michael@0 4856 }
michael@0 4857
michael@0 4858 // There is input left. Advance over one char, unless we've hit end-of-line
michael@0 4859 UChar32 c;
michael@0 4860 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
michael@0 4861 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
michael@0 4862 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
michael@0 4863 // End of line in normal mode. . does not match.
michael@0 4864 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 4865 break;
michael@0 4866 }
michael@0 4867 }
michael@0 4868 break;
michael@0 4869
michael@0 4870
michael@0 4871 case URX_DOTANY_ALL:
michael@0 4872 {
michael@0 4873 // . in dot-matches-all (including new lines) mode
michael@0 4874 if (fp->fInputIdx >= fActiveLimit) {
michael@0 4875 // At end of input. Match failed. Backtrack out.
michael@0 4876 fHitEnd = TRUE;
michael@0 4877 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 4878 break;
michael@0 4879 }
michael@0 4880
michael@0 4881 // There is input left. Advance over one char, except if we are
michael@0 4882 // at a cr/lf, advance over both of them.
michael@0 4883 UChar32 c;
michael@0 4884 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
michael@0 4885 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
michael@0 4886 // In the case of a CR/LF, we need to advance over both.
michael@0 4887 if (inputBuf[fp->fInputIdx] == 0x0a) {
michael@0 4888 U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit);
michael@0 4889 }
michael@0 4890 }
michael@0 4891 }
michael@0 4892 break;
michael@0 4893
michael@0 4894
michael@0 4895 case URX_DOTANY_UNIX:
michael@0 4896 {
michael@0 4897 // '.' operator, matches all, but stops at end-of-line.
michael@0 4898 // UNIX_LINES mode, so 0x0a is the only recognized line ending.
michael@0 4899 if (fp->fInputIdx >= fActiveLimit) {
michael@0 4900 // At end of input. Match failed. Backtrack out.
michael@0 4901 fHitEnd = TRUE;
michael@0 4902 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 4903 break;
michael@0 4904 }
michael@0 4905
michael@0 4906 // There is input left. Advance over one char, unless we've hit end-of-line
michael@0 4907 UChar32 c;
michael@0 4908 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
michael@0 4909 if (c == 0x0a) {
michael@0 4910 // End of line in UNIX_LINES mode. '.' does not match the \n
michael@0 4911 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 4912 }
michael@0 4913 }
michael@0 4914 break;
michael@0 4915
michael@0 4916
michael@0 4917 case URX_JMP:
michael@0 4918 fp->fPatIdx = opValue;
michael@0 4919 break;
michael@0 4920
michael@0 4921 case URX_FAIL:
michael@0 4922 isMatch = FALSE;
michael@0 4923 goto breakFromLoop;
michael@0 4924
michael@0 4925 case URX_JMP_SAV:
michael@0 4926 U_ASSERT(opValue < fPattern->fCompiledPat->size());
michael@0 4927 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
michael@0 4928 fp->fPatIdx = opValue; // Then JMP.
michael@0 4929 break;
michael@0 4930
michael@0 4931 case URX_JMP_SAV_X:
michael@0 4932 // This opcode is used with (x)+, when x can match a zero length string.
michael@0 4933 // Same as JMP_SAV, except conditional on the match having made forward progress.
michael@0 4934 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
michael@0 4935 // data address of the input position at the start of the loop.
michael@0 4936 {
michael@0 4937 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
michael@0 4938 int32_t stoOp = (int32_t)pat[opValue-1];
michael@0 4939 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
michael@0 4940 int32_t frameLoc = URX_VAL(stoOp);
michael@0 4941 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
michael@0 4942 int32_t prevInputIdx = (int32_t)fp->fExtra[frameLoc];
michael@0 4943 U_ASSERT(prevInputIdx <= fp->fInputIdx);
michael@0 4944 if (prevInputIdx < fp->fInputIdx) {
michael@0 4945 // The match did make progress. Repeat the loop.
michael@0 4946 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
michael@0 4947 fp->fPatIdx = opValue;
michael@0 4948 fp->fExtra[frameLoc] = fp->fInputIdx;
michael@0 4949 }
michael@0 4950 // If the input position did not advance, we do nothing here,
michael@0 4951 // execution will fall out of the loop.
michael@0 4952 }
michael@0 4953 break;
michael@0 4954
michael@0 4955 case URX_CTR_INIT:
michael@0 4956 {
michael@0 4957 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
michael@0 4958 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
michael@0 4959
michael@0 4960 // Pick up the three extra operands that CTR_INIT has, and
michael@0 4961 // skip the pattern location counter past
michael@0 4962 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
michael@0 4963 fp->fPatIdx += 3;
michael@0 4964 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
michael@0 4965 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
michael@0 4966 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
michael@0 4967 U_ASSERT(minCount>=0);
michael@0 4968 U_ASSERT(maxCount>=minCount || maxCount==-1);
michael@0 4969 U_ASSERT(loopLoc>=fp->fPatIdx);
michael@0 4970
michael@0 4971 if (minCount == 0) {
michael@0 4972 fp = StateSave(fp, loopLoc+1, status);
michael@0 4973 }
michael@0 4974 if (maxCount == -1) {
michael@0 4975 fp->fExtra[opValue+1] = fp->fInputIdx; // For loop breaking.
michael@0 4976 } else if (maxCount == 0) {
michael@0 4977 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 4978 }
michael@0 4979 }
michael@0 4980 break;
michael@0 4981
michael@0 4982 case URX_CTR_LOOP:
michael@0 4983 {
michael@0 4984 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
michael@0 4985 int32_t initOp = (int32_t)pat[opValue];
michael@0 4986 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
michael@0 4987 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
michael@0 4988 int32_t minCount = (int32_t)pat[opValue+2];
michael@0 4989 int32_t maxCount = (int32_t)pat[opValue+3];
michael@0 4990 (*pCounter)++;
michael@0 4991 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
michael@0 4992 U_ASSERT(*pCounter == maxCount);
michael@0 4993 break;
michael@0 4994 }
michael@0 4995 if (*pCounter >= minCount) {
michael@0 4996 if (maxCount == -1) {
michael@0 4997 // Loop has no hard upper bound.
michael@0 4998 // Check that it is progressing through the input, break if it is not.
michael@0 4999 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1];
michael@0 5000 if (fp->fInputIdx == *pLastInputIdx) {
michael@0 5001 break;
michael@0 5002 } else {
michael@0 5003 *pLastInputIdx = fp->fInputIdx;
michael@0 5004 }
michael@0 5005 }
michael@0 5006 fp = StateSave(fp, fp->fPatIdx, status);
michael@0 5007 }
michael@0 5008 fp->fPatIdx = opValue + 4; // Loop back.
michael@0 5009 }
michael@0 5010 break;
michael@0 5011
michael@0 5012 case URX_CTR_INIT_NG:
michael@0 5013 {
michael@0 5014 // Initialize a non-greedy loop
michael@0 5015 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
michael@0 5016 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
michael@0 5017
michael@0 5018 // Pick up the three extra operands that CTR_INIT_NG has, and
michael@0 5019 // skip the pattern location counter past
michael@0 5020 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
michael@0 5021 fp->fPatIdx += 3;
michael@0 5022 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
michael@0 5023 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
michael@0 5024 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
michael@0 5025 U_ASSERT(minCount>=0);
michael@0 5026 U_ASSERT(maxCount>=minCount || maxCount==-1);
michael@0 5027 U_ASSERT(loopLoc>fp->fPatIdx);
michael@0 5028 if (maxCount == -1) {
michael@0 5029 fp->fExtra[opValue+1] = fp->fInputIdx; // Save initial input index for loop breaking.
michael@0 5030 }
michael@0 5031
michael@0 5032 if (minCount == 0) {
michael@0 5033 if (maxCount != 0) {
michael@0 5034 fp = StateSave(fp, fp->fPatIdx, status);
michael@0 5035 }
michael@0 5036 fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block
michael@0 5037 }
michael@0 5038 }
michael@0 5039 break;
michael@0 5040
michael@0 5041 case URX_CTR_LOOP_NG:
michael@0 5042 {
michael@0 5043 // Non-greedy {min, max} loops
michael@0 5044 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
michael@0 5045 int32_t initOp = (int32_t)pat[opValue];
michael@0 5046 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
michael@0 5047 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
michael@0 5048 int32_t minCount = (int32_t)pat[opValue+2];
michael@0 5049 int32_t maxCount = (int32_t)pat[opValue+3];
michael@0 5050
michael@0 5051 (*pCounter)++;
michael@0 5052 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
michael@0 5053 // The loop has matched the maximum permitted number of times.
michael@0 5054 // Break out of here with no action. Matching will
michael@0 5055 // continue with the following pattern.
michael@0 5056 U_ASSERT(*pCounter == maxCount);
michael@0 5057 break;
michael@0 5058 }
michael@0 5059
michael@0 5060 if (*pCounter < minCount) {
michael@0 5061 // We haven't met the minimum number of matches yet.
michael@0 5062 // Loop back for another one.
michael@0 5063 fp->fPatIdx = opValue + 4; // Loop back.
michael@0 5064 } else {
michael@0 5065 // We do have the minimum number of matches.
michael@0 5066
michael@0 5067 // If there is no upper bound on the loop iterations, check that the input index
michael@0 5068 // is progressing, and stop the loop if it is not.
michael@0 5069 if (maxCount == -1) {
michael@0 5070 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1];
michael@0 5071 if (fp->fInputIdx == *pLastInputIdx) {
michael@0 5072 break;
michael@0 5073 }
michael@0 5074 *pLastInputIdx = fp->fInputIdx;
michael@0 5075 }
michael@0 5076
michael@0 5077 // Loop Continuation: we will fall into the pattern following the loop
michael@0 5078 // (non-greedy, don't execute loop body first), but first do
michael@0 5079 // a state save to the top of the loop, so that a match failure
michael@0 5080 // in the following pattern will try another iteration of the loop.
michael@0 5081 fp = StateSave(fp, opValue + 4, status);
michael@0 5082 }
michael@0 5083 }
michael@0 5084 break;
michael@0 5085
michael@0 5086 case URX_STO_SP:
michael@0 5087 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
michael@0 5088 fData[opValue] = fStack->size();
michael@0 5089 break;
michael@0 5090
michael@0 5091 case URX_LD_SP:
michael@0 5092 {
michael@0 5093 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
michael@0 5094 int32_t newStackSize = (int32_t)fData[opValue];
michael@0 5095 U_ASSERT(newStackSize <= fStack->size());
michael@0 5096 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
michael@0 5097 if (newFP == (int64_t *)fp) {
michael@0 5098 break;
michael@0 5099 }
michael@0 5100 int32_t i;
michael@0 5101 for (i=0; i<fFrameSize; i++) {
michael@0 5102 newFP[i] = ((int64_t *)fp)[i];
michael@0 5103 }
michael@0 5104 fp = (REStackFrame *)newFP;
michael@0 5105 fStack->setSize(newStackSize);
michael@0 5106 }
michael@0 5107 break;
michael@0 5108
michael@0 5109 case URX_BACKREF:
michael@0 5110 {
michael@0 5111 U_ASSERT(opValue < fFrameSize);
michael@0 5112 int64_t groupStartIdx = fp->fExtra[opValue];
michael@0 5113 int64_t groupEndIdx = fp->fExtra[opValue+1];
michael@0 5114 U_ASSERT(groupStartIdx <= groupEndIdx);
michael@0 5115 int64_t inputIndex = fp->fInputIdx;
michael@0 5116 if (groupStartIdx < 0) {
michael@0 5117 // This capture group has not participated in the match thus far,
michael@0 5118 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
michael@0 5119 break;
michael@0 5120 }
michael@0 5121 UBool success = TRUE;
michael@0 5122 for (int64_t groupIndex = groupStartIdx; groupIndex < groupEndIdx; ++groupIndex,++inputIndex) {
michael@0 5123 if (inputIndex >= fActiveLimit) {
michael@0 5124 success = FALSE;
michael@0 5125 fHitEnd = TRUE;
michael@0 5126 break;
michael@0 5127 }
michael@0 5128 if (inputBuf[groupIndex] != inputBuf[inputIndex]) {
michael@0 5129 success = FALSE;
michael@0 5130 break;
michael@0 5131 }
michael@0 5132 }
michael@0 5133 if (success) {
michael@0 5134 fp->fInputIdx = inputIndex;
michael@0 5135 } else {
michael@0 5136 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 5137 }
michael@0 5138 }
michael@0 5139 break;
michael@0 5140
michael@0 5141 case URX_BACKREF_I:
michael@0 5142 {
michael@0 5143 U_ASSERT(opValue < fFrameSize);
michael@0 5144 int64_t groupStartIdx = fp->fExtra[opValue];
michael@0 5145 int64_t groupEndIdx = fp->fExtra[opValue+1];
michael@0 5146 U_ASSERT(groupStartIdx <= groupEndIdx);
michael@0 5147 if (groupStartIdx < 0) {
michael@0 5148 // This capture group has not participated in the match thus far,
michael@0 5149 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
michael@0 5150 break;
michael@0 5151 }
michael@0 5152 CaseFoldingUCharIterator captureGroupItr(inputBuf, groupStartIdx, groupEndIdx);
michael@0 5153 CaseFoldingUCharIterator inputItr(inputBuf, fp->fInputIdx, fActiveLimit);
michael@0 5154
michael@0 5155 // Note: if the capture group match was of an empty string the backref
michael@0 5156 // match succeeds. Verified by testing: Perl matches succeed
michael@0 5157 // in this case, so we do too.
michael@0 5158
michael@0 5159 UBool success = TRUE;
michael@0 5160 for (;;) {
michael@0 5161 UChar32 captureGroupChar = captureGroupItr.next();
michael@0 5162 if (captureGroupChar == U_SENTINEL) {
michael@0 5163 success = TRUE;
michael@0 5164 break;
michael@0 5165 }
michael@0 5166 UChar32 inputChar = inputItr.next();
michael@0 5167 if (inputChar == U_SENTINEL) {
michael@0 5168 success = FALSE;
michael@0 5169 fHitEnd = TRUE;
michael@0 5170 break;
michael@0 5171 }
michael@0 5172 if (inputChar != captureGroupChar) {
michael@0 5173 success = FALSE;
michael@0 5174 break;
michael@0 5175 }
michael@0 5176 }
michael@0 5177
michael@0 5178 if (success && inputItr.inExpansion()) {
michael@0 5179 // We obtained a match by consuming part of a string obtained from
michael@0 5180 // case-folding a single code point of the input text.
michael@0 5181 // This does not count as an overall match.
michael@0 5182 success = FALSE;
michael@0 5183 }
michael@0 5184
michael@0 5185 if (success) {
michael@0 5186 fp->fInputIdx = inputItr.getIndex();
michael@0 5187 } else {
michael@0 5188 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 5189 }
michael@0 5190 }
michael@0 5191 break;
michael@0 5192
michael@0 5193 case URX_STO_INP_LOC:
michael@0 5194 {
michael@0 5195 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
michael@0 5196 fp->fExtra[opValue] = fp->fInputIdx;
michael@0 5197 }
michael@0 5198 break;
michael@0 5199
michael@0 5200 case URX_JMPX:
michael@0 5201 {
michael@0 5202 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
michael@0 5203 fp->fPatIdx += 1;
michael@0 5204 int32_t dataLoc = URX_VAL(pat[instrOperandLoc]);
michael@0 5205 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
michael@0 5206 int32_t savedInputIdx = (int32_t)fp->fExtra[dataLoc];
michael@0 5207 U_ASSERT(savedInputIdx <= fp->fInputIdx);
michael@0 5208 if (savedInputIdx < fp->fInputIdx) {
michael@0 5209 fp->fPatIdx = opValue; // JMP
michael@0 5210 } else {
michael@0 5211 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop.
michael@0 5212 }
michael@0 5213 }
michael@0 5214 break;
michael@0 5215
michael@0 5216 case URX_LA_START:
michael@0 5217 {
michael@0 5218 // Entering a lookahead block.
michael@0 5219 // Save Stack Ptr, Input Pos.
michael@0 5220 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
michael@0 5221 fData[opValue] = fStack->size();
michael@0 5222 fData[opValue+1] = fp->fInputIdx;
michael@0 5223 fActiveStart = fLookStart; // Set the match region change for
michael@0 5224 fActiveLimit = fLookLimit; // transparent bounds.
michael@0 5225 }
michael@0 5226 break;
michael@0 5227
michael@0 5228 case URX_LA_END:
michael@0 5229 {
michael@0 5230 // Leaving a look-ahead block.
michael@0 5231 // restore Stack Ptr, Input Pos to positions they had on entry to block.
michael@0 5232 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
michael@0 5233 int32_t stackSize = fStack->size();
michael@0 5234 int32_t newStackSize = (int32_t)fData[opValue];
michael@0 5235 U_ASSERT(stackSize >= newStackSize);
michael@0 5236 if (stackSize > newStackSize) {
michael@0 5237 // Copy the current top frame back to the new (cut back) top frame.
michael@0 5238 // This makes the capture groups from within the look-ahead
michael@0 5239 // expression available.
michael@0 5240 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
michael@0 5241 int32_t i;
michael@0 5242 for (i=0; i<fFrameSize; i++) {
michael@0 5243 newFP[i] = ((int64_t *)fp)[i];
michael@0 5244 }
michael@0 5245 fp = (REStackFrame *)newFP;
michael@0 5246 fStack->setSize(newStackSize);
michael@0 5247 }
michael@0 5248 fp->fInputIdx = fData[opValue+1];
michael@0 5249
michael@0 5250 // Restore the active region bounds in the input string; they may have
michael@0 5251 // been changed because of transparent bounds on a Region.
michael@0 5252 fActiveStart = fRegionStart;
michael@0 5253 fActiveLimit = fRegionLimit;
michael@0 5254 }
michael@0 5255 break;
michael@0 5256
michael@0 5257 case URX_ONECHAR_I:
michael@0 5258 if (fp->fInputIdx < fActiveLimit) {
michael@0 5259 UChar32 c;
michael@0 5260 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
michael@0 5261 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
michael@0 5262 break;
michael@0 5263 }
michael@0 5264 } else {
michael@0 5265 fHitEnd = TRUE;
michael@0 5266 }
michael@0 5267 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 5268 break;
michael@0 5269
michael@0 5270 case URX_STRING_I:
michael@0 5271 // Case-insensitive test input against a literal string.
michael@0 5272 // Strings require two slots in the compiled pattern, one for the
michael@0 5273 // offset to the string text, and one for the length.
michael@0 5274 // The compiled string has already been case folded.
michael@0 5275 {
michael@0 5276 const UChar *patternString = litText + opValue;
michael@0 5277
michael@0 5278 op = (int32_t)pat[fp->fPatIdx];
michael@0 5279 fp->fPatIdx++;
michael@0 5280 opType = URX_TYPE(op);
michael@0 5281 opValue = URX_VAL(op);
michael@0 5282 U_ASSERT(opType == URX_STRING_LEN);
michael@0 5283 int32_t patternStringLen = opValue; // Length of the string from the pattern.
michael@0 5284
michael@0 5285 UChar32 cText;
michael@0 5286 UChar32 cPattern;
michael@0 5287 UBool success = TRUE;
michael@0 5288 int32_t patternStringIdx = 0;
michael@0 5289 CaseFoldingUCharIterator inputIterator(inputBuf, fp->fInputIdx, fActiveLimit);
michael@0 5290 while (patternStringIdx < patternStringLen) {
michael@0 5291 U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
michael@0 5292 cText = inputIterator.next();
michael@0 5293 if (cText != cPattern) {
michael@0 5294 success = FALSE;
michael@0 5295 if (cText == U_SENTINEL) {
michael@0 5296 fHitEnd = TRUE;
michael@0 5297 }
michael@0 5298 break;
michael@0 5299 }
michael@0 5300 }
michael@0 5301 if (inputIterator.inExpansion()) {
michael@0 5302 success = FALSE;
michael@0 5303 }
michael@0 5304
michael@0 5305 if (success) {
michael@0 5306 fp->fInputIdx = inputIterator.getIndex();
michael@0 5307 } else {
michael@0 5308 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 5309 }
michael@0 5310 }
michael@0 5311 break;
michael@0 5312
michael@0 5313 case URX_LB_START:
michael@0 5314 {
michael@0 5315 // Entering a look-behind block.
michael@0 5316 // Save Stack Ptr, Input Pos.
michael@0 5317 // TODO: implement transparent bounds. Ticket #6067
michael@0 5318 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
michael@0 5319 fData[opValue] = fStack->size();
michael@0 5320 fData[opValue+1] = fp->fInputIdx;
michael@0 5321 // Init the variable containing the start index for attempted matches.
michael@0 5322 fData[opValue+2] = -1;
michael@0 5323 // Save input string length, then reset to pin any matches to end at
michael@0 5324 // the current position.
michael@0 5325 fData[opValue+3] = fActiveLimit;
michael@0 5326 fActiveLimit = fp->fInputIdx;
michael@0 5327 }
michael@0 5328 break;
michael@0 5329
michael@0 5330
michael@0 5331 case URX_LB_CONT:
michael@0 5332 {
michael@0 5333 // Positive Look-Behind, at top of loop checking for matches of LB expression
michael@0 5334 // at all possible input starting positions.
michael@0 5335
michael@0 5336 // Fetch the min and max possible match lengths. They are the operands
michael@0 5337 // of this op in the pattern.
michael@0 5338 int32_t minML = (int32_t)pat[fp->fPatIdx++];
michael@0 5339 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
michael@0 5340 U_ASSERT(minML <= maxML);
michael@0 5341 U_ASSERT(minML >= 0);
michael@0 5342
michael@0 5343 // Fetch (from data) the last input index where a match was attempted.
michael@0 5344 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
michael@0 5345 int64_t *lbStartIdx = &fData[opValue+2];
michael@0 5346 if (*lbStartIdx < 0) {
michael@0 5347 // First time through loop.
michael@0 5348 *lbStartIdx = fp->fInputIdx - minML;
michael@0 5349 } else {
michael@0 5350 // 2nd through nth time through the loop.
michael@0 5351 // Back up start position for match by one.
michael@0 5352 if (*lbStartIdx == 0) {
michael@0 5353 (*lbStartIdx)--;
michael@0 5354 } else {
michael@0 5355 U16_BACK_1(inputBuf, 0, *lbStartIdx);
michael@0 5356 }
michael@0 5357 }
michael@0 5358
michael@0 5359 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
michael@0 5360 // We have tried all potential match starting points without
michael@0 5361 // getting a match. Backtrack out, and out of the
michael@0 5362 // Look Behind altogether.
michael@0 5363 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 5364 int64_t restoreInputLen = fData[opValue+3];
michael@0 5365 U_ASSERT(restoreInputLen >= fActiveLimit);
michael@0 5366 U_ASSERT(restoreInputLen <= fInputLength);
michael@0 5367 fActiveLimit = restoreInputLen;
michael@0 5368 break;
michael@0 5369 }
michael@0 5370
michael@0 5371 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
michael@0 5372 // (successful match will fall off the end of the loop.)
michael@0 5373 fp = StateSave(fp, fp->fPatIdx-3, status);
michael@0 5374 fp->fInputIdx = *lbStartIdx;
michael@0 5375 }
michael@0 5376 break;
michael@0 5377
michael@0 5378 case URX_LB_END:
michael@0 5379 // End of a look-behind block, after a successful match.
michael@0 5380 {
michael@0 5381 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
michael@0 5382 if (fp->fInputIdx != fActiveLimit) {
michael@0 5383 // The look-behind expression matched, but the match did not
michael@0 5384 // extend all the way to the point that we are looking behind from.
michael@0 5385 // FAIL out of here, which will take us back to the LB_CONT, which
michael@0 5386 // will retry the match starting at another position or fail
michael@0 5387 // the look-behind altogether, whichever is appropriate.
michael@0 5388 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 5389 break;
michael@0 5390 }
michael@0 5391
michael@0 5392 // Look-behind match is good. Restore the original input string length,
michael@0 5393 // which had been truncated to pin the end of the lookbehind match to the
michael@0 5394 // position being looked-behind.
michael@0 5395 int64_t originalInputLen = fData[opValue+3];
michael@0 5396 U_ASSERT(originalInputLen >= fActiveLimit);
michael@0 5397 U_ASSERT(originalInputLen <= fInputLength);
michael@0 5398 fActiveLimit = originalInputLen;
michael@0 5399 }
michael@0 5400 break;
michael@0 5401
michael@0 5402
michael@0 5403 case URX_LBN_CONT:
michael@0 5404 {
michael@0 5405 // Negative Look-Behind, at top of loop checking for matches of LB expression
michael@0 5406 // at all possible input starting positions.
michael@0 5407
michael@0 5408 // Fetch the extra parameters of this op.
michael@0 5409 int32_t minML = (int32_t)pat[fp->fPatIdx++];
michael@0 5410 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
michael@0 5411 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
michael@0 5412 continueLoc = URX_VAL(continueLoc);
michael@0 5413 U_ASSERT(minML <= maxML);
michael@0 5414 U_ASSERT(minML >= 0);
michael@0 5415 U_ASSERT(continueLoc > fp->fPatIdx);
michael@0 5416
michael@0 5417 // Fetch (from data) the last input index where a match was attempted.
michael@0 5418 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
michael@0 5419 int64_t *lbStartIdx = &fData[opValue+2];
michael@0 5420 if (*lbStartIdx < 0) {
michael@0 5421 // First time through loop.
michael@0 5422 *lbStartIdx = fp->fInputIdx - minML;
michael@0 5423 } else {
michael@0 5424 // 2nd through nth time through the loop.
michael@0 5425 // Back up start position for match by one.
michael@0 5426 if (*lbStartIdx == 0) {
michael@0 5427 (*lbStartIdx)--; // Because U16_BACK is unsafe starting at 0.
michael@0 5428 } else {
michael@0 5429 U16_BACK_1(inputBuf, 0, *lbStartIdx);
michael@0 5430 }
michael@0 5431 }
michael@0 5432
michael@0 5433 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
michael@0 5434 // We have tried all potential match starting points without
michael@0 5435 // getting a match, which means that the negative lookbehind as
michael@0 5436 // a whole has succeeded. Jump forward to the continue location
michael@0 5437 int64_t restoreInputLen = fData[opValue+3];
michael@0 5438 U_ASSERT(restoreInputLen >= fActiveLimit);
michael@0 5439 U_ASSERT(restoreInputLen <= fInputLength);
michael@0 5440 fActiveLimit = restoreInputLen;
michael@0 5441 fp->fPatIdx = continueLoc;
michael@0 5442 break;
michael@0 5443 }
michael@0 5444
michael@0 5445 // Save state to this URX_LBN_CONT op, so failure to match will repeat the loop.
michael@0 5446 // (successful match will cause a FAIL out of the loop altogether.)
michael@0 5447 fp = StateSave(fp, fp->fPatIdx-4, status);
michael@0 5448 fp->fInputIdx = *lbStartIdx;
michael@0 5449 }
michael@0 5450 break;
michael@0 5451
michael@0 5452 case URX_LBN_END:
michael@0 5453 // End of a negative look-behind block, after a successful match.
michael@0 5454 {
michael@0 5455 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
michael@0 5456 if (fp->fInputIdx != fActiveLimit) {
michael@0 5457 // The look-behind expression matched, but the match did not
michael@0 5458 // extend all the way to the point that we are looking behind from.
michael@0 5459 // FAIL out of here, which will take us back to the LBN_CONT, which
michael@0 5460 // will retry the match starting at another position or succeed
michael@0 5461 // the look-behind altogether, whichever is appropriate.
michael@0 5462 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 5463 break;
michael@0 5464 }
michael@0 5465
michael@0 5466 // Look-behind expression matched, which means look-behind test as
michael@0 5467 // a whole Fails
michael@0 5468
michael@0 5469 // Restore the original input string length, which had been truncated
michael@0 5470 // in order to pin the end of the lookbehind match
michael@0 5471 // to the position being looked-behind.
michael@0 5472 int64_t originalInputLen = fData[opValue+3];
michael@0 5473 U_ASSERT(originalInputLen >= fActiveLimit);
michael@0 5474 U_ASSERT(originalInputLen <= fInputLength);
michael@0 5475 fActiveLimit = originalInputLen;
michael@0 5476
michael@0 5477 // Restore original stack position, discarding any state saved
michael@0 5478 // by the successful pattern match.
michael@0 5479 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
michael@0 5480 int32_t newStackSize = (int32_t)fData[opValue];
michael@0 5481 U_ASSERT(fStack->size() > newStackSize);
michael@0 5482 fStack->setSize(newStackSize);
michael@0 5483
michael@0 5484 // FAIL, which will take control back to someplace
michael@0 5485 // prior to entering the look-behind test.
michael@0 5486 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
michael@0 5487 }
michael@0 5488 break;
michael@0 5489
michael@0 5490
michael@0 5491 case URX_LOOP_SR_I:
michael@0 5492 // Loop Initialization for the optimized implementation of
michael@0 5493 // [some character set]*
michael@0 5494 // This op scans through all matching input.
michael@0 5495 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
michael@0 5496 {
michael@0 5497 U_ASSERT(opValue > 0 && opValue < sets->size());
michael@0 5498 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
michael@0 5499 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
michael@0 5500
michael@0 5501 // Loop through input, until either the input is exhausted or
michael@0 5502 // we reach a character that is not a member of the set.
michael@0 5503 int32_t ix = (int32_t)fp->fInputIdx;
michael@0 5504 for (;;) {
michael@0 5505 if (ix >= fActiveLimit) {
michael@0 5506 fHitEnd = TRUE;
michael@0 5507 break;
michael@0 5508 }
michael@0 5509 UChar32 c;
michael@0 5510 U16_NEXT(inputBuf, ix, fActiveLimit, c);
michael@0 5511 if (c<256) {
michael@0 5512 if (s8->contains(c) == FALSE) {
michael@0 5513 U16_BACK_1(inputBuf, 0, ix);
michael@0 5514 break;
michael@0 5515 }
michael@0 5516 } else {
michael@0 5517 if (s->contains(c) == FALSE) {
michael@0 5518 U16_BACK_1(inputBuf, 0, ix);
michael@0 5519 break;
michael@0 5520 }
michael@0 5521 }
michael@0 5522 }
michael@0 5523
michael@0 5524 // If there were no matching characters, skip over the loop altogether.
michael@0 5525 // The loop doesn't run at all, a * op always succeeds.
michael@0 5526 if (ix == fp->fInputIdx) {
michael@0 5527 fp->fPatIdx++; // skip the URX_LOOP_C op.
michael@0 5528 break;
michael@0 5529 }
michael@0 5530
michael@0 5531 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
michael@0 5532 // must follow. Its operand is the stack location
michael@0 5533 // that holds the starting input index for the match of this [set]*
michael@0 5534 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
michael@0 5535 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
michael@0 5536 int32_t stackLoc = URX_VAL(loopcOp);
michael@0 5537 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
michael@0 5538 fp->fExtra[stackLoc] = fp->fInputIdx;
michael@0 5539 fp->fInputIdx = ix;
michael@0 5540
michael@0 5541 // Save State to the URX_LOOP_C op that follows this one,
michael@0 5542 // so that match failures in the following code will return to there.
michael@0 5543 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
michael@0 5544 fp = StateSave(fp, fp->fPatIdx, status);
michael@0 5545 fp->fPatIdx++;
michael@0 5546 }
michael@0 5547 break;
michael@0 5548
michael@0 5549
michael@0 5550 case URX_LOOP_DOT_I:
michael@0 5551 // Loop Initialization for the optimized implementation of .*
michael@0 5552 // This op scans through all remaining input.
michael@0 5553 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
michael@0 5554 {
michael@0 5555 // Loop through input until the input is exhausted (we reach an end-of-line)
michael@0 5556 // In DOTALL mode, we can just go straight to the end of the input.
michael@0 5557 int32_t ix;
michael@0 5558 if ((opValue & 1) == 1) {
michael@0 5559 // Dot-matches-All mode. Jump straight to the end of the string.
michael@0 5560 ix = (int32_t)fActiveLimit;
michael@0 5561 fHitEnd = TRUE;
michael@0 5562 } else {
michael@0 5563 // NOT DOT ALL mode. Line endings do not match '.'
michael@0 5564 // Scan forward until a line ending or end of input.
michael@0 5565 ix = (int32_t)fp->fInputIdx;
michael@0 5566 for (;;) {
michael@0 5567 if (ix >= fActiveLimit) {
michael@0 5568 fHitEnd = TRUE;
michael@0 5569 break;
michael@0 5570 }
michael@0 5571 UChar32 c;
michael@0 5572 U16_NEXT(inputBuf, ix, fActiveLimit, c); // c = inputBuf[ix++]
michael@0 5573 if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s
michael@0 5574 if ((c == 0x0a) || // 0x0a is newline in both modes.
michael@0 5575 (((opValue & 2) == 0) && // IF not UNIX_LINES mode
michael@0 5576 ((c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029))) {
michael@0 5577 // char is a line ending. Put the input pos back to the
michael@0 5578 // line ending char, and exit the scanning loop.
michael@0 5579 U16_BACK_1(inputBuf, 0, ix);
michael@0 5580 break;
michael@0 5581 }
michael@0 5582 }
michael@0 5583 }
michael@0 5584 }
michael@0 5585
michael@0 5586 // If there were no matching characters, skip over the loop altogether.
michael@0 5587 // The loop doesn't run at all, a * op always succeeds.
michael@0 5588 if (ix == fp->fInputIdx) {
michael@0 5589 fp->fPatIdx++; // skip the URX_LOOP_C op.
michael@0 5590 break;
michael@0 5591 }
michael@0 5592
michael@0 5593 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
michael@0 5594 // must follow. Its operand is the stack location
michael@0 5595 // that holds the starting input index for the match of this .*
michael@0 5596 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
michael@0 5597 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
michael@0 5598 int32_t stackLoc = URX_VAL(loopcOp);
michael@0 5599 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
michael@0 5600 fp->fExtra[stackLoc] = fp->fInputIdx;
michael@0 5601 fp->fInputIdx = ix;
michael@0 5602
michael@0 5603 // Save State to the URX_LOOP_C op that follows this one,
michael@0 5604 // so that match failures in the following code will return to there.
michael@0 5605 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
michael@0 5606 fp = StateSave(fp, fp->fPatIdx, status);
michael@0 5607 fp->fPatIdx++;
michael@0 5608 }
michael@0 5609 break;
michael@0 5610
michael@0 5611
michael@0 5612 case URX_LOOP_C:
michael@0 5613 {
michael@0 5614 U_ASSERT(opValue>=0 && opValue<fFrameSize);
michael@0 5615 backSearchIndex = (int32_t)fp->fExtra[opValue];
michael@0 5616 U_ASSERT(backSearchIndex <= fp->fInputIdx);
michael@0 5617 if (backSearchIndex == fp->fInputIdx) {
michael@0 5618 // We've backed up the input idx to the point that the loop started.
michael@0 5619 // The loop is done. Leave here without saving state.
michael@0 5620 // Subsequent failures won't come back here.
michael@0 5621 break;
michael@0 5622 }
michael@0 5623 // Set up for the next iteration of the loop, with input index
michael@0 5624 // backed up by one from the last time through,
michael@0 5625 // and a state save to this instruction in case the following code fails again.
michael@0 5626 // (We're going backwards because this loop emulates stack unwinding, not
michael@0 5627 // the initial scan forward.)
michael@0 5628 U_ASSERT(fp->fInputIdx > 0);
michael@0 5629 UChar32 prevC;
michael@0 5630 U16_PREV(inputBuf, 0, fp->fInputIdx, prevC); // !!!: should this 0 be one of f*Limit?
michael@0 5631
michael@0 5632 if (prevC == 0x0a &&
michael@0 5633 fp->fInputIdx > backSearchIndex &&
michael@0 5634 inputBuf[fp->fInputIdx-1] == 0x0d) {
michael@0 5635 int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
michael@0 5636 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
michael@0 5637 // .*, stepping back over CRLF pair.
michael@0 5638 U16_BACK_1(inputBuf, 0, fp->fInputIdx);
michael@0 5639 }
michael@0 5640 }
michael@0 5641
michael@0 5642
michael@0 5643 fp = StateSave(fp, fp->fPatIdx-1, status);
michael@0 5644 }
michael@0 5645 break;
michael@0 5646
michael@0 5647
michael@0 5648
michael@0 5649 default:
michael@0 5650 // Trouble. The compiled pattern contains an entry with an
michael@0 5651 // unrecognized type tag.
michael@0 5652 U_ASSERT(FALSE);
michael@0 5653 }
michael@0 5654
michael@0 5655 if (U_FAILURE(status)) {
michael@0 5656 isMatch = FALSE;
michael@0 5657 break;
michael@0 5658 }
michael@0 5659 }
michael@0 5660
michael@0 5661 breakFromLoop:
michael@0 5662 fMatch = isMatch;
michael@0 5663 if (isMatch) {
michael@0 5664 fLastMatchEnd = fMatchEnd;
michael@0 5665 fMatchStart = startIdx;
michael@0 5666 fMatchEnd = fp->fInputIdx;
michael@0 5667 if (fTraceDebug) {
michael@0 5668 REGEX_RUN_DEBUG_PRINTF(("Match. start=%ld end=%ld\n\n", fMatchStart, fMatchEnd));
michael@0 5669 }
michael@0 5670 }
michael@0 5671 else
michael@0 5672 {
michael@0 5673 if (fTraceDebug) {
michael@0 5674 REGEX_RUN_DEBUG_PRINTF(("No match\n\n"));
michael@0 5675 }
michael@0 5676 }
michael@0 5677
michael@0 5678 fFrame = fp; // The active stack frame when the engine stopped.
michael@0 5679 // Contains the capture group results that we need to
michael@0 5680 // access later.
michael@0 5681
michael@0 5682 return;
michael@0 5683 }
michael@0 5684
michael@0 5685
michael@0 5686 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher)
michael@0 5687
michael@0 5688 U_NAMESPACE_END
michael@0 5689
michael@0 5690 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS

mercurial