Wed, 31 Dec 2014 07:22:50 +0100
Correct previous dual key logic pending first delivery installment.
1 /*
2 **************************************************************************
3 * Copyright (C) 2002-2013 International Business Machines Corporation *
4 * and others. All rights reserved. *
5 **************************************************************************
6 */
7 //
8 // file: rematch.cpp
9 //
10 // Contains the implementation of class RegexMatcher,
11 // which is one of the main API classes for the ICU regular expression package.
12 //
14 #include "unicode/utypes.h"
15 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
17 #include "unicode/regex.h"
18 #include "unicode/uniset.h"
19 #include "unicode/uchar.h"
20 #include "unicode/ustring.h"
21 #include "unicode/rbbi.h"
22 #include "unicode/utf.h"
23 #include "unicode/utf16.h"
24 #include "uassert.h"
25 #include "cmemory.h"
26 #include "uvector.h"
27 #include "uvectr32.h"
28 #include "uvectr64.h"
29 #include "regeximp.h"
30 #include "regexst.h"
31 #include "regextxt.h"
32 #include "ucase.h"
34 // #include <malloc.h> // Needed for heapcheck testing
37 // Find progress callback
38 // ----------------------
39 // Macro to inline test & call to ReportFindProgress(). Eliminates unnecessary function call.
40 //
41 #define REGEXFINDPROGRESS_INTERRUPT(pos, status) \
42 (fFindProgressCallbackFn != NULL) && (ReportFindProgress(pos, status) == FALSE)
45 // Smart Backtracking
46 // ------------------
47 // When a failure would go back to a LOOP_C instruction,
48 // strings, characters, and setrefs scan backwards for a valid start
49 // character themselves, pop the stack, and save state, emulating the
50 // LOOP_C's effect but assured that the next character of input is a
51 // possible matching character.
52 //
53 // Good idea in theory; unfortunately it only helps out a few specific
54 // cases and slows the engine down a little in the rest.
56 U_NAMESPACE_BEGIN
58 // Default limit for the size of the back track stack, to avoid system
59 // failures causedby heap exhaustion. Units are in 32 bit words, not bytes.
60 // This value puts ICU's limits higher than most other regexp implementations,
61 // which use recursion rather than the heap, and take more storage per
62 // backtrack point.
63 //
64 static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000;
66 // Time limit counter constant.
67 // Time limits for expression evaluation are in terms of quanta of work by
68 // the engine, each of which is 10,000 state saves.
69 // This constant determines that state saves per tick number.
70 static const int32_t TIMER_INITIAL_VALUE = 10000;
72 //-----------------------------------------------------------------------------
73 //
74 // Constructor and Destructor
75 //
76 //-----------------------------------------------------------------------------
77 RegexMatcher::RegexMatcher(const RegexPattern *pat) {
78 fDeferredStatus = U_ZERO_ERROR;
79 init(fDeferredStatus);
80 if (U_FAILURE(fDeferredStatus)) {
81 return;
82 }
83 if (pat==NULL) {
84 fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR;
85 return;
86 }
87 fPattern = pat;
88 init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus);
89 }
93 RegexMatcher::RegexMatcher(const UnicodeString ®exp, const UnicodeString &input,
94 uint32_t flags, UErrorCode &status) {
95 init(status);
96 if (U_FAILURE(status)) {
97 return;
98 }
99 UParseError pe;
100 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
101 fPattern = fPatternOwned;
103 UText inputText = UTEXT_INITIALIZER;
104 utext_openConstUnicodeString(&inputText, &input, &status);
105 init2(&inputText, status);
106 utext_close(&inputText);
108 fInputUniStrMaybeMutable = TRUE;
109 }
112 RegexMatcher::RegexMatcher(UText *regexp, UText *input,
113 uint32_t flags, UErrorCode &status) {
114 init(status);
115 if (U_FAILURE(status)) {
116 return;
117 }
118 UParseError pe;
119 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
120 if (U_FAILURE(status)) {
121 return;
122 }
124 fPattern = fPatternOwned;
125 init2(input, status);
126 }
129 RegexMatcher::RegexMatcher(const UnicodeString ®exp,
130 uint32_t flags, UErrorCode &status) {
131 init(status);
132 if (U_FAILURE(status)) {
133 return;
134 }
135 UParseError pe;
136 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
137 if (U_FAILURE(status)) {
138 return;
139 }
140 fPattern = fPatternOwned;
141 init2(RegexStaticSets::gStaticSets->fEmptyText, status);
142 }
144 RegexMatcher::RegexMatcher(UText *regexp,
145 uint32_t flags, UErrorCode &status) {
146 init(status);
147 if (U_FAILURE(status)) {
148 return;
149 }
150 UParseError pe;
151 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
152 if (U_FAILURE(status)) {
153 return;
154 }
156 fPattern = fPatternOwned;
157 init2(RegexStaticSets::gStaticSets->fEmptyText, status);
158 }
163 RegexMatcher::~RegexMatcher() {
164 delete fStack;
165 if (fData != fSmallData) {
166 uprv_free(fData);
167 fData = NULL;
168 }
169 if (fPatternOwned) {
170 delete fPatternOwned;
171 fPatternOwned = NULL;
172 fPattern = NULL;
173 }
175 if (fInput) {
176 delete fInput;
177 }
178 if (fInputText) {
179 utext_close(fInputText);
180 }
181 if (fAltInputText) {
182 utext_close(fAltInputText);
183 }
185 #if UCONFIG_NO_BREAK_ITERATION==0
186 delete fWordBreakItr;
187 #endif
188 }
190 //
191 // init() common initialization for use by all constructors.
192 // Initialize all fields, get the object into a consistent state.
193 // This must be done even when the initial status shows an error,
194 // so that the object is initialized sufficiently well for the destructor
195 // to run safely.
196 //
197 void RegexMatcher::init(UErrorCode &status) {
198 fPattern = NULL;
199 fPatternOwned = NULL;
200 fFrameSize = 0;
201 fRegionStart = 0;
202 fRegionLimit = 0;
203 fAnchorStart = 0;
204 fAnchorLimit = 0;
205 fLookStart = 0;
206 fLookLimit = 0;
207 fActiveStart = 0;
208 fActiveLimit = 0;
209 fTransparentBounds = FALSE;
210 fAnchoringBounds = TRUE;
211 fMatch = FALSE;
212 fMatchStart = 0;
213 fMatchEnd = 0;
214 fLastMatchEnd = -1;
215 fAppendPosition = 0;
216 fHitEnd = FALSE;
217 fRequireEnd = FALSE;
218 fStack = NULL;
219 fFrame = NULL;
220 fTimeLimit = 0;
221 fTime = 0;
222 fTickCounter = 0;
223 fStackLimit = DEFAULT_BACKTRACK_STACK_CAPACITY;
224 fCallbackFn = NULL;
225 fCallbackContext = NULL;
226 fFindProgressCallbackFn = NULL;
227 fFindProgressCallbackContext = NULL;
228 fTraceDebug = FALSE;
229 fDeferredStatus = status;
230 fData = fSmallData;
231 fWordBreakItr = NULL;
233 fStack = NULL;
234 fInputText = NULL;
235 fAltInputText = NULL;
236 fInput = NULL;
237 fInputLength = 0;
238 fInputUniStrMaybeMutable = FALSE;
240 if (U_FAILURE(status)) {
241 fDeferredStatus = status;
242 }
243 }
245 //
246 // init2() Common initialization for use by RegexMatcher constructors, part 2.
247 // This handles the common setup to be done after the Pattern is available.
248 //
249 void RegexMatcher::init2(UText *input, UErrorCode &status) {
250 if (U_FAILURE(status)) {
251 fDeferredStatus = status;
252 return;
253 }
255 if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(fSmallData[0]))) {
256 fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t));
257 if (fData == NULL) {
258 status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
259 return;
260 }
261 }
263 fStack = new UVector64(status);
264 if (fStack == NULL) {
265 status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
266 return;
267 }
269 reset(input);
270 setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status);
271 if (U_FAILURE(status)) {
272 fDeferredStatus = status;
273 return;
274 }
275 }
278 static const UChar BACKSLASH = 0x5c;
279 static const UChar DOLLARSIGN = 0x24;
280 //--------------------------------------------------------------------------------
281 //
282 // appendReplacement
283 //
284 //--------------------------------------------------------------------------------
285 RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest,
286 const UnicodeString &replacement,
287 UErrorCode &status) {
288 UText replacementText = UTEXT_INITIALIZER;
290 utext_openConstUnicodeString(&replacementText, &replacement, &status);
291 if (U_SUCCESS(status)) {
292 UText resultText = UTEXT_INITIALIZER;
293 utext_openUnicodeString(&resultText, &dest, &status);
295 if (U_SUCCESS(status)) {
296 appendReplacement(&resultText, &replacementText, status);
297 utext_close(&resultText);
298 }
299 utext_close(&replacementText);
300 }
302 return *this;
303 }
305 //
306 // appendReplacement, UText mode
307 //
308 RegexMatcher &RegexMatcher::appendReplacement(UText *dest,
309 UText *replacement,
310 UErrorCode &status) {
311 if (U_FAILURE(status)) {
312 return *this;
313 }
314 if (U_FAILURE(fDeferredStatus)) {
315 status = fDeferredStatus;
316 return *this;
317 }
318 if (fMatch == FALSE) {
319 status = U_REGEX_INVALID_STATE;
320 return *this;
321 }
323 // Copy input string from the end of previous match to start of current match
324 int64_t destLen = utext_nativeLength(dest);
325 if (fMatchStart > fAppendPosition) {
326 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
327 destLen += utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
328 (int32_t)(fMatchStart-fAppendPosition), &status);
329 } else {
330 int32_t len16;
331 if (UTEXT_USES_U16(fInputText)) {
332 len16 = (int32_t)(fMatchStart-fAppendPosition);
333 } else {
334 UErrorCode lengthStatus = U_ZERO_ERROR;
335 len16 = utext_extract(fInputText, fAppendPosition, fMatchStart, NULL, 0, &lengthStatus);
336 }
337 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
338 if (inputChars == NULL) {
339 status = U_MEMORY_ALLOCATION_ERROR;
340 return *this;
341 }
342 utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars, len16+1, &status);
343 destLen += utext_replace(dest, destLen, destLen, inputChars, len16, &status);
344 uprv_free(inputChars);
345 }
346 }
347 fAppendPosition = fMatchEnd;
350 // scan the replacement text, looking for substitutions ($n) and \escapes.
351 // TODO: optimize this loop by efficiently scanning for '$' or '\',
352 // move entire ranges not containing substitutions.
353 UTEXT_SETNATIVEINDEX(replacement, 0);
354 UChar32 c = UTEXT_NEXT32(replacement);
355 while (c != U_SENTINEL) {
356 if (c == BACKSLASH) {
357 // Backslash Escape. Copy the following char out without further checks.
358 // Note: Surrogate pairs don't need any special handling
359 // The second half wont be a '$' or a '\', and
360 // will move to the dest normally on the next
361 // loop iteration.
362 c = UTEXT_CURRENT32(replacement);
363 if (c == U_SENTINEL) {
364 break;
365 }
367 if (c==0x55/*U*/ || c==0x75/*u*/) {
368 // We have a \udddd or \Udddddddd escape sequence.
369 int32_t offset = 0;
370 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement);
371 UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
372 if (escapedChar != (UChar32)0xFFFFFFFF) {
373 if (U_IS_BMP(escapedChar)) {
374 UChar c16 = (UChar)escapedChar;
375 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
376 } else {
377 UChar surrogate[2];
378 surrogate[0] = U16_LEAD(escapedChar);
379 surrogate[1] = U16_TRAIL(escapedChar);
380 if (U_SUCCESS(status)) {
381 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
382 }
383 }
384 // TODO: Report errors for mal-formed \u escapes?
385 // As this is, the original sequence is output, which may be OK.
386 if (context.lastOffset == offset) {
387 (void)UTEXT_PREVIOUS32(replacement);
388 } else if (context.lastOffset != offset-1) {
389 utext_moveIndex32(replacement, offset - context.lastOffset - 1);
390 }
391 }
392 } else {
393 (void)UTEXT_NEXT32(replacement);
394 // Plain backslash escape. Just put out the escaped character.
395 if (U_IS_BMP(c)) {
396 UChar c16 = (UChar)c;
397 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
398 } else {
399 UChar surrogate[2];
400 surrogate[0] = U16_LEAD(c);
401 surrogate[1] = U16_TRAIL(c);
402 if (U_SUCCESS(status)) {
403 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
404 }
405 }
406 }
407 } else if (c != DOLLARSIGN) {
408 // Normal char, not a $. Copy it out without further checks.
409 if (U_IS_BMP(c)) {
410 UChar c16 = (UChar)c;
411 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
412 } else {
413 UChar surrogate[2];
414 surrogate[0] = U16_LEAD(c);
415 surrogate[1] = U16_TRAIL(c);
416 if (U_SUCCESS(status)) {
417 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
418 }
419 }
420 } else {
421 // We've got a $. Pick up a capture group number if one follows.
422 // Consume at most the number of digits necessary for the largest capture
423 // number that is valid for this pattern.
425 int32_t numDigits = 0;
426 int32_t groupNum = 0;
427 UChar32 digitC;
428 for (;;) {
429 digitC = UTEXT_CURRENT32(replacement);
430 if (digitC == U_SENTINEL) {
431 break;
432 }
433 if (u_isdigit(digitC) == FALSE) {
434 break;
435 }
436 (void)UTEXT_NEXT32(replacement);
437 groupNum=groupNum*10 + u_charDigitValue(digitC);
438 numDigits++;
439 if (numDigits >= fPattern->fMaxCaptureDigits) {
440 break;
441 }
442 }
445 if (numDigits == 0) {
446 // The $ didn't introduce a group number at all.
447 // Treat it as just part of the substitution text.
448 UChar c16 = DOLLARSIGN;
449 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
450 } else {
451 // Finally, append the capture group data to the destination.
452 destLen += appendGroup(groupNum, dest, status);
453 if (U_FAILURE(status)) {
454 // Can fail if group number is out of range.
455 break;
456 }
457 }
458 }
460 if (U_FAILURE(status)) {
461 break;
462 } else {
463 c = UTEXT_NEXT32(replacement);
464 }
465 }
467 return *this;
468 }
472 //--------------------------------------------------------------------------------
473 //
474 // appendTail Intended to be used in conjunction with appendReplacement()
475 // To the destination string, append everything following
476 // the last match position from the input string.
477 //
478 // Note: Match ranges do not affect appendTail or appendReplacement
479 //
480 //--------------------------------------------------------------------------------
481 UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) {
482 UErrorCode status = U_ZERO_ERROR;
483 UText resultText = UTEXT_INITIALIZER;
484 utext_openUnicodeString(&resultText, &dest, &status);
486 if (U_SUCCESS(status)) {
487 appendTail(&resultText, status);
488 utext_close(&resultText);
489 }
491 return dest;
492 }
494 //
495 // appendTail, UText mode
496 //
497 UText *RegexMatcher::appendTail(UText *dest, UErrorCode &status) {
498 UBool bailOut = FALSE;
499 if (U_FAILURE(status)) {
500 bailOut = TRUE;
501 }
502 if (U_FAILURE(fDeferredStatus)) {
503 status = fDeferredStatus;
504 bailOut = TRUE;
505 }
507 if (bailOut) {
508 // dest must not be NULL
509 if (dest) {
510 utext_replace(dest, utext_nativeLength(dest), utext_nativeLength(dest), NULL, 0, &status);
511 return dest;
512 }
513 }
515 if (fInputLength > fAppendPosition) {
516 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
517 int64_t destLen = utext_nativeLength(dest);
518 utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
519 (int32_t)(fInputLength-fAppendPosition), &status);
520 } else {
521 int32_t len16;
522 if (UTEXT_USES_U16(fInputText)) {
523 len16 = (int32_t)(fInputLength-fAppendPosition);
524 } else {
525 len16 = utext_extract(fInputText, fAppendPosition, fInputLength, NULL, 0, &status);
526 status = U_ZERO_ERROR; // buffer overflow
527 }
529 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16));
530 if (inputChars == NULL) {
531 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
532 } else {
533 utext_extract(fInputText, fAppendPosition, fInputLength, inputChars, len16, &status); // unterminated
534 int64_t destLen = utext_nativeLength(dest);
535 utext_replace(dest, destLen, destLen, inputChars, len16, &status);
536 uprv_free(inputChars);
537 }
538 }
539 }
540 return dest;
541 }
545 //--------------------------------------------------------------------------------
546 //
547 // end
548 //
549 //--------------------------------------------------------------------------------
550 int32_t RegexMatcher::end(UErrorCode &err) const {
551 return end(0, err);
552 }
554 int64_t RegexMatcher::end64(UErrorCode &err) const {
555 return end64(0, err);
556 }
558 int64_t RegexMatcher::end64(int32_t group, UErrorCode &err) const {
559 if (U_FAILURE(err)) {
560 return -1;
561 }
562 if (fMatch == FALSE) {
563 err = U_REGEX_INVALID_STATE;
564 return -1;
565 }
566 if (group < 0 || group > fPattern->fGroupMap->size()) {
567 err = U_INDEX_OUTOFBOUNDS_ERROR;
568 return -1;
569 }
570 int64_t e = -1;
571 if (group == 0) {
572 e = fMatchEnd;
573 } else {
574 // Get the position within the stack frame of the variables for
575 // this capture group.
576 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
577 U_ASSERT(groupOffset < fPattern->fFrameSize);
578 U_ASSERT(groupOffset >= 0);
579 e = fFrame->fExtra[groupOffset + 1];
580 }
582 return e;
583 }
585 int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const {
586 return (int32_t)end64(group, err);
587 }
590 //--------------------------------------------------------------------------------
591 //
592 // find()
593 //
594 //--------------------------------------------------------------------------------
595 UBool RegexMatcher::find() {
596 // Start at the position of the last match end. (Will be zero if the
597 // matcher has been reset.)
598 //
599 if (U_FAILURE(fDeferredStatus)) {
600 return FALSE;
601 }
603 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
604 return findUsingChunk();
605 }
607 int64_t startPos = fMatchEnd;
608 if (startPos==0) {
609 startPos = fActiveStart;
610 }
612 if (fMatch) {
613 // Save the position of any previous successful match.
614 fLastMatchEnd = fMatchEnd;
616 if (fMatchStart == fMatchEnd) {
617 // Previous match had zero length. Move start position up one position
618 // to avoid sending find() into a loop on zero-length matches.
619 if (startPos >= fActiveLimit) {
620 fMatch = FALSE;
621 fHitEnd = TRUE;
622 return FALSE;
623 }
624 UTEXT_SETNATIVEINDEX(fInputText, startPos);
625 (void)UTEXT_NEXT32(fInputText);
626 startPos = UTEXT_GETNATIVEINDEX(fInputText);
627 }
628 } else {
629 if (fLastMatchEnd >= 0) {
630 // A previous find() failed to match. Don't try again.
631 // (without this test, a pattern with a zero-length match
632 // could match again at the end of an input string.)
633 fHitEnd = TRUE;
634 return FALSE;
635 }
636 }
639 // Compute the position in the input string beyond which a match can not begin, because
640 // the minimum length match would extend past the end of the input.
641 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int.
642 // Be aware of possible overflows if making changes here.
643 int64_t testStartLimit;
644 if (UTEXT_USES_U16(fInputText)) {
645 testStartLimit = fActiveLimit - fPattern->fMinMatchLen;
646 if (startPos > testStartLimit) {
647 fMatch = FALSE;
648 fHitEnd = TRUE;
649 return FALSE;
650 }
651 } else {
652 // For now, let the matcher discover that it can't match on its own
653 // We don't know how long the match len is in native characters
654 testStartLimit = fActiveLimit;
655 }
657 UChar32 c;
658 U_ASSERT(startPos >= 0);
660 switch (fPattern->fStartType) {
661 case START_NO_INFO:
662 // No optimization was found.
663 // Try a match at each input position.
664 for (;;) {
665 MatchAt(startPos, FALSE, fDeferredStatus);
666 if (U_FAILURE(fDeferredStatus)) {
667 return FALSE;
668 }
669 if (fMatch) {
670 return TRUE;
671 }
672 if (startPos >= testStartLimit) {
673 fHitEnd = TRUE;
674 return FALSE;
675 }
676 UTEXT_SETNATIVEINDEX(fInputText, startPos);
677 (void)UTEXT_NEXT32(fInputText);
678 startPos = UTEXT_GETNATIVEINDEX(fInputText);
679 // Note that it's perfectly OK for a pattern to have a zero-length
680 // match at the end of a string, so we must make sure that the loop
681 // runs with startPos == testStartLimit the last time through.
682 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
683 return FALSE;
684 }
685 U_ASSERT(FALSE);
687 case START_START:
688 // Matches are only possible at the start of the input string
689 // (pattern begins with ^ or \A)
690 if (startPos > fActiveStart) {
691 fMatch = FALSE;
692 return FALSE;
693 }
694 MatchAt(startPos, FALSE, fDeferredStatus);
695 if (U_FAILURE(fDeferredStatus)) {
696 return FALSE;
697 }
698 return fMatch;
701 case START_SET:
702 {
703 // Match may start on any char from a pre-computed set.
704 U_ASSERT(fPattern->fMinMatchLen > 0);
705 int64_t pos;
706 UTEXT_SETNATIVEINDEX(fInputText, startPos);
707 for (;;) {
708 c = UTEXT_NEXT32(fInputText);
709 pos = UTEXT_GETNATIVEINDEX(fInputText);
710 // c will be -1 (U_SENTINEL) at end of text, in which case we
711 // skip this next block (so we don't have a negative array index)
712 // and handle end of text in the following block.
713 if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c)) ||
714 (c>=256 && fPattern->fInitialChars->contains(c)))) {
715 MatchAt(startPos, FALSE, fDeferredStatus);
716 if (U_FAILURE(fDeferredStatus)) {
717 return FALSE;
718 }
719 if (fMatch) {
720 return TRUE;
721 }
722 UTEXT_SETNATIVEINDEX(fInputText, pos);
723 }
724 if (startPos >= testStartLimit) {
725 fMatch = FALSE;
726 fHitEnd = TRUE;
727 return FALSE;
728 }
729 startPos = pos;
730 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
731 return FALSE;
732 }
733 }
734 U_ASSERT(FALSE);
736 case START_STRING:
737 case START_CHAR:
738 {
739 // Match starts on exactly one char.
740 U_ASSERT(fPattern->fMinMatchLen > 0);
741 UChar32 theChar = fPattern->fInitialChar;
742 int64_t pos;
743 UTEXT_SETNATIVEINDEX(fInputText, startPos);
744 for (;;) {
745 c = UTEXT_NEXT32(fInputText);
746 pos = UTEXT_GETNATIVEINDEX(fInputText);
747 if (c == theChar) {
748 MatchAt(startPos, FALSE, fDeferredStatus);
749 if (U_FAILURE(fDeferredStatus)) {
750 return FALSE;
751 }
752 if (fMatch) {
753 return TRUE;
754 }
755 UTEXT_SETNATIVEINDEX(fInputText, pos);
756 }
757 if (startPos >= testStartLimit) {
758 fMatch = FALSE;
759 fHitEnd = TRUE;
760 return FALSE;
761 }
762 startPos = pos;
763 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
764 return FALSE;
765 }
766 }
767 U_ASSERT(FALSE);
769 case START_LINE:
770 {
771 UChar32 c;
772 if (startPos == fAnchorStart) {
773 MatchAt(startPos, FALSE, fDeferredStatus);
774 if (U_FAILURE(fDeferredStatus)) {
775 return FALSE;
776 }
777 if (fMatch) {
778 return TRUE;
779 }
780 UTEXT_SETNATIVEINDEX(fInputText, startPos);
781 c = UTEXT_NEXT32(fInputText);
782 startPos = UTEXT_GETNATIVEINDEX(fInputText);
783 } else {
784 UTEXT_SETNATIVEINDEX(fInputText, startPos);
785 c = UTEXT_PREVIOUS32(fInputText);
786 UTEXT_SETNATIVEINDEX(fInputText, startPos);
787 }
789 if (fPattern->fFlags & UREGEX_UNIX_LINES) {
790 for (;;) {
791 if (c == 0x0a) {
792 MatchAt(startPos, FALSE, fDeferredStatus);
793 if (U_FAILURE(fDeferredStatus)) {
794 return FALSE;
795 }
796 if (fMatch) {
797 return TRUE;
798 }
799 UTEXT_SETNATIVEINDEX(fInputText, startPos);
800 }
801 if (startPos >= testStartLimit) {
802 fMatch = FALSE;
803 fHitEnd = TRUE;
804 return FALSE;
805 }
806 c = UTEXT_NEXT32(fInputText);
807 startPos = UTEXT_GETNATIVEINDEX(fInputText);
808 // Note that it's perfectly OK for a pattern to have a zero-length
809 // match at the end of a string, so we must make sure that the loop
810 // runs with startPos == testStartLimit the last time through.
811 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
812 return FALSE;
813 }
814 } else {
815 for (;;) {
816 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
817 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) {
818 if (c == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
819 (void)UTEXT_NEXT32(fInputText);
820 startPos = UTEXT_GETNATIVEINDEX(fInputText);
821 }
822 MatchAt(startPos, FALSE, fDeferredStatus);
823 if (U_FAILURE(fDeferredStatus)) {
824 return FALSE;
825 }
826 if (fMatch) {
827 return TRUE;
828 }
829 UTEXT_SETNATIVEINDEX(fInputText, startPos);
830 }
831 if (startPos >= testStartLimit) {
832 fMatch = FALSE;
833 fHitEnd = TRUE;
834 return FALSE;
835 }
836 c = UTEXT_NEXT32(fInputText);
837 startPos = UTEXT_GETNATIVEINDEX(fInputText);
838 // Note that it's perfectly OK for a pattern to have a zero-length
839 // match at the end of a string, so we must make sure that the loop
840 // runs with startPos == testStartLimit the last time through.
841 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
842 return FALSE;
843 }
844 }
845 }
847 default:
848 U_ASSERT(FALSE);
849 }
851 U_ASSERT(FALSE);
852 return FALSE;
853 }
857 UBool RegexMatcher::find(int64_t start, UErrorCode &status) {
858 if (U_FAILURE(status)) {
859 return FALSE;
860 }
861 if (U_FAILURE(fDeferredStatus)) {
862 status = fDeferredStatus;
863 return FALSE;
864 }
865 this->reset(); // Note: Reset() is specified by Java Matcher documentation.
866 // This will reset the region to be the full input length.
867 if (start < 0) {
868 status = U_INDEX_OUTOFBOUNDS_ERROR;
869 return FALSE;
870 }
872 int64_t nativeStart = start;
873 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
874 status = U_INDEX_OUTOFBOUNDS_ERROR;
875 return FALSE;
876 }
877 fMatchEnd = nativeStart;
878 return find();
879 }
882 //--------------------------------------------------------------------------------
883 //
884 // findUsingChunk() -- like find(), but with the advance knowledge that the
885 // entire string is available in the UText's chunk buffer.
886 //
887 //--------------------------------------------------------------------------------
888 UBool RegexMatcher::findUsingChunk() {
889 // Start at the position of the last match end. (Will be zero if the
890 // matcher has been reset.
891 //
893 int32_t startPos = (int32_t)fMatchEnd;
894 if (startPos==0) {
895 startPos = (int32_t)fActiveStart;
896 }
898 const UChar *inputBuf = fInputText->chunkContents;
900 if (fMatch) {
901 // Save the position of any previous successful match.
902 fLastMatchEnd = fMatchEnd;
904 if (fMatchStart == fMatchEnd) {
905 // Previous match had zero length. Move start position up one position
906 // to avoid sending find() into a loop on zero-length matches.
907 if (startPos >= fActiveLimit) {
908 fMatch = FALSE;
909 fHitEnd = TRUE;
910 return FALSE;
911 }
912 U16_FWD_1(inputBuf, startPos, fInputLength);
913 }
914 } else {
915 if (fLastMatchEnd >= 0) {
916 // A previous find() failed to match. Don't try again.
917 // (without this test, a pattern with a zero-length match
918 // could match again at the end of an input string.)
919 fHitEnd = TRUE;
920 return FALSE;
921 }
922 }
925 // Compute the position in the input string beyond which a match can not begin, because
926 // the minimum length match would extend past the end of the input.
927 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int.
928 // Be aware of possible overflows if making changes here.
929 int32_t testLen = (int32_t)(fActiveLimit - fPattern->fMinMatchLen);
930 if (startPos > testLen) {
931 fMatch = FALSE;
932 fHitEnd = TRUE;
933 return FALSE;
934 }
936 UChar32 c;
937 U_ASSERT(startPos >= 0);
939 switch (fPattern->fStartType) {
940 case START_NO_INFO:
941 // No optimization was found.
942 // Try a match at each input position.
943 for (;;) {
944 MatchChunkAt(startPos, FALSE, fDeferredStatus);
945 if (U_FAILURE(fDeferredStatus)) {
946 return FALSE;
947 }
948 if (fMatch) {
949 return TRUE;
950 }
951 if (startPos >= testLen) {
952 fHitEnd = TRUE;
953 return FALSE;
954 }
955 U16_FWD_1(inputBuf, startPos, fActiveLimit);
956 // Note that it's perfectly OK for a pattern to have a zero-length
957 // match at the end of a string, so we must make sure that the loop
958 // runs with startPos == testLen the last time through.
959 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
960 return FALSE;
961 }
962 U_ASSERT(FALSE);
964 case START_START:
965 // Matches are only possible at the start of the input string
966 // (pattern begins with ^ or \A)
967 if (startPos > fActiveStart) {
968 fMatch = FALSE;
969 return FALSE;
970 }
971 MatchChunkAt(startPos, FALSE, fDeferredStatus);
972 if (U_FAILURE(fDeferredStatus)) {
973 return FALSE;
974 }
975 return fMatch;
978 case START_SET:
979 {
980 // Match may start on any char from a pre-computed set.
981 U_ASSERT(fPattern->fMinMatchLen > 0);
982 for (;;) {
983 int32_t pos = startPos;
984 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++];
985 if ((c<256 && fPattern->fInitialChars8->contains(c)) ||
986 (c>=256 && fPattern->fInitialChars->contains(c))) {
987 MatchChunkAt(pos, FALSE, fDeferredStatus);
988 if (U_FAILURE(fDeferredStatus)) {
989 return FALSE;
990 }
991 if (fMatch) {
992 return TRUE;
993 }
994 }
995 if (pos >= testLen) {
996 fMatch = FALSE;
997 fHitEnd = TRUE;
998 return FALSE;
999 }
1000 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
1001 return FALSE;
1002 }
1003 }
1004 U_ASSERT(FALSE);
1006 case START_STRING:
1007 case START_CHAR:
1008 {
1009 // Match starts on exactly one char.
1010 U_ASSERT(fPattern->fMinMatchLen > 0);
1011 UChar32 theChar = fPattern->fInitialChar;
1012 for (;;) {
1013 int32_t pos = startPos;
1014 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++];
1015 if (c == theChar) {
1016 MatchChunkAt(pos, FALSE, fDeferredStatus);
1017 if (U_FAILURE(fDeferredStatus)) {
1018 return FALSE;
1019 }
1020 if (fMatch) {
1021 return TRUE;
1022 }
1023 }
1024 if (pos >= testLen) {
1025 fMatch = FALSE;
1026 fHitEnd = TRUE;
1027 return FALSE;
1028 }
1029 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
1030 return FALSE;
1031 }
1032 }
1033 U_ASSERT(FALSE);
1035 case START_LINE:
1036 {
1037 UChar32 c;
1038 if (startPos == fAnchorStart) {
1039 MatchChunkAt(startPos, FALSE, fDeferredStatus);
1040 if (U_FAILURE(fDeferredStatus)) {
1041 return FALSE;
1042 }
1043 if (fMatch) {
1044 return TRUE;
1045 }
1046 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1047 }
1049 if (fPattern->fFlags & UREGEX_UNIX_LINES) {
1050 for (;;) {
1051 c = inputBuf[startPos-1];
1052 if (c == 0x0a) {
1053 MatchChunkAt(startPos, FALSE, fDeferredStatus);
1054 if (U_FAILURE(fDeferredStatus)) {
1055 return FALSE;
1056 }
1057 if (fMatch) {
1058 return TRUE;
1059 }
1060 }
1061 if (startPos >= testLen) {
1062 fMatch = FALSE;
1063 fHitEnd = TRUE;
1064 return FALSE;
1065 }
1066 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1067 // Note that it's perfectly OK for a pattern to have a zero-length
1068 // match at the end of a string, so we must make sure that the loop
1069 // runs with startPos == testLen the last time through.
1070 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
1071 return FALSE;
1072 }
1073 } else {
1074 for (;;) {
1075 c = inputBuf[startPos-1];
1076 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
1077 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) {
1078 if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) {
1079 startPos++;
1080 }
1081 MatchChunkAt(startPos, FALSE, fDeferredStatus);
1082 if (U_FAILURE(fDeferredStatus)) {
1083 return FALSE;
1084 }
1085 if (fMatch) {
1086 return TRUE;
1087 }
1088 }
1089 if (startPos >= testLen) {
1090 fMatch = FALSE;
1091 fHitEnd = TRUE;
1092 return FALSE;
1093 }
1094 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1095 // Note that it's perfectly OK for a pattern to have a zero-length
1096 // match at the end of a string, so we must make sure that the loop
1097 // runs with startPos == testLen the last time through.
1098 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
1099 return FALSE;
1100 }
1101 }
1102 }
1104 default:
1105 U_ASSERT(FALSE);
1106 }
1108 U_ASSERT(FALSE);
1109 return FALSE;
1110 }
1114 //--------------------------------------------------------------------------------
1115 //
1116 // group()
1117 //
1118 //--------------------------------------------------------------------------------
1119 UnicodeString RegexMatcher::group(UErrorCode &status) const {
1120 return group(0, status);
1121 }
1123 // Return immutable shallow clone
1124 UText *RegexMatcher::group(UText *dest, int64_t &group_len, UErrorCode &status) const {
1125 return group(0, dest, group_len, status);
1126 }
1128 // Return immutable shallow clone
1129 UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const {
1130 group_len = 0;
1131 UBool bailOut = FALSE;
1132 if (U_FAILURE(status)) {
1133 return dest;
1134 }
1135 if (U_FAILURE(fDeferredStatus)) {
1136 status = fDeferredStatus;
1137 bailOut = TRUE;
1138 }
1139 if (fMatch == FALSE) {
1140 status = U_REGEX_INVALID_STATE;
1141 bailOut = TRUE;
1142 }
1143 if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1144 status = U_INDEX_OUTOFBOUNDS_ERROR;
1145 bailOut = TRUE;
1146 }
1148 if (bailOut) {
1149 return (dest) ? dest : utext_openUChars(NULL, NULL, 0, &status);
1150 }
1152 int64_t s, e;
1153 if (groupNum == 0) {
1154 s = fMatchStart;
1155 e = fMatchEnd;
1156 } else {
1157 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1158 U_ASSERT(groupOffset < fPattern->fFrameSize);
1159 U_ASSERT(groupOffset >= 0);
1160 s = fFrame->fExtra[groupOffset];
1161 e = fFrame->fExtra[groupOffset+1];
1162 }
1164 if (s < 0) {
1165 // A capture group wasn't part of the match
1166 return utext_clone(dest, fInputText, FALSE, TRUE, &status);
1167 }
1168 U_ASSERT(s <= e);
1169 group_len = e - s;
1171 dest = utext_clone(dest, fInputText, FALSE, TRUE, &status);
1172 if (dest)
1173 UTEXT_SETNATIVEINDEX(dest, s);
1174 return dest;
1175 }
1177 UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
1178 UnicodeString result;
1179 if (U_FAILURE(status)) {
1180 return result;
1181 }
1182 UText resultText = UTEXT_INITIALIZER;
1183 utext_openUnicodeString(&resultText, &result, &status);
1184 group(groupNum, &resultText, status);
1185 utext_close(&resultText);
1186 return result;
1187 }
1190 // Return deep (mutable) clone
1191 // Technology Preview (as an API), but note that the UnicodeString API is implemented
1192 // using this function.
1193 UText *RegexMatcher::group(int32_t groupNum, UText *dest, UErrorCode &status) const {
1194 UBool bailOut = FALSE;
1195 if (U_FAILURE(status)) {
1196 return dest;
1197 }
1198 if (U_FAILURE(fDeferredStatus)) {
1199 status = fDeferredStatus;
1200 bailOut = TRUE;
1201 }
1203 if (fMatch == FALSE) {
1204 status = U_REGEX_INVALID_STATE;
1205 bailOut = TRUE;
1206 }
1207 if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1208 status = U_INDEX_OUTOFBOUNDS_ERROR;
1209 bailOut = TRUE;
1210 }
1212 if (bailOut) {
1213 if (dest) {
1214 utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
1215 return dest;
1216 } else {
1217 return utext_openUChars(NULL, NULL, 0, &status);
1218 }
1219 }
1221 int64_t s, e;
1222 if (groupNum == 0) {
1223 s = fMatchStart;
1224 e = fMatchEnd;
1225 } else {
1226 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1227 U_ASSERT(groupOffset < fPattern->fFrameSize);
1228 U_ASSERT(groupOffset >= 0);
1229 s = fFrame->fExtra[groupOffset];
1230 e = fFrame->fExtra[groupOffset+1];
1231 }
1233 if (s < 0) {
1234 // A capture group wasn't part of the match
1235 if (dest) {
1236 utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
1237 return dest;
1238 } else {
1239 return utext_openUChars(NULL, NULL, 0, &status);
1240 }
1241 }
1242 U_ASSERT(s <= e);
1244 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1245 U_ASSERT(e <= fInputLength);
1246 if (dest) {
1247 utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents+s, (int32_t)(e-s), &status);
1248 } else {
1249 UText groupText = UTEXT_INITIALIZER;
1250 utext_openUChars(&groupText, fInputText->chunkContents+s, e-s, &status);
1251 dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status);
1252 utext_close(&groupText);
1253 }
1254 } else {
1255 int32_t len16;
1256 if (UTEXT_USES_U16(fInputText)) {
1257 len16 = (int32_t)(e-s);
1258 } else {
1259 UErrorCode lengthStatus = U_ZERO_ERROR;
1260 len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
1261 }
1262 UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
1263 if (groupChars == NULL) {
1264 status = U_MEMORY_ALLOCATION_ERROR;
1265 return dest;
1266 }
1267 utext_extract(fInputText, s, e, groupChars, len16+1, &status);
1269 if (dest) {
1270 utext_replace(dest, 0, utext_nativeLength(dest), groupChars, len16, &status);
1271 } else {
1272 UText groupText = UTEXT_INITIALIZER;
1273 utext_openUChars(&groupText, groupChars, len16, &status);
1274 dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status);
1275 utext_close(&groupText);
1276 }
1278 uprv_free(groupChars);
1279 }
1280 return dest;
1281 }
1283 //--------------------------------------------------------------------------------
1284 //
1285 // appendGroup() -- currently internal only, appends a group to a UText rather
1286 // than replacing its contents
1287 //
1288 //--------------------------------------------------------------------------------
1290 int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const {
1291 if (U_FAILURE(status)) {
1292 return 0;
1293 }
1294 if (U_FAILURE(fDeferredStatus)) {
1295 status = fDeferredStatus;
1296 return 0;
1297 }
1298 int64_t destLen = utext_nativeLength(dest);
1300 if (fMatch == FALSE) {
1301 status = U_REGEX_INVALID_STATE;
1302 return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1303 }
1304 if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1305 status = U_INDEX_OUTOFBOUNDS_ERROR;
1306 return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1307 }
1309 int64_t s, e;
1310 if (groupNum == 0) {
1311 s = fMatchStart;
1312 e = fMatchEnd;
1313 } else {
1314 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1315 U_ASSERT(groupOffset < fPattern->fFrameSize);
1316 U_ASSERT(groupOffset >= 0);
1317 s = fFrame->fExtra[groupOffset];
1318 e = fFrame->fExtra[groupOffset+1];
1319 }
1321 if (s < 0) {
1322 // A capture group wasn't part of the match
1323 return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1324 }
1325 U_ASSERT(s <= e);
1327 int64_t deltaLen;
1328 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1329 U_ASSERT(e <= fInputLength);
1330 deltaLen = utext_replace(dest, destLen, destLen, fInputText->chunkContents+s, (int32_t)(e-s), &status);
1331 } else {
1332 int32_t len16;
1333 if (UTEXT_USES_U16(fInputText)) {
1334 len16 = (int32_t)(e-s);
1335 } else {
1336 UErrorCode lengthStatus = U_ZERO_ERROR;
1337 len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
1338 }
1339 UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
1340 if (groupChars == NULL) {
1341 status = U_MEMORY_ALLOCATION_ERROR;
1342 return 0;
1343 }
1344 utext_extract(fInputText, s, e, groupChars, len16+1, &status);
1346 deltaLen = utext_replace(dest, destLen, destLen, groupChars, len16, &status);
1347 uprv_free(groupChars);
1348 }
1349 return deltaLen;
1350 }
1354 //--------------------------------------------------------------------------------
1355 //
1356 // groupCount()
1357 //
1358 //--------------------------------------------------------------------------------
1359 int32_t RegexMatcher::groupCount() const {
1360 return fPattern->fGroupMap->size();
1361 }
1365 //--------------------------------------------------------------------------------
1366 //
1367 // hasAnchoringBounds()
1368 //
1369 //--------------------------------------------------------------------------------
1370 UBool RegexMatcher::hasAnchoringBounds() const {
1371 return fAnchoringBounds;
1372 }
1375 //--------------------------------------------------------------------------------
1376 //
1377 // hasTransparentBounds()
1378 //
1379 //--------------------------------------------------------------------------------
1380 UBool RegexMatcher::hasTransparentBounds() const {
1381 return fTransparentBounds;
1382 }
1386 //--------------------------------------------------------------------------------
1387 //
1388 // hitEnd()
1389 //
1390 //--------------------------------------------------------------------------------
1391 UBool RegexMatcher::hitEnd() const {
1392 return fHitEnd;
1393 }
1396 //--------------------------------------------------------------------------------
1397 //
1398 // input()
1399 //
1400 //--------------------------------------------------------------------------------
1401 const UnicodeString &RegexMatcher::input() const {
1402 if (!fInput) {
1403 UErrorCode status = U_ZERO_ERROR;
1404 int32_t len16;
1405 if (UTEXT_USES_U16(fInputText)) {
1406 len16 = (int32_t)fInputLength;
1407 } else {
1408 len16 = utext_extract(fInputText, 0, fInputLength, NULL, 0, &status);
1409 status = U_ZERO_ERROR; // overflow, length status
1410 }
1411 UnicodeString *result = new UnicodeString(len16, 0, 0);
1413 UChar *inputChars = result->getBuffer(len16);
1414 utext_extract(fInputText, 0, fInputLength, inputChars, len16, &status); // unterminated warning
1415 result->releaseBuffer(len16);
1417 (*(const UnicodeString **)&fInput) = result; // pointer assignment, rather than operator=
1418 }
1420 return *fInput;
1421 }
1423 //--------------------------------------------------------------------------------
1424 //
1425 // inputText()
1426 //
1427 //--------------------------------------------------------------------------------
1428 UText *RegexMatcher::inputText() const {
1429 return fInputText;
1430 }
1433 //--------------------------------------------------------------------------------
1434 //
1435 // getInput() -- like inputText(), but makes a clone or copies into another UText
1436 //
1437 //--------------------------------------------------------------------------------
1438 UText *RegexMatcher::getInput (UText *dest, UErrorCode &status) const {
1439 UBool bailOut = FALSE;
1440 if (U_FAILURE(status)) {
1441 return dest;
1442 }
1443 if (U_FAILURE(fDeferredStatus)) {
1444 status = fDeferredStatus;
1445 bailOut = TRUE;
1446 }
1448 if (bailOut) {
1449 if (dest) {
1450 utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
1451 return dest;
1452 } else {
1453 return utext_clone(NULL, fInputText, FALSE, TRUE, &status);
1454 }
1455 }
1457 if (dest) {
1458 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1459 utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents, (int32_t)fInputLength, &status);
1460 } else {
1461 int32_t input16Len;
1462 if (UTEXT_USES_U16(fInputText)) {
1463 input16Len = (int32_t)fInputLength;
1464 } else {
1465 UErrorCode lengthStatus = U_ZERO_ERROR;
1466 input16Len = utext_extract(fInputText, 0, fInputLength, NULL, 0, &lengthStatus); // buffer overflow error
1467 }
1468 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(input16Len));
1469 if (inputChars == NULL) {
1470 return dest;
1471 }
1473 status = U_ZERO_ERROR;
1474 utext_extract(fInputText, 0, fInputLength, inputChars, input16Len, &status); // not terminated warning
1475 status = U_ZERO_ERROR;
1476 utext_replace(dest, 0, utext_nativeLength(dest), inputChars, input16Len, &status);
1478 uprv_free(inputChars);
1479 }
1480 return dest;
1481 } else {
1482 return utext_clone(NULL, fInputText, FALSE, TRUE, &status);
1483 }
1484 }
1487 static UBool compat_SyncMutableUTextContents(UText *ut);
1488 static UBool compat_SyncMutableUTextContents(UText *ut) {
1489 UBool retVal = FALSE;
1491 // In the following test, we're really only interested in whether the UText should switch
1492 // between heap and stack allocation. If length hasn't changed, we won't, so the chunkContents
1493 // will still point to the correct data.
1494 if (utext_nativeLength(ut) != ut->nativeIndexingLimit) {
1495 UnicodeString *us=(UnicodeString *)ut->context;
1497 // Update to the latest length.
1498 // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit).
1499 int32_t newLength = us->length();
1501 // Update the chunk description.
1502 // The buffer may have switched between stack- and heap-based.
1503 ut->chunkContents = us->getBuffer();
1504 ut->chunkLength = newLength;
1505 ut->chunkNativeLimit = newLength;
1506 ut->nativeIndexingLimit = newLength;
1507 retVal = TRUE;
1508 }
1510 return retVal;
1511 }
1513 //--------------------------------------------------------------------------------
1514 //
1515 // lookingAt()
1516 //
1517 //--------------------------------------------------------------------------------
1518 UBool RegexMatcher::lookingAt(UErrorCode &status) {
1519 if (U_FAILURE(status)) {
1520 return FALSE;
1521 }
1522 if (U_FAILURE(fDeferredStatus)) {
1523 status = fDeferredStatus;
1524 return FALSE;
1525 }
1527 if (fInputUniStrMaybeMutable) {
1528 if (compat_SyncMutableUTextContents(fInputText)) {
1529 fInputLength = utext_nativeLength(fInputText);
1530 reset();
1531 }
1532 }
1533 else {
1534 resetPreserveRegion();
1535 }
1536 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1537 MatchChunkAt((int32_t)fActiveStart, FALSE, status);
1538 } else {
1539 MatchAt(fActiveStart, FALSE, status);
1540 }
1541 return fMatch;
1542 }
1545 UBool RegexMatcher::lookingAt(int64_t start, UErrorCode &status) {
1546 if (U_FAILURE(status)) {
1547 return FALSE;
1548 }
1549 if (U_FAILURE(fDeferredStatus)) {
1550 status = fDeferredStatus;
1551 return FALSE;
1552 }
1553 reset();
1555 if (start < 0) {
1556 status = U_INDEX_OUTOFBOUNDS_ERROR;
1557 return FALSE;
1558 }
1560 if (fInputUniStrMaybeMutable) {
1561 if (compat_SyncMutableUTextContents(fInputText)) {
1562 fInputLength = utext_nativeLength(fInputText);
1563 reset();
1564 }
1565 }
1567 int64_t nativeStart;
1568 nativeStart = start;
1569 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1570 status = U_INDEX_OUTOFBOUNDS_ERROR;
1571 return FALSE;
1572 }
1574 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1575 MatchChunkAt((int32_t)nativeStart, FALSE, status);
1576 } else {
1577 MatchAt(nativeStart, FALSE, status);
1578 }
1579 return fMatch;
1580 }
1584 //--------------------------------------------------------------------------------
1585 //
1586 // matches()
1587 //
1588 //--------------------------------------------------------------------------------
1589 UBool RegexMatcher::matches(UErrorCode &status) {
1590 if (U_FAILURE(status)) {
1591 return FALSE;
1592 }
1593 if (U_FAILURE(fDeferredStatus)) {
1594 status = fDeferredStatus;
1595 return FALSE;
1596 }
1598 if (fInputUniStrMaybeMutable) {
1599 if (compat_SyncMutableUTextContents(fInputText)) {
1600 fInputLength = utext_nativeLength(fInputText);
1601 reset();
1602 }
1603 }
1604 else {
1605 resetPreserveRegion();
1606 }
1608 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1609 MatchChunkAt((int32_t)fActiveStart, TRUE, status);
1610 } else {
1611 MatchAt(fActiveStart, TRUE, status);
1612 }
1613 return fMatch;
1614 }
1617 UBool RegexMatcher::matches(int64_t start, UErrorCode &status) {
1618 if (U_FAILURE(status)) {
1619 return FALSE;
1620 }
1621 if (U_FAILURE(fDeferredStatus)) {
1622 status = fDeferredStatus;
1623 return FALSE;
1624 }
1625 reset();
1627 if (start < 0) {
1628 status = U_INDEX_OUTOFBOUNDS_ERROR;
1629 return FALSE;
1630 }
1632 if (fInputUniStrMaybeMutable) {
1633 if (compat_SyncMutableUTextContents(fInputText)) {
1634 fInputLength = utext_nativeLength(fInputText);
1635 reset();
1636 }
1637 }
1639 int64_t nativeStart;
1640 nativeStart = start;
1641 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1642 status = U_INDEX_OUTOFBOUNDS_ERROR;
1643 return FALSE;
1644 }
1646 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1647 MatchChunkAt((int32_t)nativeStart, TRUE, status);
1648 } else {
1649 MatchAt(nativeStart, TRUE, status);
1650 }
1651 return fMatch;
1652 }
1656 //--------------------------------------------------------------------------------
1657 //
1658 // pattern
1659 //
1660 //--------------------------------------------------------------------------------
1661 const RegexPattern &RegexMatcher::pattern() const {
1662 return *fPattern;
1663 }
1667 //--------------------------------------------------------------------------------
1668 //
1669 // region
1670 //
1671 //--------------------------------------------------------------------------------
1672 RegexMatcher &RegexMatcher::region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status) {
1673 if (U_FAILURE(status)) {
1674 return *this;
1675 }
1677 if (regionStart>regionLimit || regionStart<0 || regionLimit<0) {
1678 status = U_ILLEGAL_ARGUMENT_ERROR;
1679 }
1681 int64_t nativeStart = regionStart;
1682 int64_t nativeLimit = regionLimit;
1683 if (nativeStart > fInputLength || nativeLimit > fInputLength) {
1684 status = U_ILLEGAL_ARGUMENT_ERROR;
1685 }
1687 if (startIndex == -1)
1688 this->reset();
1689 else
1690 resetPreserveRegion();
1692 fRegionStart = nativeStart;
1693 fRegionLimit = nativeLimit;
1694 fActiveStart = nativeStart;
1695 fActiveLimit = nativeLimit;
1697 if (startIndex != -1) {
1698 if (startIndex < fActiveStart || startIndex > fActiveLimit) {
1699 status = U_INDEX_OUTOFBOUNDS_ERROR;
1700 }
1701 fMatchEnd = startIndex;
1702 }
1704 if (!fTransparentBounds) {
1705 fLookStart = nativeStart;
1706 fLookLimit = nativeLimit;
1707 }
1708 if (fAnchoringBounds) {
1709 fAnchorStart = nativeStart;
1710 fAnchorLimit = nativeLimit;
1711 }
1712 return *this;
1713 }
1715 RegexMatcher &RegexMatcher::region(int64_t start, int64_t limit, UErrorCode &status) {
1716 return region(start, limit, -1, status);
1717 }
1719 //--------------------------------------------------------------------------------
1720 //
1721 // regionEnd
1722 //
1723 //--------------------------------------------------------------------------------
1724 int32_t RegexMatcher::regionEnd() const {
1725 return (int32_t)fRegionLimit;
1726 }
1728 int64_t RegexMatcher::regionEnd64() const {
1729 return fRegionLimit;
1730 }
1732 //--------------------------------------------------------------------------------
1733 //
1734 // regionStart
1735 //
1736 //--------------------------------------------------------------------------------
1737 int32_t RegexMatcher::regionStart() const {
1738 return (int32_t)fRegionStart;
1739 }
1741 int64_t RegexMatcher::regionStart64() const {
1742 return fRegionStart;
1743 }
1746 //--------------------------------------------------------------------------------
1747 //
1748 // replaceAll
1749 //
1750 //--------------------------------------------------------------------------------
1751 UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) {
1752 UText replacementText = UTEXT_INITIALIZER;
1753 UText resultText = UTEXT_INITIALIZER;
1754 UnicodeString resultString;
1755 if (U_FAILURE(status)) {
1756 return resultString;
1757 }
1759 utext_openConstUnicodeString(&replacementText, &replacement, &status);
1760 utext_openUnicodeString(&resultText, &resultString, &status);
1762 replaceAll(&replacementText, &resultText, status);
1764 utext_close(&resultText);
1765 utext_close(&replacementText);
1767 return resultString;
1768 }
1771 //
1772 // replaceAll, UText mode
1773 //
1774 UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &status) {
1775 if (U_FAILURE(status)) {
1776 return dest;
1777 }
1778 if (U_FAILURE(fDeferredStatus)) {
1779 status = fDeferredStatus;
1780 return dest;
1781 }
1783 if (dest == NULL) {
1784 UnicodeString emptyString;
1785 UText empty = UTEXT_INITIALIZER;
1787 utext_openUnicodeString(&empty, &emptyString, &status);
1788 dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
1789 utext_close(&empty);
1790 }
1792 if (U_SUCCESS(status)) {
1793 reset();
1794 while (find()) {
1795 appendReplacement(dest, replacement, status);
1796 if (U_FAILURE(status)) {
1797 break;
1798 }
1799 }
1800 appendTail(dest, status);
1801 }
1803 return dest;
1804 }
1807 //--------------------------------------------------------------------------------
1808 //
1809 // replaceFirst
1810 //
1811 //--------------------------------------------------------------------------------
1812 UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) {
1813 UText replacementText = UTEXT_INITIALIZER;
1814 UText resultText = UTEXT_INITIALIZER;
1815 UnicodeString resultString;
1817 utext_openConstUnicodeString(&replacementText, &replacement, &status);
1818 utext_openUnicodeString(&resultText, &resultString, &status);
1820 replaceFirst(&replacementText, &resultText, status);
1822 utext_close(&resultText);
1823 utext_close(&replacementText);
1825 return resultString;
1826 }
1828 //
1829 // replaceFirst, UText mode
1830 //
1831 UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &status) {
1832 if (U_FAILURE(status)) {
1833 return dest;
1834 }
1835 if (U_FAILURE(fDeferredStatus)) {
1836 status = fDeferredStatus;
1837 return dest;
1838 }
1840 reset();
1841 if (!find()) {
1842 return getInput(dest, status);
1843 }
1845 if (dest == NULL) {
1846 UnicodeString emptyString;
1847 UText empty = UTEXT_INITIALIZER;
1849 utext_openUnicodeString(&empty, &emptyString, &status);
1850 dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
1851 utext_close(&empty);
1852 }
1854 appendReplacement(dest, replacement, status);
1855 appendTail(dest, status);
1857 return dest;
1858 }
1861 //--------------------------------------------------------------------------------
1862 //
1863 // requireEnd
1864 //
1865 //--------------------------------------------------------------------------------
1866 UBool RegexMatcher::requireEnd() const {
1867 return fRequireEnd;
1868 }
1871 //--------------------------------------------------------------------------------
1872 //
1873 // reset
1874 //
1875 //--------------------------------------------------------------------------------
1876 RegexMatcher &RegexMatcher::reset() {
1877 fRegionStart = 0;
1878 fRegionLimit = fInputLength;
1879 fActiveStart = 0;
1880 fActiveLimit = fInputLength;
1881 fAnchorStart = 0;
1882 fAnchorLimit = fInputLength;
1883 fLookStart = 0;
1884 fLookLimit = fInputLength;
1885 resetPreserveRegion();
1886 return *this;
1887 }
1891 void RegexMatcher::resetPreserveRegion() {
1892 fMatchStart = 0;
1893 fMatchEnd = 0;
1894 fLastMatchEnd = -1;
1895 fAppendPosition = 0;
1896 fMatch = FALSE;
1897 fHitEnd = FALSE;
1898 fRequireEnd = FALSE;
1899 fTime = 0;
1900 fTickCounter = TIMER_INITIAL_VALUE;
1901 //resetStack(); // more expensive than it looks...
1902 }
1905 RegexMatcher &RegexMatcher::reset(const UnicodeString &input) {
1906 fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStatus);
1907 if (fPattern->fNeedsAltInput) {
1908 fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
1909 }
1910 fInputLength = utext_nativeLength(fInputText);
1912 reset();
1913 delete fInput;
1914 fInput = NULL;
1916 // Do the following for any UnicodeString.
1917 // This is for compatibility for those clients who modify the input string "live" during regex operations.
1918 fInputUniStrMaybeMutable = TRUE;
1920 if (fWordBreakItr != NULL) {
1921 #if UCONFIG_NO_BREAK_ITERATION==0
1922 UErrorCode status = U_ZERO_ERROR;
1923 fWordBreakItr->setText(fInputText, status);
1924 #endif
1925 }
1926 return *this;
1927 }
1930 RegexMatcher &RegexMatcher::reset(UText *input) {
1931 if (fInputText != input) {
1932 fInputText = utext_clone(fInputText, input, FALSE, TRUE, &fDeferredStatus);
1933 if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
1934 fInputLength = utext_nativeLength(fInputText);
1936 delete fInput;
1937 fInput = NULL;
1939 if (fWordBreakItr != NULL) {
1940 #if UCONFIG_NO_BREAK_ITERATION==0
1941 UErrorCode status = U_ZERO_ERROR;
1942 fWordBreakItr->setText(input, status);
1943 #endif
1944 }
1945 }
1946 reset();
1947 fInputUniStrMaybeMutable = FALSE;
1949 return *this;
1950 }
1952 /*RegexMatcher &RegexMatcher::reset(const UChar *) {
1953 fDeferredStatus = U_INTERNAL_PROGRAM_ERROR;
1954 return *this;
1955 }*/
1957 RegexMatcher &RegexMatcher::reset(int64_t position, UErrorCode &status) {
1958 if (U_FAILURE(status)) {
1959 return *this;
1960 }
1961 reset(); // Reset also resets the region to be the entire string.
1963 if (position < 0 || position > fActiveLimit) {
1964 status = U_INDEX_OUTOFBOUNDS_ERROR;
1965 return *this;
1966 }
1967 fMatchEnd = position;
1968 return *this;
1969 }
1972 //--------------------------------------------------------------------------------
1973 //
1974 // refresh
1975 //
1976 //--------------------------------------------------------------------------------
1977 RegexMatcher &RegexMatcher::refreshInputText(UText *input, UErrorCode &status) {
1978 if (U_FAILURE(status)) {
1979 return *this;
1980 }
1981 if (input == NULL) {
1982 status = U_ILLEGAL_ARGUMENT_ERROR;
1983 return *this;
1984 }
1985 if (utext_nativeLength(fInputText) != utext_nativeLength(input)) {
1986 status = U_ILLEGAL_ARGUMENT_ERROR;
1987 return *this;
1988 }
1989 int64_t pos = utext_getNativeIndex(fInputText);
1990 // Shallow read-only clone of the new UText into the existing input UText
1991 fInputText = utext_clone(fInputText, input, FALSE, TRUE, &status);
1992 if (U_FAILURE(status)) {
1993 return *this;
1994 }
1995 utext_setNativeIndex(fInputText, pos);
1997 if (fAltInputText != NULL) {
1998 pos = utext_getNativeIndex(fAltInputText);
1999 fAltInputText = utext_clone(fAltInputText, input, FALSE, TRUE, &status);
2000 if (U_FAILURE(status)) {
2001 return *this;
2002 }
2003 utext_setNativeIndex(fAltInputText, pos);
2004 }
2005 return *this;
2006 }
2010 //--------------------------------------------------------------------------------
2011 //
2012 // setTrace
2013 //
2014 //--------------------------------------------------------------------------------
2015 void RegexMatcher::setTrace(UBool state) {
2016 fTraceDebug = state;
2017 }
2021 //---------------------------------------------------------------------
2022 //
2023 // split
2024 //
2025 //---------------------------------------------------------------------
2026 int32_t RegexMatcher::split(const UnicodeString &input,
2027 UnicodeString dest[],
2028 int32_t destCapacity,
2029 UErrorCode &status)
2030 {
2031 UText inputText = UTEXT_INITIALIZER;
2032 utext_openConstUnicodeString(&inputText, &input, &status);
2033 if (U_FAILURE(status)) {
2034 return 0;
2035 }
2037 UText **destText = (UText **)uprv_malloc(sizeof(UText*)*destCapacity);
2038 if (destText == NULL) {
2039 status = U_MEMORY_ALLOCATION_ERROR;
2040 return 0;
2041 }
2042 int32_t i;
2043 for (i = 0; i < destCapacity; i++) {
2044 destText[i] = utext_openUnicodeString(NULL, &dest[i], &status);
2045 }
2047 int32_t fieldCount = split(&inputText, destText, destCapacity, status);
2049 for (i = 0; i < destCapacity; i++) {
2050 utext_close(destText[i]);
2051 }
2053 uprv_free(destText);
2054 utext_close(&inputText);
2055 return fieldCount;
2056 }
2058 //
2059 // split, UText mode
2060 //
2061 int32_t RegexMatcher::split(UText *input,
2062 UText *dest[],
2063 int32_t destCapacity,
2064 UErrorCode &status)
2065 {
2066 //
2067 // Check arguements for validity
2068 //
2069 if (U_FAILURE(status)) {
2070 return 0;
2071 };
2073 if (destCapacity < 1) {
2074 status = U_ILLEGAL_ARGUMENT_ERROR;
2075 return 0;
2076 }
2078 //
2079 // Reset for the input text
2080 //
2081 reset(input);
2082 int64_t nextOutputStringStart = 0;
2083 if (fActiveLimit == 0) {
2084 return 0;
2085 }
2087 //
2088 // Loop through the input text, searching for the delimiter pattern
2089 //
2090 int32_t i;
2091 int32_t numCaptureGroups = fPattern->fGroupMap->size();
2092 for (i=0; ; i++) {
2093 if (i>=destCapacity-1) {
2094 // There is one or zero output string left.
2095 // Fill the last output string with whatever is left from the input, then exit the loop.
2096 // ( i will be == destCapacity if we filled the output array while processing
2097 // capture groups of the delimiter expression, in which case we will discard the
2098 // last capture group saved in favor of the unprocessed remainder of the
2099 // input string.)
2100 i = destCapacity-1;
2101 if (fActiveLimit > nextOutputStringStart) {
2102 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2103 if (dest[i]) {
2104 utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2105 input->chunkContents+nextOutputStringStart,
2106 (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2107 } else {
2108 UText remainingText = UTEXT_INITIALIZER;
2109 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2110 fActiveLimit-nextOutputStringStart, &status);
2111 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2112 utext_close(&remainingText);
2113 }
2114 } else {
2115 UErrorCode lengthStatus = U_ZERO_ERROR;
2116 int32_t remaining16Length =
2117 utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
2118 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2119 if (remainingChars == NULL) {
2120 status = U_MEMORY_ALLOCATION_ERROR;
2121 break;
2122 }
2124 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2125 if (dest[i]) {
2126 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2127 } else {
2128 UText remainingText = UTEXT_INITIALIZER;
2129 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2130 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2131 utext_close(&remainingText);
2132 }
2134 uprv_free(remainingChars);
2135 }
2136 }
2137 break;
2138 }
2139 if (find()) {
2140 // We found another delimiter. Move everything from where we started looking
2141 // up until the start of the delimiter into the next output string.
2142 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2143 if (dest[i]) {
2144 utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2145 input->chunkContents+nextOutputStringStart,
2146 (int32_t)(fMatchStart-nextOutputStringStart), &status);
2147 } else {
2148 UText remainingText = UTEXT_INITIALIZER;
2149 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2150 fMatchStart-nextOutputStringStart, &status);
2151 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2152 utext_close(&remainingText);
2153 }
2154 } else {
2155 UErrorCode lengthStatus = U_ZERO_ERROR;
2156 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fMatchStart, NULL, 0, &lengthStatus);
2157 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2158 if (remainingChars == NULL) {
2159 status = U_MEMORY_ALLOCATION_ERROR;
2160 break;
2161 }
2162 utext_extract(input, nextOutputStringStart, fMatchStart, remainingChars, remaining16Length+1, &status);
2163 if (dest[i]) {
2164 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2165 } else {
2166 UText remainingText = UTEXT_INITIALIZER;
2167 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2168 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2169 utext_close(&remainingText);
2170 }
2172 uprv_free(remainingChars);
2173 }
2174 nextOutputStringStart = fMatchEnd;
2176 // If the delimiter pattern has capturing parentheses, the captured
2177 // text goes out into the next n destination strings.
2178 int32_t groupNum;
2179 for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
2180 if (i >= destCapacity-2) {
2181 // Never fill the last available output string with capture group text.
2182 // It will filled with the last field, the remainder of the
2183 // unsplit input text.
2184 break;
2185 }
2186 i++;
2187 dest[i] = group(groupNum, dest[i], status);
2188 }
2190 if (nextOutputStringStart == fActiveLimit) {
2191 // The delimiter was at the end of the string. We're done, but first
2192 // we output one last empty string, for the empty field following
2193 // the delimiter at the end of input.
2194 if (i+1 < destCapacity) {
2195 ++i;
2196 if (dest[i] == NULL) {
2197 dest[i] = utext_openUChars(NULL, NULL, 0, &status);
2198 } else {
2199 static UChar emptyString[] = {(UChar)0};
2200 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), emptyString, 0, &status);
2201 }
2202 }
2203 break;
2205 }
2206 }
2207 else
2208 {
2209 // We ran off the end of the input while looking for the next delimiter.
2210 // All the remaining text goes into the current output string.
2211 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2212 if (dest[i]) {
2213 utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2214 input->chunkContents+nextOutputStringStart,
2215 (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2216 } else {
2217 UText remainingText = UTEXT_INITIALIZER;
2218 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2219 fActiveLimit-nextOutputStringStart, &status);
2220 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2221 utext_close(&remainingText);
2222 }
2223 } else {
2224 UErrorCode lengthStatus = U_ZERO_ERROR;
2225 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
2226 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2227 if (remainingChars == NULL) {
2228 status = U_MEMORY_ALLOCATION_ERROR;
2229 break;
2230 }
2232 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2233 if (dest[i]) {
2234 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2235 } else {
2236 UText remainingText = UTEXT_INITIALIZER;
2237 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2238 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2239 utext_close(&remainingText);
2240 }
2242 uprv_free(remainingChars);
2243 }
2244 break;
2245 }
2246 if (U_FAILURE(status)) {
2247 break;
2248 }
2249 } // end of for loop
2250 return i+1;
2251 }
2254 //--------------------------------------------------------------------------------
2255 //
2256 // start
2257 //
2258 //--------------------------------------------------------------------------------
2259 int32_t RegexMatcher::start(UErrorCode &status) const {
2260 return start(0, status);
2261 }
2263 int64_t RegexMatcher::start64(UErrorCode &status) const {
2264 return start64(0, status);
2265 }
2267 //--------------------------------------------------------------------------------
2268 //
2269 // start(int32_t group, UErrorCode &status)
2270 //
2271 //--------------------------------------------------------------------------------
2273 int64_t RegexMatcher::start64(int32_t group, UErrorCode &status) const {
2274 if (U_FAILURE(status)) {
2275 return -1;
2276 }
2277 if (U_FAILURE(fDeferredStatus)) {
2278 status = fDeferredStatus;
2279 return -1;
2280 }
2281 if (fMatch == FALSE) {
2282 status = U_REGEX_INVALID_STATE;
2283 return -1;
2284 }
2285 if (group < 0 || group > fPattern->fGroupMap->size()) {
2286 status = U_INDEX_OUTOFBOUNDS_ERROR;
2287 return -1;
2288 }
2289 int64_t s;
2290 if (group == 0) {
2291 s = fMatchStart;
2292 } else {
2293 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
2294 U_ASSERT(groupOffset < fPattern->fFrameSize);
2295 U_ASSERT(groupOffset >= 0);
2296 s = fFrame->fExtra[groupOffset];
2297 }
2299 return s;
2300 }
2303 int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const {
2304 return (int32_t)start64(group, status);
2305 }
2307 //--------------------------------------------------------------------------------
2308 //
2309 // useAnchoringBounds
2310 //
2311 //--------------------------------------------------------------------------------
2312 RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) {
2313 fAnchoringBounds = b;
2314 fAnchorStart = (fAnchoringBounds ? fRegionStart : 0);
2315 fAnchorLimit = (fAnchoringBounds ? fRegionLimit : fInputLength);
2316 return *this;
2317 }
2320 //--------------------------------------------------------------------------------
2321 //
2322 // useTransparentBounds
2323 //
2324 //--------------------------------------------------------------------------------
2325 RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) {
2326 fTransparentBounds = b;
2327 fLookStart = (fTransparentBounds ? 0 : fRegionStart);
2328 fLookLimit = (fTransparentBounds ? fInputLength : fRegionLimit);
2329 return *this;
2330 }
2332 //--------------------------------------------------------------------------------
2333 //
2334 // setTimeLimit
2335 //
2336 //--------------------------------------------------------------------------------
2337 void RegexMatcher::setTimeLimit(int32_t limit, UErrorCode &status) {
2338 if (U_FAILURE(status)) {
2339 return;
2340 }
2341 if (U_FAILURE(fDeferredStatus)) {
2342 status = fDeferredStatus;
2343 return;
2344 }
2345 if (limit < 0) {
2346 status = U_ILLEGAL_ARGUMENT_ERROR;
2347 return;
2348 }
2349 fTimeLimit = limit;
2350 }
2353 //--------------------------------------------------------------------------------
2354 //
2355 // getTimeLimit
2356 //
2357 //--------------------------------------------------------------------------------
2358 int32_t RegexMatcher::getTimeLimit() const {
2359 return fTimeLimit;
2360 }
2363 //--------------------------------------------------------------------------------
2364 //
2365 // setStackLimit
2366 //
2367 //--------------------------------------------------------------------------------
2368 void RegexMatcher::setStackLimit(int32_t limit, UErrorCode &status) {
2369 if (U_FAILURE(status)) {
2370 return;
2371 }
2372 if (U_FAILURE(fDeferredStatus)) {
2373 status = fDeferredStatus;
2374 return;
2375 }
2376 if (limit < 0) {
2377 status = U_ILLEGAL_ARGUMENT_ERROR;
2378 return;
2379 }
2381 // Reset the matcher. This is needed here in case there is a current match
2382 // whose final stack frame (containing the match results, pointed to by fFrame)
2383 // would be lost by resizing to a smaller stack size.
2384 reset();
2386 if (limit == 0) {
2387 // Unlimited stack expansion
2388 fStack->setMaxCapacity(0);
2389 } else {
2390 // Change the units of the limit from bytes to ints, and bump the size up
2391 // to be big enough to hold at least one stack frame for the pattern,
2392 // if it isn't there already.
2393 int32_t adjustedLimit = limit / sizeof(int32_t);
2394 if (adjustedLimit < fPattern->fFrameSize) {
2395 adjustedLimit = fPattern->fFrameSize;
2396 }
2397 fStack->setMaxCapacity(adjustedLimit);
2398 }
2399 fStackLimit = limit;
2400 }
2403 //--------------------------------------------------------------------------------
2404 //
2405 // getStackLimit
2406 //
2407 //--------------------------------------------------------------------------------
2408 int32_t RegexMatcher::getStackLimit() const {
2409 return fStackLimit;
2410 }
2413 //--------------------------------------------------------------------------------
2414 //
2415 // setMatchCallback
2416 //
2417 //--------------------------------------------------------------------------------
2418 void RegexMatcher::setMatchCallback(URegexMatchCallback *callback,
2419 const void *context,
2420 UErrorCode &status) {
2421 if (U_FAILURE(status)) {
2422 return;
2423 }
2424 fCallbackFn = callback;
2425 fCallbackContext = context;
2426 }
2429 //--------------------------------------------------------------------------------
2430 //
2431 // getMatchCallback
2432 //
2433 //--------------------------------------------------------------------------------
2434 void RegexMatcher::getMatchCallback(URegexMatchCallback *&callback,
2435 const void *&context,
2436 UErrorCode &status) {
2437 if (U_FAILURE(status)) {
2438 return;
2439 }
2440 callback = fCallbackFn;
2441 context = fCallbackContext;
2442 }
2445 //--------------------------------------------------------------------------------
2446 //
2447 // setMatchCallback
2448 //
2449 //--------------------------------------------------------------------------------
2450 void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback *callback,
2451 const void *context,
2452 UErrorCode &status) {
2453 if (U_FAILURE(status)) {
2454 return;
2455 }
2456 fFindProgressCallbackFn = callback;
2457 fFindProgressCallbackContext = context;
2458 }
2461 //--------------------------------------------------------------------------------
2462 //
2463 // getMatchCallback
2464 //
2465 //--------------------------------------------------------------------------------
2466 void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback *&callback,
2467 const void *&context,
2468 UErrorCode &status) {
2469 if (U_FAILURE(status)) {
2470 return;
2471 }
2472 callback = fFindProgressCallbackFn;
2473 context = fFindProgressCallbackContext;
2474 }
2477 //================================================================================
2478 //
2479 // Code following this point in this file is the internal
2480 // Match Engine Implementation.
2481 //
2482 //================================================================================
2485 //--------------------------------------------------------------------------------
2486 //
2487 // resetStack
2488 // Discard any previous contents of the state save stack, and initialize a
2489 // new stack frame to all -1. The -1s are needed for capture group limits,
2490 // where they indicate that a group has not yet matched anything.
2491 //--------------------------------------------------------------------------------
2492 REStackFrame *RegexMatcher::resetStack() {
2493 // Discard any previous contents of the state save stack, and initialize a
2494 // new stack frame with all -1 data. The -1s are needed for capture group limits,
2495 // where they indicate that a group has not yet matched anything.
2496 fStack->removeAllElements();
2498 REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus);
2499 int32_t i;
2500 for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) {
2501 iFrame->fExtra[i] = -1;
2502 }
2503 return iFrame;
2504 }
2508 //--------------------------------------------------------------------------------
2509 //
2510 // isWordBoundary
2511 // in perl, "xab..cd..", \b is true at positions 0,3,5,7
2512 // For us,
2513 // If the current char is a combining mark,
2514 // \b is FALSE.
2515 // Else Scan backwards to the first non-combining char.
2516 // We are at a boundary if the this char and the original chars are
2517 // opposite in membership in \w set
2518 //
2519 // parameters: pos - the current position in the input buffer
2520 //
2521 // TODO: double-check edge cases at region boundaries.
2522 //
2523 //--------------------------------------------------------------------------------
2524 UBool RegexMatcher::isWordBoundary(int64_t pos) {
2525 UBool isBoundary = FALSE;
2526 UBool cIsWord = FALSE;
2528 if (pos >= fLookLimit) {
2529 fHitEnd = TRUE;
2530 } else {
2531 // Determine whether char c at current position is a member of the word set of chars.
2532 // If we're off the end of the string, behave as though we're not at a word char.
2533 UTEXT_SETNATIVEINDEX(fInputText, pos);
2534 UChar32 c = UTEXT_CURRENT32(fInputText);
2535 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
2536 // Current char is a combining one. Not a boundary.
2537 return FALSE;
2538 }
2539 cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
2540 }
2542 // Back up until we come to a non-combining char, determine whether
2543 // that char is a word char.
2544 UBool prevCIsWord = FALSE;
2545 for (;;) {
2546 if (UTEXT_GETNATIVEINDEX(fInputText) <= fLookStart) {
2547 break;
2548 }
2549 UChar32 prevChar = UTEXT_PREVIOUS32(fInputText);
2550 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2551 || u_charType(prevChar) == U_FORMAT_CHAR)) {
2552 prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
2553 break;
2554 }
2555 }
2556 isBoundary = cIsWord ^ prevCIsWord;
2557 return isBoundary;
2558 }
2560 UBool RegexMatcher::isChunkWordBoundary(int32_t pos) {
2561 UBool isBoundary = FALSE;
2562 UBool cIsWord = FALSE;
2564 const UChar *inputBuf = fInputText->chunkContents;
2566 if (pos >= fLookLimit) {
2567 fHitEnd = TRUE;
2568 } else {
2569 // Determine whether char c at current position is a member of the word set of chars.
2570 // If we're off the end of the string, behave as though we're not at a word char.
2571 UChar32 c;
2572 U16_GET(inputBuf, fLookStart, pos, fLookLimit, c);
2573 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
2574 // Current char is a combining one. Not a boundary.
2575 return FALSE;
2576 }
2577 cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
2578 }
2580 // Back up until we come to a non-combining char, determine whether
2581 // that char is a word char.
2582 UBool prevCIsWord = FALSE;
2583 for (;;) {
2584 if (pos <= fLookStart) {
2585 break;
2586 }
2587 UChar32 prevChar;
2588 U16_PREV(inputBuf, fLookStart, pos, prevChar);
2589 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2590 || u_charType(prevChar) == U_FORMAT_CHAR)) {
2591 prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
2592 break;
2593 }
2594 }
2595 isBoundary = cIsWord ^ prevCIsWord;
2596 return isBoundary;
2597 }
2599 //--------------------------------------------------------------------------------
2600 //
2601 // isUWordBoundary
2602 //
2603 // Test for a word boundary using RBBI word break.
2604 //
2605 // parameters: pos - the current position in the input buffer
2606 //
2607 //--------------------------------------------------------------------------------
2608 UBool RegexMatcher::isUWordBoundary(int64_t pos) {
2609 UBool returnVal = FALSE;
2610 #if UCONFIG_NO_BREAK_ITERATION==0
2612 // If we haven't yet created a break iterator for this matcher, do it now.
2613 if (fWordBreakItr == NULL) {
2614 fWordBreakItr =
2615 (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), fDeferredStatus);
2616 if (U_FAILURE(fDeferredStatus)) {
2617 return FALSE;
2618 }
2619 fWordBreakItr->setText(fInputText, fDeferredStatus);
2620 }
2622 if (pos >= fLookLimit) {
2623 fHitEnd = TRUE;
2624 returnVal = TRUE; // With Unicode word rules, only positions within the interior of "real"
2625 // words are not boundaries. All non-word chars stand by themselves,
2626 // with word boundaries on both sides.
2627 } else {
2628 if (!UTEXT_USES_U16(fInputText)) {
2629 // !!!: Would like a better way to do this!
2630 UErrorCode status = U_ZERO_ERROR;
2631 pos = utext_extract(fInputText, 0, pos, NULL, 0, &status);
2632 }
2633 returnVal = fWordBreakItr->isBoundary((int32_t)pos);
2634 }
2635 #endif
2636 return returnVal;
2637 }
2639 //--------------------------------------------------------------------------------
2640 //
2641 // IncrementTime This function is called once each TIMER_INITIAL_VALUE state
2642 // saves. Increment the "time" counter, and call the
2643 // user callback function if there is one installed.
2644 //
2645 // If the match operation needs to be aborted, either for a time-out
2646 // or because the user callback asked for it, just set an error status.
2647 // The engine will pick that up and stop in its outer loop.
2648 //
2649 //--------------------------------------------------------------------------------
2650 void RegexMatcher::IncrementTime(UErrorCode &status) {
2651 fTickCounter = TIMER_INITIAL_VALUE;
2652 fTime++;
2653 if (fCallbackFn != NULL) {
2654 if ((*fCallbackFn)(fCallbackContext, fTime) == FALSE) {
2655 status = U_REGEX_STOPPED_BY_CALLER;
2656 return;
2657 }
2658 }
2659 if (fTimeLimit > 0 && fTime >= fTimeLimit) {
2660 status = U_REGEX_TIME_OUT;
2661 }
2662 }
2664 //--------------------------------------------------------------------------------
2665 //
2666 // ReportFindProgress This function is called once for each advance in the target
2667 // string from the find() function, and calls the user progress callback
2668 // function if there is one installed.
2669 //
2670 // NOTE:
2671 //
2672 // If the match operation needs to be aborted because the user
2673 // callback asked for it, just set an error status.
2674 // The engine will pick that up and stop in its outer loop.
2675 //
2676 //--------------------------------------------------------------------------------
2677 UBool RegexMatcher::ReportFindProgress(int64_t matchIndex, UErrorCode &status) {
2678 if (fFindProgressCallbackFn != NULL) {
2679 if ((*fFindProgressCallbackFn)(fFindProgressCallbackContext, matchIndex) == FALSE) {
2680 status = U_ZERO_ERROR /*U_REGEX_STOPPED_BY_CALLER*/;
2681 return FALSE;
2682 }
2683 }
2684 return TRUE;
2685 }
2687 //--------------------------------------------------------------------------------
2688 //
2689 // StateSave
2690 // Make a new stack frame, initialized as a copy of the current stack frame.
2691 // Set the pattern index in the original stack frame from the operand value
2692 // in the opcode. Execution of the engine continues with the state in
2693 // the newly created stack frame
2694 //
2695 // Note that reserveBlock() may grow the stack, resulting in the
2696 // whole thing being relocated in memory.
2697 //
2698 // Parameters:
2699 // fp The top frame pointer when called. At return, a new
2700 // fame will be present
2701 // savePatIdx An index into the compiled pattern. Goes into the original
2702 // (not new) frame. If execution ever back-tracks out of the
2703 // new frame, this will be where we continue from in the pattern.
2704 // Return
2705 // The new frame pointer.
2706 //
2707 //--------------------------------------------------------------------------------
2708 inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status) {
2709 // push storage for a new frame.
2710 int64_t *newFP = fStack->reserveBlock(fFrameSize, status);
2711 if (newFP == NULL) {
2712 // Failure on attempted stack expansion.
2713 // Stack function set some other error code, change it to a more
2714 // specific one for regular expressions.
2715 status = U_REGEX_STACK_OVERFLOW;
2716 // We need to return a writable stack frame, so just return the
2717 // previous frame. The match operation will stop quickly
2718 // because of the error status, after which the frame will never
2719 // be looked at again.
2720 return fp;
2721 }
2722 fp = (REStackFrame *)(newFP - fFrameSize); // in case of realloc of stack.
2724 // New stack frame = copy of old top frame.
2725 int64_t *source = (int64_t *)fp;
2726 int64_t *dest = newFP;
2727 for (;;) {
2728 *dest++ = *source++;
2729 if (source == newFP) {
2730 break;
2731 }
2732 }
2734 fTickCounter--;
2735 if (fTickCounter <= 0) {
2736 IncrementTime(status); // Re-initializes fTickCounter
2737 }
2738 fp->fPatIdx = savePatIdx;
2739 return (REStackFrame *)newFP;
2740 }
2743 //--------------------------------------------------------------------------------
2744 //
2745 // MatchAt This is the actual matching engine.
2746 //
2747 // startIdx: begin matching a this index.
2748 // toEnd: if true, match must extend to end of the input region
2749 //
2750 //--------------------------------------------------------------------------------
2751 void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
2752 UBool isMatch = FALSE; // True if the we have a match.
2754 int64_t backSearchIndex = U_INT64_MAX; // used after greedy single-character matches for searching backwards
2756 int32_t op; // Operation from the compiled pattern, split into
2757 int32_t opType; // the opcode
2758 int32_t opValue; // and the operand value.
2760 #ifdef REGEX_RUN_DEBUG
2761 if (fTraceDebug)
2762 {
2763 printf("MatchAt(startIdx=%ld)\n", startIdx);
2764 printf("Original Pattern: ");
2765 UChar32 c = utext_next32From(fPattern->fPattern, 0);
2766 while (c != U_SENTINEL) {
2767 if (c<32 || c>256) {
2768 c = '.';
2769 }
2770 REGEX_DUMP_DEBUG_PRINTF(("%c", c));
2772 c = UTEXT_NEXT32(fPattern->fPattern);
2773 }
2774 printf("\n");
2775 printf("Input String: ");
2776 c = utext_next32From(fInputText, 0);
2777 while (c != U_SENTINEL) {
2778 if (c<32 || c>256) {
2779 c = '.';
2780 }
2781 printf("%c", c);
2783 c = UTEXT_NEXT32(fInputText);
2784 }
2785 printf("\n");
2786 printf("\n");
2787 }
2788 #endif
2790 if (U_FAILURE(status)) {
2791 return;
2792 }
2794 // Cache frequently referenced items from the compiled pattern
2795 //
2796 int64_t *pat = fPattern->fCompiledPat->getBuffer();
2798 const UChar *litText = fPattern->fLiteralText.getBuffer();
2799 UVector *sets = fPattern->fSets;
2801 fFrameSize = fPattern->fFrameSize;
2802 REStackFrame *fp = resetStack();
2804 fp->fPatIdx = 0;
2805 fp->fInputIdx = startIdx;
2807 // Zero out the pattern's static data
2808 int32_t i;
2809 for (i = 0; i<fPattern->fDataSize; i++) {
2810 fData[i] = 0;
2811 }
2813 //
2814 // Main loop for interpreting the compiled pattern.
2815 // One iteration of the loop per pattern operation performed.
2816 //
2817 for (;;) {
2818 #if 0
2819 if (_heapchk() != _HEAPOK) {
2820 fprintf(stderr, "Heap Trouble\n");
2821 }
2822 #endif
2824 op = (int32_t)pat[fp->fPatIdx];
2825 opType = URX_TYPE(op);
2826 opValue = URX_VAL(op);
2827 #ifdef REGEX_RUN_DEBUG
2828 if (fTraceDebug) {
2829 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2830 printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp->fInputIdx,
2831 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
2832 fPattern->dumpOp(fp->fPatIdx);
2833 }
2834 #endif
2835 fp->fPatIdx++;
2837 switch (opType) {
2840 case URX_NOP:
2841 break;
2844 case URX_BACKTRACK:
2845 // Force a backtrack. In some circumstances, the pattern compiler
2846 // will notice that the pattern can't possibly match anything, and will
2847 // emit one of these at that point.
2848 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2849 break;
2852 case URX_ONECHAR:
2853 if (fp->fInputIdx < fActiveLimit) {
2854 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2855 UChar32 c = UTEXT_NEXT32(fInputText);
2856 if (c == opValue) {
2857 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2858 break;
2859 }
2860 } else {
2861 fHitEnd = TRUE;
2862 }
2863 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2864 break;
2867 case URX_STRING:
2868 {
2869 // Test input against a literal string.
2870 // Strings require two slots in the compiled pattern, one for the
2871 // offset to the string text, and one for the length.
2873 int32_t stringStartIdx = opValue;
2874 op = (int32_t)pat[fp->fPatIdx]; // Fetch the second operand
2875 fp->fPatIdx++;
2876 opType = URX_TYPE(op);
2877 int32_t stringLen = URX_VAL(op);
2878 U_ASSERT(opType == URX_STRING_LEN);
2879 U_ASSERT(stringLen >= 2);
2881 const UChar *patternString = litText+stringStartIdx;
2882 int32_t patternStringIndex = 0;
2883 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2884 UChar32 inputChar;
2885 UChar32 patternChar;
2886 UBool success = TRUE;
2887 while (patternStringIndex < stringLen) {
2888 if (UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
2889 success = FALSE;
2890 fHitEnd = TRUE;
2891 break;
2892 }
2893 inputChar = UTEXT_NEXT32(fInputText);
2894 U16_NEXT(patternString, patternStringIndex, stringLen, patternChar);
2895 if (patternChar != inputChar) {
2896 success = FALSE;
2897 break;
2898 }
2899 }
2901 if (success) {
2902 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2903 } else {
2904 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2905 }
2906 }
2907 break;
2910 case URX_STATE_SAVE:
2911 fp = StateSave(fp, opValue, status);
2912 break;
2915 case URX_END:
2916 // The match loop will exit via this path on a successful match,
2917 // when we reach the end of the pattern.
2918 if (toEnd && fp->fInputIdx != fActiveLimit) {
2919 // The pattern matched, but not to the end of input. Try some more.
2920 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2921 break;
2922 }
2923 isMatch = TRUE;
2924 goto breakFromLoop;
2926 // Start and End Capture stack frame variables are laid out out like this:
2927 // fp->fExtra[opValue] - The start of a completed capture group
2928 // opValue+1 - The end of a completed capture group
2929 // opValue+2 - the start of a capture group whose end
2930 // has not yet been reached (and might not ever be).
2931 case URX_START_CAPTURE:
2932 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
2933 fp->fExtra[opValue+2] = fp->fInputIdx;
2934 break;
2937 case URX_END_CAPTURE:
2938 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
2939 U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set.
2940 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real.
2941 fp->fExtra[opValue+1] = fp->fInputIdx; // End position
2942 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
2943 break;
2946 case URX_DOLLAR: // $, test for End of line
2947 // or for position before new line at end of input
2948 {
2949 if (fp->fInputIdx >= fAnchorLimit) {
2950 // We really are at the end of input. Success.
2951 fHitEnd = TRUE;
2952 fRequireEnd = TRUE;
2953 break;
2954 }
2956 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2958 // If we are positioned just before a new-line that is located at the
2959 // end of input, succeed.
2960 UChar32 c = UTEXT_NEXT32(fInputText);
2961 if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
2962 if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) {
2963 // If not in the middle of a CR/LF sequence
2964 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) {
2965 // At new-line at end of input. Success
2966 fHitEnd = TRUE;
2967 fRequireEnd = TRUE;
2969 break;
2970 }
2971 }
2972 } else {
2973 UChar32 nextC = UTEXT_NEXT32(fInputText);
2974 if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
2975 fHitEnd = TRUE;
2976 fRequireEnd = TRUE;
2977 break; // At CR/LF at end of input. Success
2978 }
2979 }
2981 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2982 }
2983 break;
2986 case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode.
2987 if (fp->fInputIdx >= fAnchorLimit) {
2988 // Off the end of input. Success.
2989 fHitEnd = TRUE;
2990 fRequireEnd = TRUE;
2991 break;
2992 } else {
2993 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2994 UChar32 c = UTEXT_NEXT32(fInputText);
2995 // Either at the last character of input, or off the end.
2996 if (c == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) == fAnchorLimit) {
2997 fHitEnd = TRUE;
2998 fRequireEnd = TRUE;
2999 break;
3000 }
3001 }
3003 // Not at end of input. Back-track out.
3004 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3005 break;
3008 case URX_DOLLAR_M: // $, test for End of line in multi-line mode
3009 {
3010 if (fp->fInputIdx >= fAnchorLimit) {
3011 // We really are at the end of input. Success.
3012 fHitEnd = TRUE;
3013 fRequireEnd = TRUE;
3014 break;
3015 }
3016 // If we are positioned just before a new-line, succeed.
3017 // It makes no difference where the new-line is within the input.
3018 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3019 UChar32 c = UTEXT_CURRENT32(fInputText);
3020 if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) {
3021 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence
3022 // In multi-line mode, hitting a new-line just before the end of input does not
3023 // set the hitEnd or requireEnd flags
3024 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && UTEXT_PREVIOUS32(fInputText)==0x0d)) {
3025 break;
3026 }
3027 }
3028 // not at a new line. Fail.
3029 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3030 }
3031 break;
3034 case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode
3035 {
3036 if (fp->fInputIdx >= fAnchorLimit) {
3037 // We really are at the end of input. Success.
3038 fHitEnd = TRUE;
3039 fRequireEnd = TRUE; // Java set requireEnd in this case, even though
3040 break; // adding a new-line would not lose the match.
3041 }
3042 // If we are not positioned just before a new-line, the test fails; backtrack out.
3043 // It makes no difference where the new-line is within the input.
3044 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3045 if (UTEXT_CURRENT32(fInputText) != 0x0a) {
3046 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3047 }
3048 }
3049 break;
3052 case URX_CARET: // ^, test for start of line
3053 if (fp->fInputIdx != fAnchorStart) {
3054 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3055 }
3056 break;
3059 case URX_CARET_M: // ^, test for start of line in mulit-line mode
3060 {
3061 if (fp->fInputIdx == fAnchorStart) {
3062 // We are at the start input. Success.
3063 break;
3064 }
3065 // Check whether character just before the current pos is a new-line
3066 // unless we are at the end of input
3067 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3068 UChar32 c = UTEXT_PREVIOUS32(fInputText);
3069 if ((fp->fInputIdx < fAnchorLimit) &&
3070 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
3071 // It's a new-line. ^ is true. Success.
3072 // TODO: what should be done with positions between a CR and LF?
3073 break;
3074 }
3075 // Not at the start of a line. Fail.
3076 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3077 }
3078 break;
3081 case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode
3082 {
3083 U_ASSERT(fp->fInputIdx >= fAnchorStart);
3084 if (fp->fInputIdx <= fAnchorStart) {
3085 // We are at the start input. Success.
3086 break;
3087 }
3088 // Check whether character just before the current pos is a new-line
3089 U_ASSERT(fp->fInputIdx <= fAnchorLimit);
3090 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3091 UChar32 c = UTEXT_PREVIOUS32(fInputText);
3092 if (c != 0x0a) {
3093 // Not at the start of a line. Back-track out.
3094 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3095 }
3096 }
3097 break;
3099 case URX_BACKSLASH_B: // Test for word boundaries
3100 {
3101 UBool success = isWordBoundary(fp->fInputIdx);
3102 success ^= (UBool)(opValue != 0); // flip sense for \B
3103 if (!success) {
3104 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3105 }
3106 }
3107 break;
3110 case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style
3111 {
3112 UBool success = isUWordBoundary(fp->fInputIdx);
3113 success ^= (UBool)(opValue != 0); // flip sense for \B
3114 if (!success) {
3115 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3116 }
3117 }
3118 break;
3121 case URX_BACKSLASH_D: // Test for decimal digit
3122 {
3123 if (fp->fInputIdx >= fActiveLimit) {
3124 fHitEnd = TRUE;
3125 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3126 break;
3127 }
3129 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3131 UChar32 c = UTEXT_NEXT32(fInputText);
3132 int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster.
3133 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
3134 success ^= (UBool)(opValue != 0); // flip sense for \D
3135 if (success) {
3136 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3137 } else {
3138 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3139 }
3140 }
3141 break;
3144 case URX_BACKSLASH_G: // Test for position at end of previous match
3145 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
3146 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3147 }
3148 break;
3151 case URX_BACKSLASH_X:
3152 // Match a Grapheme, as defined by Unicode TR 29.
3153 // Differs slightly from Perl, which consumes combining marks independently
3154 // of context.
3155 {
3157 // Fail if at end of input
3158 if (fp->fInputIdx >= fActiveLimit) {
3159 fHitEnd = TRUE;
3160 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3161 break;
3162 }
3164 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3166 // Examine (and consume) the current char.
3167 // Dispatch into a little state machine, based on the char.
3168 UChar32 c;
3169 c = UTEXT_NEXT32(fInputText);
3170 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3171 UnicodeSet **sets = fPattern->fStaticSets;
3172 if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend;
3173 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
3174 if (sets[URX_GC_L]->contains(c)) goto GC_L;
3175 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
3176 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
3177 if (sets[URX_GC_V]->contains(c)) goto GC_V;
3178 if (sets[URX_GC_T]->contains(c)) goto GC_T;
3179 goto GC_Extend;
3183 GC_L:
3184 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
3185 c = UTEXT_NEXT32(fInputText);
3186 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3187 if (sets[URX_GC_L]->contains(c)) goto GC_L;
3188 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
3189 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
3190 if (sets[URX_GC_V]->contains(c)) goto GC_V;
3191 (void)UTEXT_PREVIOUS32(fInputText);
3192 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3193 goto GC_Extend;
3195 GC_V:
3196 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
3197 c = UTEXT_NEXT32(fInputText);
3198 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3199 if (sets[URX_GC_V]->contains(c)) goto GC_V;
3200 if (sets[URX_GC_T]->contains(c)) goto GC_T;
3201 (void)UTEXT_PREVIOUS32(fInputText);
3202 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3203 goto GC_Extend;
3205 GC_T:
3206 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
3207 c = UTEXT_NEXT32(fInputText);
3208 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3209 if (sets[URX_GC_T]->contains(c)) goto GC_T;
3210 (void)UTEXT_PREVIOUS32(fInputText);
3211 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3212 goto GC_Extend;
3214 GC_Extend:
3215 // Combining characters are consumed here
3216 for (;;) {
3217 if (fp->fInputIdx >= fActiveLimit) {
3218 break;
3219 }
3220 c = UTEXT_CURRENT32(fInputText);
3221 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
3222 break;
3223 }
3224 (void)UTEXT_NEXT32(fInputText);
3225 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3226 }
3227 goto GC_Done;
3229 GC_Control:
3230 // Most control chars stand alone (don't combine with combining chars),
3231 // except for that CR/LF sequence is a single grapheme cluster.
3232 if (c == 0x0d && fp->fInputIdx < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
3233 c = UTEXT_NEXT32(fInputText);
3234 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3235 }
3237 GC_Done:
3238 if (fp->fInputIdx >= fActiveLimit) {
3239 fHitEnd = TRUE;
3240 }
3241 break;
3242 }
3247 case URX_BACKSLASH_Z: // Test for end of Input
3248 if (fp->fInputIdx < fAnchorLimit) {
3249 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3250 } else {
3251 fHitEnd = TRUE;
3252 fRequireEnd = TRUE;
3253 }
3254 break;
3258 case URX_STATIC_SETREF:
3259 {
3260 // Test input character against one of the predefined sets
3261 // (Word Characters, for example)
3262 // The high bit of the op value is a flag for the match polarity.
3263 // 0: success if input char is in set.
3264 // 1: success if input char is not in set.
3265 if (fp->fInputIdx >= fActiveLimit) {
3266 fHitEnd = TRUE;
3267 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3268 break;
3269 }
3271 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
3272 opValue &= ~URX_NEG_SET;
3273 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3275 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3276 UChar32 c = UTEXT_NEXT32(fInputText);
3277 if (c < 256) {
3278 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3279 if (s8->contains(c)) {
3280 success = !success;
3281 }
3282 } else {
3283 const UnicodeSet *s = fPattern->fStaticSets[opValue];
3284 if (s->contains(c)) {
3285 success = !success;
3286 }
3287 }
3288 if (success) {
3289 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3290 } else {
3291 // the character wasn't in the set.
3292 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3293 }
3294 }
3295 break;
3298 case URX_STAT_SETREF_N:
3299 {
3300 // Test input character for NOT being a member of one of
3301 // the predefined sets (Word Characters, for example)
3302 if (fp->fInputIdx >= fActiveLimit) {
3303 fHitEnd = TRUE;
3304 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3305 break;
3306 }
3308 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3310 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3312 UChar32 c = UTEXT_NEXT32(fInputText);
3313 if (c < 256) {
3314 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3315 if (s8->contains(c) == FALSE) {
3316 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3317 break;
3318 }
3319 } else {
3320 const UnicodeSet *s = fPattern->fStaticSets[opValue];
3321 if (s->contains(c) == FALSE) {
3322 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3323 break;
3324 }
3325 }
3326 // the character wasn't in the set.
3327 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3328 }
3329 break;
3332 case URX_SETREF:
3333 if (fp->fInputIdx >= fActiveLimit) {
3334 fHitEnd = TRUE;
3335 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3336 break;
3337 } else {
3338 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3340 // There is input left. Pick up one char and test it for set membership.
3341 UChar32 c = UTEXT_NEXT32(fInputText);
3342 U_ASSERT(opValue > 0 && opValue < sets->size());
3343 if (c<256) {
3344 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
3345 if (s8->contains(c)) {
3346 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3347 break;
3348 }
3349 } else {
3350 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
3351 if (s->contains(c)) {
3352 // The character is in the set. A Match.
3353 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3354 break;
3355 }
3356 }
3358 // the character wasn't in the set.
3359 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3360 }
3361 break;
3364 case URX_DOTANY:
3365 {
3366 // . matches anything, but stops at end-of-line.
3367 if (fp->fInputIdx >= fActiveLimit) {
3368 // At end of input. Match failed. Backtrack out.
3369 fHitEnd = TRUE;
3370 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3371 break;
3372 }
3374 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3376 // There is input left. Advance over one char, unless we've hit end-of-line
3377 UChar32 c = UTEXT_NEXT32(fInputText);
3378 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
3379 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
3380 // End of line in normal mode. . does not match.
3381 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3382 break;
3383 }
3384 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3385 }
3386 break;
3389 case URX_DOTANY_ALL:
3390 {
3391 // ., in dot-matches-all (including new lines) mode
3392 if (fp->fInputIdx >= fActiveLimit) {
3393 // At end of input. Match failed. Backtrack out.
3394 fHitEnd = TRUE;
3395 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3396 break;
3397 }
3399 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3401 // There is input left. Advance over one char, except if we are
3402 // at a cr/lf, advance over both of them.
3403 UChar32 c;
3404 c = UTEXT_NEXT32(fInputText);
3405 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3406 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
3407 // In the case of a CR/LF, we need to advance over both.
3408 UChar32 nextc = UTEXT_CURRENT32(fInputText);
3409 if (nextc == 0x0a) {
3410 (void)UTEXT_NEXT32(fInputText);
3411 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3412 }
3413 }
3414 }
3415 break;
3418 case URX_DOTANY_UNIX:
3419 {
3420 // '.' operator, matches all, but stops at end-of-line.
3421 // UNIX_LINES mode, so 0x0a is the only recognized line ending.
3422 if (fp->fInputIdx >= fActiveLimit) {
3423 // At end of input. Match failed. Backtrack out.
3424 fHitEnd = TRUE;
3425 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3426 break;
3427 }
3429 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3431 // There is input left. Advance over one char, unless we've hit end-of-line
3432 UChar32 c = UTEXT_NEXT32(fInputText);
3433 if (c == 0x0a) {
3434 // End of line in normal mode. '.' does not match the \n
3435 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3436 } else {
3437 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3438 }
3439 }
3440 break;
3443 case URX_JMP:
3444 fp->fPatIdx = opValue;
3445 break;
3447 case URX_FAIL:
3448 isMatch = FALSE;
3449 goto breakFromLoop;
3451 case URX_JMP_SAV:
3452 U_ASSERT(opValue < fPattern->fCompiledPat->size());
3453 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
3454 fp->fPatIdx = opValue; // Then JMP.
3455 break;
3457 case URX_JMP_SAV_X:
3458 // This opcode is used with (x)+, when x can match a zero length string.
3459 // Same as JMP_SAV, except conditional on the match having made forward progress.
3460 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
3461 // data address of the input position at the start of the loop.
3462 {
3463 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
3464 int32_t stoOp = (int32_t)pat[opValue-1];
3465 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
3466 int32_t frameLoc = URX_VAL(stoOp);
3467 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
3468 int64_t prevInputIdx = fp->fExtra[frameLoc];
3469 U_ASSERT(prevInputIdx <= fp->fInputIdx);
3470 if (prevInputIdx < fp->fInputIdx) {
3471 // The match did make progress. Repeat the loop.
3472 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
3473 fp->fPatIdx = opValue;
3474 fp->fExtra[frameLoc] = fp->fInputIdx;
3475 }
3476 // If the input position did not advance, we do nothing here,
3477 // execution will fall out of the loop.
3478 }
3479 break;
3481 case URX_CTR_INIT:
3482 {
3483 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
3484 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
3486 // Pick up the three extra operands that CTR_INIT has, and
3487 // skip the pattern location counter past
3488 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3489 fp->fPatIdx += 3;
3490 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
3491 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3492 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3493 U_ASSERT(minCount>=0);
3494 U_ASSERT(maxCount>=minCount || maxCount==-1);
3495 U_ASSERT(loopLoc>=fp->fPatIdx);
3497 if (minCount == 0) {
3498 fp = StateSave(fp, loopLoc+1, status);
3499 }
3500 if (maxCount == -1) {
3501 fp->fExtra[opValue+1] = fp->fInputIdx; // For loop breaking.
3502 } else if (maxCount == 0) {
3503 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3504 }
3505 }
3506 break;
3508 case URX_CTR_LOOP:
3509 {
3510 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3511 int32_t initOp = (int32_t)pat[opValue];
3512 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
3513 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3514 int32_t minCount = (int32_t)pat[opValue+2];
3515 int32_t maxCount = (int32_t)pat[opValue+3];
3516 (*pCounter)++;
3517 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
3518 U_ASSERT(*pCounter == maxCount);
3519 break;
3520 }
3521 if (*pCounter >= minCount) {
3522 if (maxCount == -1) {
3523 // Loop has no hard upper bound.
3524 // Check that it is progressing through the input, break if it is not.
3525 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1];
3526 if (fp->fInputIdx == *pLastInputIdx) {
3527 break;
3528 } else {
3529 *pLastInputIdx = fp->fInputIdx;
3530 }
3531 }
3532 fp = StateSave(fp, fp->fPatIdx, status);
3533 }
3534 fp->fPatIdx = opValue + 4; // Loop back.
3535 }
3536 break;
3538 case URX_CTR_INIT_NG:
3539 {
3540 // Initialize a non-greedy loop
3541 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
3542 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
3544 // Pick up the three extra operands that CTR_INIT_NG has, and
3545 // skip the pattern location counter past
3546 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3547 fp->fPatIdx += 3;
3548 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
3549 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3550 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3551 U_ASSERT(minCount>=0);
3552 U_ASSERT(maxCount>=minCount || maxCount==-1);
3553 U_ASSERT(loopLoc>fp->fPatIdx);
3554 if (maxCount == -1) {
3555 fp->fExtra[opValue+1] = fp->fInputIdx; // Save initial input index for loop breaking.
3556 }
3558 if (minCount == 0) {
3559 if (maxCount != 0) {
3560 fp = StateSave(fp, fp->fPatIdx, status);
3561 }
3562 fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block
3563 }
3564 }
3565 break;
3567 case URX_CTR_LOOP_NG:
3568 {
3569 // Non-greedy {min, max} loops
3570 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3571 int32_t initOp = (int32_t)pat[opValue];
3572 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
3573 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3574 int32_t minCount = (int32_t)pat[opValue+2];
3575 int32_t maxCount = (int32_t)pat[opValue+3];
3577 (*pCounter)++;
3578 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
3579 // The loop has matched the maximum permitted number of times.
3580 // Break out of here with no action. Matching will
3581 // continue with the following pattern.
3582 U_ASSERT(*pCounter == maxCount);
3583 break;
3584 }
3586 if (*pCounter < minCount) {
3587 // We haven't met the minimum number of matches yet.
3588 // Loop back for another one.
3589 fp->fPatIdx = opValue + 4; // Loop back.
3590 } else {
3591 // We do have the minimum number of matches.
3593 // If there is no upper bound on the loop iterations, check that the input index
3594 // is progressing, and stop the loop if it is not.
3595 if (maxCount == -1) {
3596 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1];
3597 if (fp->fInputIdx == *pLastInputIdx) {
3598 break;
3599 }
3600 *pLastInputIdx = fp->fInputIdx;
3601 }
3603 // Loop Continuation: we will fall into the pattern following the loop
3604 // (non-greedy, don't execute loop body first), but first do
3605 // a state save to the top of the loop, so that a match failure
3606 // in the following pattern will try another iteration of the loop.
3607 fp = StateSave(fp, opValue + 4, status);
3608 }
3609 }
3610 break;
3612 case URX_STO_SP:
3613 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3614 fData[opValue] = fStack->size();
3615 break;
3617 case URX_LD_SP:
3618 {
3619 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3620 int32_t newStackSize = (int32_t)fData[opValue];
3621 U_ASSERT(newStackSize <= fStack->size());
3622 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3623 if (newFP == (int64_t *)fp) {
3624 break;
3625 }
3626 int32_t i;
3627 for (i=0; i<fFrameSize; i++) {
3628 newFP[i] = ((int64_t *)fp)[i];
3629 }
3630 fp = (REStackFrame *)newFP;
3631 fStack->setSize(newStackSize);
3632 }
3633 break;
3635 case URX_BACKREF:
3636 {
3637 U_ASSERT(opValue < fFrameSize);
3638 int64_t groupStartIdx = fp->fExtra[opValue];
3639 int64_t groupEndIdx = fp->fExtra[opValue+1];
3640 U_ASSERT(groupStartIdx <= groupEndIdx);
3641 if (groupStartIdx < 0) {
3642 // This capture group has not participated in the match thus far,
3643 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
3644 break;
3645 }
3646 UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx);
3647 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3649 // Note: if the capture group match was of an empty string the backref
3650 // match succeeds. Verified by testing: Perl matches succeed
3651 // in this case, so we do too.
3653 UBool success = TRUE;
3654 for (;;) {
3655 if (utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
3656 success = TRUE;
3657 break;
3658 }
3659 if (utext_getNativeIndex(fInputText) >= fActiveLimit) {
3660 success = FALSE;
3661 fHitEnd = TRUE;
3662 break;
3663 }
3664 UChar32 captureGroupChar = utext_next32(fAltInputText);
3665 UChar32 inputChar = utext_next32(fInputText);
3666 if (inputChar != captureGroupChar) {
3667 success = FALSE;
3668 break;
3669 }
3670 }
3672 if (success) {
3673 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3674 } else {
3675 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3676 }
3677 }
3678 break;
3682 case URX_BACKREF_I:
3683 {
3684 U_ASSERT(opValue < fFrameSize);
3685 int64_t groupStartIdx = fp->fExtra[opValue];
3686 int64_t groupEndIdx = fp->fExtra[opValue+1];
3687 U_ASSERT(groupStartIdx <= groupEndIdx);
3688 if (groupStartIdx < 0) {
3689 // This capture group has not participated in the match thus far,
3690 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
3691 break;
3692 }
3693 utext_setNativeIndex(fAltInputText, groupStartIdx);
3694 utext_setNativeIndex(fInputText, fp->fInputIdx);
3695 CaseFoldingUTextIterator captureGroupItr(*fAltInputText);
3696 CaseFoldingUTextIterator inputItr(*fInputText);
3698 // Note: if the capture group match was of an empty string the backref
3699 // match succeeds. Verified by testing: Perl matches succeed
3700 // in this case, so we do too.
3702 UBool success = TRUE;
3703 for (;;) {
3704 if (!captureGroupItr.inExpansion() && utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
3705 success = TRUE;
3706 break;
3707 }
3708 if (!inputItr.inExpansion() && utext_getNativeIndex(fInputText) >= fActiveLimit) {
3709 success = FALSE;
3710 fHitEnd = TRUE;
3711 break;
3712 }
3713 UChar32 captureGroupChar = captureGroupItr.next();
3714 UChar32 inputChar = inputItr.next();
3715 if (inputChar != captureGroupChar) {
3716 success = FALSE;
3717 break;
3718 }
3719 }
3721 if (success && inputItr.inExpansion()) {
3722 // We otained a match by consuming part of a string obtained from
3723 // case-folding a single code point of the input text.
3724 // This does not count as an overall match.
3725 success = FALSE;
3726 }
3728 if (success) {
3729 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3730 } else {
3731 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3732 }
3734 }
3735 break;
3737 case URX_STO_INP_LOC:
3738 {
3739 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
3740 fp->fExtra[opValue] = fp->fInputIdx;
3741 }
3742 break;
3744 case URX_JMPX:
3745 {
3746 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3747 fp->fPatIdx += 1;
3748 int32_t dataLoc = URX_VAL(pat[instrOperandLoc]);
3749 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
3750 int64_t savedInputIdx = fp->fExtra[dataLoc];
3751 U_ASSERT(savedInputIdx <= fp->fInputIdx);
3752 if (savedInputIdx < fp->fInputIdx) {
3753 fp->fPatIdx = opValue; // JMP
3754 } else {
3755 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop.
3756 }
3757 }
3758 break;
3760 case URX_LA_START:
3761 {
3762 // Entering a lookahead block.
3763 // Save Stack Ptr, Input Pos.
3764 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3765 fData[opValue] = fStack->size();
3766 fData[opValue+1] = fp->fInputIdx;
3767 fActiveStart = fLookStart; // Set the match region change for
3768 fActiveLimit = fLookLimit; // transparent bounds.
3769 }
3770 break;
3772 case URX_LA_END:
3773 {
3774 // Leaving a look-ahead block.
3775 // restore Stack Ptr, Input Pos to positions they had on entry to block.
3776 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3777 int32_t stackSize = fStack->size();
3778 int32_t newStackSize =(int32_t)fData[opValue];
3779 U_ASSERT(stackSize >= newStackSize);
3780 if (stackSize > newStackSize) {
3781 // Copy the current top frame back to the new (cut back) top frame.
3782 // This makes the capture groups from within the look-ahead
3783 // expression available.
3784 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3785 int32_t i;
3786 for (i=0; i<fFrameSize; i++) {
3787 newFP[i] = ((int64_t *)fp)[i];
3788 }
3789 fp = (REStackFrame *)newFP;
3790 fStack->setSize(newStackSize);
3791 }
3792 fp->fInputIdx = fData[opValue+1];
3794 // Restore the active region bounds in the input string; they may have
3795 // been changed because of transparent bounds on a Region.
3796 fActiveStart = fRegionStart;
3797 fActiveLimit = fRegionLimit;
3798 }
3799 break;
3801 case URX_ONECHAR_I:
3802 // Case insensitive one char. The char from the pattern is already case folded.
3803 // Input text is not, but case folding the input can not reduce two or more code
3804 // points to one.
3805 if (fp->fInputIdx < fActiveLimit) {
3806 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3808 UChar32 c = UTEXT_NEXT32(fInputText);
3809 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
3810 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3811 break;
3812 }
3813 } else {
3814 fHitEnd = TRUE;
3815 }
3817 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3818 break;
3820 case URX_STRING_I:
3821 {
3822 // Case-insensitive test input against a literal string.
3823 // Strings require two slots in the compiled pattern, one for the
3824 // offset to the string text, and one for the length.
3825 // The compiled string has already been case folded.
3826 {
3827 const UChar *patternString = litText + opValue;
3828 int32_t patternStringIdx = 0;
3830 op = (int32_t)pat[fp->fPatIdx];
3831 fp->fPatIdx++;
3832 opType = URX_TYPE(op);
3833 opValue = URX_VAL(op);
3834 U_ASSERT(opType == URX_STRING_LEN);
3835 int32_t patternStringLen = opValue; // Length of the string from the pattern.
3838 UChar32 cPattern;
3839 UChar32 cText;
3840 UBool success = TRUE;
3842 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3843 CaseFoldingUTextIterator inputIterator(*fInputText);
3844 while (patternStringIdx < patternStringLen) {
3845 if (!inputIterator.inExpansion() && UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
3846 success = FALSE;
3847 fHitEnd = TRUE;
3848 break;
3849 }
3850 U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
3851 cText = inputIterator.next();
3852 if (cText != cPattern) {
3853 success = FALSE;
3854 break;
3855 }
3856 }
3857 if (inputIterator.inExpansion()) {
3858 success = FALSE;
3859 }
3861 if (success) {
3862 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3863 } else {
3864 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3865 }
3866 }
3867 }
3868 break;
3870 case URX_LB_START:
3871 {
3872 // Entering a look-behind block.
3873 // Save Stack Ptr, Input Pos.
3874 // TODO: implement transparent bounds. Ticket #6067
3875 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3876 fData[opValue] = fStack->size();
3877 fData[opValue+1] = fp->fInputIdx;
3878 // Init the variable containing the start index for attempted matches.
3879 fData[opValue+2] = -1;
3880 // Save input string length, then reset to pin any matches to end at
3881 // the current position.
3882 fData[opValue+3] = fActiveLimit;
3883 fActiveLimit = fp->fInputIdx;
3884 }
3885 break;
3888 case URX_LB_CONT:
3889 {
3890 // Positive Look-Behind, at top of loop checking for matches of LB expression
3891 // at all possible input starting positions.
3893 // Fetch the min and max possible match lengths. They are the operands
3894 // of this op in the pattern.
3895 int32_t minML = (int32_t)pat[fp->fPatIdx++];
3896 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
3897 U_ASSERT(minML <= maxML);
3898 U_ASSERT(minML >= 0);
3900 // Fetch (from data) the last input index where a match was attempted.
3901 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3902 int64_t *lbStartIdx = &fData[opValue+2];
3903 if (*lbStartIdx < 0) {
3904 // First time through loop.
3905 *lbStartIdx = fp->fInputIdx - minML;
3906 } else {
3907 // 2nd through nth time through the loop.
3908 // Back up start position for match by one.
3909 if (*lbStartIdx == 0) {
3910 (*lbStartIdx)--;
3911 } else {
3912 UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx);
3913 (void)UTEXT_PREVIOUS32(fInputText);
3914 *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
3915 }
3916 }
3918 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
3919 // We have tried all potential match starting points without
3920 // getting a match. Backtrack out, and out of the
3921 // Look Behind altogether.
3922 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3923 int64_t restoreInputLen = fData[opValue+3];
3924 U_ASSERT(restoreInputLen >= fActiveLimit);
3925 U_ASSERT(restoreInputLen <= fInputLength);
3926 fActiveLimit = restoreInputLen;
3927 break;
3928 }
3930 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
3931 // (successful match will fall off the end of the loop.)
3932 fp = StateSave(fp, fp->fPatIdx-3, status);
3933 fp->fInputIdx = *lbStartIdx;
3934 }
3935 break;
3937 case URX_LB_END:
3938 // End of a look-behind block, after a successful match.
3939 {
3940 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3941 if (fp->fInputIdx != fActiveLimit) {
3942 // The look-behind expression matched, but the match did not
3943 // extend all the way to the point that we are looking behind from.
3944 // FAIL out of here, which will take us back to the LB_CONT, which
3945 // will retry the match starting at another position or fail
3946 // the look-behind altogether, whichever is appropriate.
3947 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3948 break;
3949 }
3951 // Look-behind match is good. Restore the orignal input string length,
3952 // which had been truncated to pin the end of the lookbehind match to the
3953 // position being looked-behind.
3954 int64_t originalInputLen = fData[opValue+3];
3955 U_ASSERT(originalInputLen >= fActiveLimit);
3956 U_ASSERT(originalInputLen <= fInputLength);
3957 fActiveLimit = originalInputLen;
3958 }
3959 break;
3962 case URX_LBN_CONT:
3963 {
3964 // Negative Look-Behind, at top of loop checking for matches of LB expression
3965 // at all possible input starting positions.
3967 // Fetch the extra parameters of this op.
3968 int32_t minML = (int32_t)pat[fp->fPatIdx++];
3969 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
3970 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
3971 continueLoc = URX_VAL(continueLoc);
3972 U_ASSERT(minML <= maxML);
3973 U_ASSERT(minML >= 0);
3974 U_ASSERT(continueLoc > fp->fPatIdx);
3976 // Fetch (from data) the last input index where a match was attempted.
3977 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3978 int64_t *lbStartIdx = &fData[opValue+2];
3979 if (*lbStartIdx < 0) {
3980 // First time through loop.
3981 *lbStartIdx = fp->fInputIdx - minML;
3982 } else {
3983 // 2nd through nth time through the loop.
3984 // Back up start position for match by one.
3985 if (*lbStartIdx == 0) {
3986 (*lbStartIdx)--;
3987 } else {
3988 UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx);
3989 (void)UTEXT_PREVIOUS32(fInputText);
3990 *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
3991 }
3992 }
3994 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
3995 // We have tried all potential match starting points without
3996 // getting a match, which means that the negative lookbehind as
3997 // a whole has succeeded. Jump forward to the continue location
3998 int64_t restoreInputLen = fData[opValue+3];
3999 U_ASSERT(restoreInputLen >= fActiveLimit);
4000 U_ASSERT(restoreInputLen <= fInputLength);
4001 fActiveLimit = restoreInputLen;
4002 fp->fPatIdx = continueLoc;
4003 break;
4004 }
4006 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
4007 // (successful match will cause a FAIL out of the loop altogether.)
4008 fp = StateSave(fp, fp->fPatIdx-4, status);
4009 fp->fInputIdx = *lbStartIdx;
4010 }
4011 break;
4013 case URX_LBN_END:
4014 // End of a negative look-behind block, after a successful match.
4015 {
4016 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4017 if (fp->fInputIdx != fActiveLimit) {
4018 // The look-behind expression matched, but the match did not
4019 // extend all the way to the point that we are looking behind from.
4020 // FAIL out of here, which will take us back to the LB_CONT, which
4021 // will retry the match starting at another position or succeed
4022 // the look-behind altogether, whichever is appropriate.
4023 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4024 break;
4025 }
4027 // Look-behind expression matched, which means look-behind test as
4028 // a whole Fails
4030 // Restore the orignal input string length, which had been truncated
4031 // inorder to pin the end of the lookbehind match
4032 // to the position being looked-behind.
4033 int64_t originalInputLen = fData[opValue+3];
4034 U_ASSERT(originalInputLen >= fActiveLimit);
4035 U_ASSERT(originalInputLen <= fInputLength);
4036 fActiveLimit = originalInputLen;
4038 // Restore original stack position, discarding any state saved
4039 // by the successful pattern match.
4040 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4041 int32_t newStackSize = (int32_t)fData[opValue];
4042 U_ASSERT(fStack->size() > newStackSize);
4043 fStack->setSize(newStackSize);
4045 // FAIL, which will take control back to someplace
4046 // prior to entering the look-behind test.
4047 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4048 }
4049 break;
4052 case URX_LOOP_SR_I:
4053 // Loop Initialization for the optimized implementation of
4054 // [some character set]*
4055 // This op scans through all matching input.
4056 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
4057 {
4058 U_ASSERT(opValue > 0 && opValue < sets->size());
4059 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
4060 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
4062 // Loop through input, until either the input is exhausted or
4063 // we reach a character that is not a member of the set.
4064 int64_t ix = fp->fInputIdx;
4065 UTEXT_SETNATIVEINDEX(fInputText, ix);
4066 for (;;) {
4067 if (ix >= fActiveLimit) {
4068 fHitEnd = TRUE;
4069 break;
4070 }
4071 UChar32 c = UTEXT_NEXT32(fInputText);
4072 if (c<256) {
4073 if (s8->contains(c) == FALSE) {
4074 break;
4075 }
4076 } else {
4077 if (s->contains(c) == FALSE) {
4078 break;
4079 }
4080 }
4081 ix = UTEXT_GETNATIVEINDEX(fInputText);
4082 }
4084 // If there were no matching characters, skip over the loop altogether.
4085 // The loop doesn't run at all, a * op always succeeds.
4086 if (ix == fp->fInputIdx) {
4087 fp->fPatIdx++; // skip the URX_LOOP_C op.
4088 break;
4089 }
4091 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4092 // must follow. It's operand is the stack location
4093 // that holds the starting input index for the match of this [set]*
4094 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4095 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4096 int32_t stackLoc = URX_VAL(loopcOp);
4097 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4098 fp->fExtra[stackLoc] = fp->fInputIdx;
4099 fp->fInputIdx = ix;
4101 // Save State to the URX_LOOP_C op that follows this one,
4102 // so that match failures in the following code will return to there.
4103 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4104 fp = StateSave(fp, fp->fPatIdx, status);
4105 fp->fPatIdx++;
4106 }
4107 break;
4110 case URX_LOOP_DOT_I:
4111 // Loop Initialization for the optimized implementation of .*
4112 // This op scans through all remaining input.
4113 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
4114 {
4115 // Loop through input until the input is exhausted (we reach an end-of-line)
4116 // In DOTALL mode, we can just go straight to the end of the input.
4117 int64_t ix;
4118 if ((opValue & 1) == 1) {
4119 // Dot-matches-All mode. Jump straight to the end of the string.
4120 ix = fActiveLimit;
4121 fHitEnd = TRUE;
4122 } else {
4123 // NOT DOT ALL mode. Line endings do not match '.'
4124 // Scan forward until a line ending or end of input.
4125 ix = fp->fInputIdx;
4126 UTEXT_SETNATIVEINDEX(fInputText, ix);
4127 for (;;) {
4128 if (ix >= fActiveLimit) {
4129 fHitEnd = TRUE;
4130 break;
4131 }
4132 UChar32 c = UTEXT_NEXT32(fInputText);
4133 if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s
4134 if ((c == 0x0a) || // 0x0a is newline in both modes.
4135 (((opValue & 2) == 0) && // IF not UNIX_LINES mode
4136 (c<=0x0d && c>=0x0a)) || c==0x85 ||c==0x2028 || c==0x2029) {
4137 // char is a line ending. Exit the scanning loop.
4138 break;
4139 }
4140 }
4141 ix = UTEXT_GETNATIVEINDEX(fInputText);
4142 }
4143 }
4145 // If there were no matching characters, skip over the loop altogether.
4146 // The loop doesn't run at all, a * op always succeeds.
4147 if (ix == fp->fInputIdx) {
4148 fp->fPatIdx++; // skip the URX_LOOP_C op.
4149 break;
4150 }
4152 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4153 // must follow. It's operand is the stack location
4154 // that holds the starting input index for the match of this .*
4155 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4156 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4157 int32_t stackLoc = URX_VAL(loopcOp);
4158 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4159 fp->fExtra[stackLoc] = fp->fInputIdx;
4160 fp->fInputIdx = ix;
4162 // Save State to the URX_LOOP_C op that follows this one,
4163 // so that match failures in the following code will return to there.
4164 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4165 fp = StateSave(fp, fp->fPatIdx, status);
4166 fp->fPatIdx++;
4167 }
4168 break;
4171 case URX_LOOP_C:
4172 {
4173 U_ASSERT(opValue>=0 && opValue<fFrameSize);
4174 backSearchIndex = fp->fExtra[opValue];
4175 U_ASSERT(backSearchIndex <= fp->fInputIdx);
4176 if (backSearchIndex == fp->fInputIdx) {
4177 // We've backed up the input idx to the point that the loop started.
4178 // The loop is done. Leave here without saving state.
4179 // Subsequent failures won't come back here.
4180 break;
4181 }
4182 // Set up for the next iteration of the loop, with input index
4183 // backed up by one from the last time through,
4184 // and a state save to this instruction in case the following code fails again.
4185 // (We're going backwards because this loop emulates stack unwinding, not
4186 // the initial scan forward.)
4187 U_ASSERT(fp->fInputIdx > 0);
4188 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4189 UChar32 prevC = UTEXT_PREVIOUS32(fInputText);
4190 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4192 UChar32 twoPrevC = UTEXT_PREVIOUS32(fInputText);
4193 if (prevC == 0x0a &&
4194 fp->fInputIdx > backSearchIndex &&
4195 twoPrevC == 0x0d) {
4196 int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
4197 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
4198 // .*, stepping back over CRLF pair.
4199 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4200 }
4201 }
4204 fp = StateSave(fp, fp->fPatIdx-1, status);
4205 }
4206 break;
4210 default:
4211 // Trouble. The compiled pattern contains an entry with an
4212 // unrecognized type tag.
4213 U_ASSERT(FALSE);
4214 }
4216 if (U_FAILURE(status)) {
4217 isMatch = FALSE;
4218 break;
4219 }
4220 }
4222 breakFromLoop:
4223 fMatch = isMatch;
4224 if (isMatch) {
4225 fLastMatchEnd = fMatchEnd;
4226 fMatchStart = startIdx;
4227 fMatchEnd = fp->fInputIdx;
4228 if (fTraceDebug) {
4229 REGEX_RUN_DEBUG_PRINTF(("Match. start=%ld end=%ld\n\n", fMatchStart, fMatchEnd));
4230 }
4231 }
4232 else
4233 {
4234 if (fTraceDebug) {
4235 REGEX_RUN_DEBUG_PRINTF(("No match\n\n"));
4236 }
4237 }
4239 fFrame = fp; // The active stack frame when the engine stopped.
4240 // Contains the capture group results that we need to
4241 // access later.
4242 return;
4243 }
4246 //--------------------------------------------------------------------------------
4247 //
4248 // MatchChunkAt This is the actual matching engine. Like MatchAt, but with the
4249 // assumption that the entire string is available in the UText's
4250 // chunk buffer. For now, that means we can use int32_t indexes,
4251 // except for anything that needs to be saved (like group starts
4252 // and ends).
4253 //
4254 // startIdx: begin matching a this index.
4255 // toEnd: if true, match must extend to end of the input region
4256 //
4257 //--------------------------------------------------------------------------------
4258 void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
4259 UBool isMatch = FALSE; // True if the we have a match.
4261 int32_t backSearchIndex = INT32_MAX; // used after greedy single-character matches for searching backwards
4263 int32_t op; // Operation from the compiled pattern, split into
4264 int32_t opType; // the opcode
4265 int32_t opValue; // and the operand value.
4267 #ifdef REGEX_RUN_DEBUG
4268 if (fTraceDebug)
4269 {
4270 printf("MatchAt(startIdx=%d)\n", startIdx);
4271 printf("Original Pattern: ");
4272 UChar32 c = utext_next32From(fPattern->fPattern, 0);
4273 while (c != U_SENTINEL) {
4274 if (c<32 || c>256) {
4275 c = '.';
4276 }
4277 REGEX_DUMP_DEBUG_PRINTF(("%c", c));
4279 c = UTEXT_NEXT32(fPattern->fPattern);
4280 }
4281 printf("\n");
4282 printf("Input String: ");
4283 c = utext_next32From(fInputText, 0);
4284 while (c != U_SENTINEL) {
4285 if (c<32 || c>256) {
4286 c = '.';
4287 }
4288 printf("%c", c);
4290 c = UTEXT_NEXT32(fInputText);
4291 }
4292 printf("\n");
4293 printf("\n");
4294 }
4295 #endif
4297 if (U_FAILURE(status)) {
4298 return;
4299 }
4301 // Cache frequently referenced items from the compiled pattern
4302 //
4303 int64_t *pat = fPattern->fCompiledPat->getBuffer();
4305 const UChar *litText = fPattern->fLiteralText.getBuffer();
4306 UVector *sets = fPattern->fSets;
4308 const UChar *inputBuf = fInputText->chunkContents;
4310 fFrameSize = fPattern->fFrameSize;
4311 REStackFrame *fp = resetStack();
4313 fp->fPatIdx = 0;
4314 fp->fInputIdx = startIdx;
4316 // Zero out the pattern's static data
4317 int32_t i;
4318 for (i = 0; i<fPattern->fDataSize; i++) {
4319 fData[i] = 0;
4320 }
4322 //
4323 // Main loop for interpreting the compiled pattern.
4324 // One iteration of the loop per pattern operation performed.
4325 //
4326 for (;;) {
4327 #if 0
4328 if (_heapchk() != _HEAPOK) {
4329 fprintf(stderr, "Heap Trouble\n");
4330 }
4331 #endif
4333 op = (int32_t)pat[fp->fPatIdx];
4334 opType = URX_TYPE(op);
4335 opValue = URX_VAL(op);
4336 #ifdef REGEX_RUN_DEBUG
4337 if (fTraceDebug) {
4338 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4339 printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp->fInputIdx,
4340 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
4341 fPattern->dumpOp(fp->fPatIdx);
4342 }
4343 #endif
4344 fp->fPatIdx++;
4346 switch (opType) {
4349 case URX_NOP:
4350 break;
4353 case URX_BACKTRACK:
4354 // Force a backtrack. In some circumstances, the pattern compiler
4355 // will notice that the pattern can't possibly match anything, and will
4356 // emit one of these at that point.
4357 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4358 break;
4361 case URX_ONECHAR:
4362 if (fp->fInputIdx < fActiveLimit) {
4363 UChar32 c;
4364 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4365 if (c == opValue) {
4366 break;
4367 }
4368 } else {
4369 fHitEnd = TRUE;
4370 }
4371 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4372 break;
4375 case URX_STRING:
4376 {
4377 // Test input against a literal string.
4378 // Strings require two slots in the compiled pattern, one for the
4379 // offset to the string text, and one for the length.
4380 int32_t stringStartIdx = opValue;
4381 int32_t stringLen;
4383 op = (int32_t)pat[fp->fPatIdx]; // Fetch the second operand
4384 fp->fPatIdx++;
4385 opType = URX_TYPE(op);
4386 stringLen = URX_VAL(op);
4387 U_ASSERT(opType == URX_STRING_LEN);
4388 U_ASSERT(stringLen >= 2);
4390 const UChar * pInp = inputBuf + fp->fInputIdx;
4391 const UChar * pInpLimit = inputBuf + fActiveLimit;
4392 const UChar * pPat = litText+stringStartIdx;
4393 const UChar * pEnd = pInp + stringLen;
4394 UBool success = TRUE;
4395 while (pInp < pEnd) {
4396 if (pInp >= pInpLimit) {
4397 fHitEnd = TRUE;
4398 success = FALSE;
4399 break;
4400 }
4401 if (*pInp++ != *pPat++) {
4402 success = FALSE;
4403 break;
4404 }
4405 }
4407 if (success) {
4408 fp->fInputIdx += stringLen;
4409 } else {
4410 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4411 }
4412 }
4413 break;
4416 case URX_STATE_SAVE:
4417 fp = StateSave(fp, opValue, status);
4418 break;
4421 case URX_END:
4422 // The match loop will exit via this path on a successful match,
4423 // when we reach the end of the pattern.
4424 if (toEnd && fp->fInputIdx != fActiveLimit) {
4425 // The pattern matched, but not to the end of input. Try some more.
4426 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4427 break;
4428 }
4429 isMatch = TRUE;
4430 goto breakFromLoop;
4432 // Start and End Capture stack frame variables are laid out out like this:
4433 // fp->fExtra[opValue] - The start of a completed capture group
4434 // opValue+1 - The end of a completed capture group
4435 // opValue+2 - the start of a capture group whose end
4436 // has not yet been reached (and might not ever be).
4437 case URX_START_CAPTURE:
4438 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
4439 fp->fExtra[opValue+2] = fp->fInputIdx;
4440 break;
4443 case URX_END_CAPTURE:
4444 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
4445 U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set.
4446 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real.
4447 fp->fExtra[opValue+1] = fp->fInputIdx; // End position
4448 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
4449 break;
4452 case URX_DOLLAR: // $, test for End of line
4453 // or for position before new line at end of input
4454 if (fp->fInputIdx < fAnchorLimit-2) {
4455 // We are no where near the end of input. Fail.
4456 // This is the common case. Keep it first.
4457 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4458 break;
4459 }
4460 if (fp->fInputIdx >= fAnchorLimit) {
4461 // We really are at the end of input. Success.
4462 fHitEnd = TRUE;
4463 fRequireEnd = TRUE;
4464 break;
4465 }
4467 // If we are positioned just before a new-line that is located at the
4468 // end of input, succeed.
4469 if (fp->fInputIdx == fAnchorLimit-1) {
4470 UChar32 c;
4471 U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c);
4473 if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) {
4474 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
4475 // At new-line at end of input. Success
4476 fHitEnd = TRUE;
4477 fRequireEnd = TRUE;
4478 break;
4479 }
4480 }
4481 } else if (fp->fInputIdx == fAnchorLimit-2 &&
4482 inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a) {
4483 fHitEnd = TRUE;
4484 fRequireEnd = TRUE;
4485 break; // At CR/LF at end of input. Success
4486 }
4488 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4490 break;
4493 case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode.
4494 if (fp->fInputIdx >= fAnchorLimit-1) {
4495 // Either at the last character of input, or off the end.
4496 if (fp->fInputIdx == fAnchorLimit-1) {
4497 // At last char of input. Success if it's a new line.
4498 if (inputBuf[fp->fInputIdx] == 0x0a) {
4499 fHitEnd = TRUE;
4500 fRequireEnd = TRUE;
4501 break;
4502 }
4503 } else {
4504 // Off the end of input. Success.
4505 fHitEnd = TRUE;
4506 fRequireEnd = TRUE;
4507 break;
4508 }
4509 }
4511 // Not at end of input. Back-track out.
4512 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4513 break;
4516 case URX_DOLLAR_M: // $, test for End of line in multi-line mode
4517 {
4518 if (fp->fInputIdx >= fAnchorLimit) {
4519 // We really are at the end of input. Success.
4520 fHitEnd = TRUE;
4521 fRequireEnd = TRUE;
4522 break;
4523 }
4524 // If we are positioned just before a new-line, succeed.
4525 // It makes no difference where the new-line is within the input.
4526 UChar32 c = inputBuf[fp->fInputIdx];
4527 if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) {
4528 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence
4529 // In multi-line mode, hitting a new-line just before the end of input does not
4530 // set the hitEnd or requireEnd flags
4531 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
4532 break;
4533 }
4534 }
4535 // not at a new line. Fail.
4536 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4537 }
4538 break;
4541 case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode
4542 {
4543 if (fp->fInputIdx >= fAnchorLimit) {
4544 // We really are at the end of input. Success.
4545 fHitEnd = TRUE;
4546 fRequireEnd = TRUE; // Java set requireEnd in this case, even though
4547 break; // adding a new-line would not lose the match.
4548 }
4549 // If we are not positioned just before a new-line, the test fails; backtrack out.
4550 // It makes no difference where the new-line is within the input.
4551 if (inputBuf[fp->fInputIdx] != 0x0a) {
4552 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4553 }
4554 }
4555 break;
4558 case URX_CARET: // ^, test for start of line
4559 if (fp->fInputIdx != fAnchorStart) {
4560 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4561 }
4562 break;
4565 case URX_CARET_M: // ^, test for start of line in mulit-line mode
4566 {
4567 if (fp->fInputIdx == fAnchorStart) {
4568 // We are at the start input. Success.
4569 break;
4570 }
4571 // Check whether character just before the current pos is a new-line
4572 // unless we are at the end of input
4573 UChar c = inputBuf[fp->fInputIdx - 1];
4574 if ((fp->fInputIdx < fAnchorLimit) &&
4575 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
4576 // It's a new-line. ^ is true. Success.
4577 // TODO: what should be done with positions between a CR and LF?
4578 break;
4579 }
4580 // Not at the start of a line. Fail.
4581 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4582 }
4583 break;
4586 case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode
4587 {
4588 U_ASSERT(fp->fInputIdx >= fAnchorStart);
4589 if (fp->fInputIdx <= fAnchorStart) {
4590 // We are at the start input. Success.
4591 break;
4592 }
4593 // Check whether character just before the current pos is a new-line
4594 U_ASSERT(fp->fInputIdx <= fAnchorLimit);
4595 UChar c = inputBuf[fp->fInputIdx - 1];
4596 if (c != 0x0a) {
4597 // Not at the start of a line. Back-track out.
4598 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4599 }
4600 }
4601 break;
4603 case URX_BACKSLASH_B: // Test for word boundaries
4604 {
4605 UBool success = isChunkWordBoundary((int32_t)fp->fInputIdx);
4606 success ^= (UBool)(opValue != 0); // flip sense for \B
4607 if (!success) {
4608 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4609 }
4610 }
4611 break;
4614 case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style
4615 {
4616 UBool success = isUWordBoundary(fp->fInputIdx);
4617 success ^= (UBool)(opValue != 0); // flip sense for \B
4618 if (!success) {
4619 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4620 }
4621 }
4622 break;
4625 case URX_BACKSLASH_D: // Test for decimal digit
4626 {
4627 if (fp->fInputIdx >= fActiveLimit) {
4628 fHitEnd = TRUE;
4629 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4630 break;
4631 }
4633 UChar32 c;
4634 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4635 int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster.
4636 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
4637 success ^= (UBool)(opValue != 0); // flip sense for \D
4638 if (!success) {
4639 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4640 }
4641 }
4642 break;
4645 case URX_BACKSLASH_G: // Test for position at end of previous match
4646 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
4647 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4648 }
4649 break;
4652 case URX_BACKSLASH_X:
4653 // Match a Grapheme, as defined by Unicode TR 29.
4654 // Differs slightly from Perl, which consumes combining marks independently
4655 // of context.
4656 {
4658 // Fail if at end of input
4659 if (fp->fInputIdx >= fActiveLimit) {
4660 fHitEnd = TRUE;
4661 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4662 break;
4663 }
4665 // Examine (and consume) the current char.
4666 // Dispatch into a little state machine, based on the char.
4667 UChar32 c;
4668 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4669 UnicodeSet **sets = fPattern->fStaticSets;
4670 if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend;
4671 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
4672 if (sets[URX_GC_L]->contains(c)) goto GC_L;
4673 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
4674 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
4675 if (sets[URX_GC_V]->contains(c)) goto GC_V;
4676 if (sets[URX_GC_T]->contains(c)) goto GC_T;
4677 goto GC_Extend;
4681 GC_L:
4682 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
4683 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4684 if (sets[URX_GC_L]->contains(c)) goto GC_L;
4685 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
4686 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
4687 if (sets[URX_GC_V]->contains(c)) goto GC_V;
4688 U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4689 goto GC_Extend;
4691 GC_V:
4692 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
4693 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4694 if (sets[URX_GC_V]->contains(c)) goto GC_V;
4695 if (sets[URX_GC_T]->contains(c)) goto GC_T;
4696 U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4697 goto GC_Extend;
4699 GC_T:
4700 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
4701 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4702 if (sets[URX_GC_T]->contains(c)) goto GC_T;
4703 U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4704 goto GC_Extend;
4706 GC_Extend:
4707 // Combining characters are consumed here
4708 for (;;) {
4709 if (fp->fInputIdx >= fActiveLimit) {
4710 break;
4711 }
4712 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4713 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
4714 U16_BACK_1(inputBuf, 0, fp->fInputIdx);
4715 break;
4716 }
4717 }
4718 goto GC_Done;
4720 GC_Control:
4721 // Most control chars stand alone (don't combine with combining chars),
4722 // except for that CR/LF sequence is a single grapheme cluster.
4723 if (c == 0x0d && fp->fInputIdx < fActiveLimit && inputBuf[fp->fInputIdx] == 0x0a) {
4724 fp->fInputIdx++;
4725 }
4727 GC_Done:
4728 if (fp->fInputIdx >= fActiveLimit) {
4729 fHitEnd = TRUE;
4730 }
4731 break;
4732 }
4737 case URX_BACKSLASH_Z: // Test for end of Input
4738 if (fp->fInputIdx < fAnchorLimit) {
4739 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4740 } else {
4741 fHitEnd = TRUE;
4742 fRequireEnd = TRUE;
4743 }
4744 break;
4748 case URX_STATIC_SETREF:
4749 {
4750 // Test input character against one of the predefined sets
4751 // (Word Characters, for example)
4752 // The high bit of the op value is a flag for the match polarity.
4753 // 0: success if input char is in set.
4754 // 1: success if input char is not in set.
4755 if (fp->fInputIdx >= fActiveLimit) {
4756 fHitEnd = TRUE;
4757 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4758 break;
4759 }
4761 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
4762 opValue &= ~URX_NEG_SET;
4763 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
4765 UChar32 c;
4766 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4767 if (c < 256) {
4768 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
4769 if (s8->contains(c)) {
4770 success = !success;
4771 }
4772 } else {
4773 const UnicodeSet *s = fPattern->fStaticSets[opValue];
4774 if (s->contains(c)) {
4775 success = !success;
4776 }
4777 }
4778 if (!success) {
4779 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4780 }
4781 }
4782 break;
4785 case URX_STAT_SETREF_N:
4786 {
4787 // Test input character for NOT being a member of one of
4788 // the predefined sets (Word Characters, for example)
4789 if (fp->fInputIdx >= fActiveLimit) {
4790 fHitEnd = TRUE;
4791 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4792 break;
4793 }
4795 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
4797 UChar32 c;
4798 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4799 if (c < 256) {
4800 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
4801 if (s8->contains(c) == FALSE) {
4802 break;
4803 }
4804 } else {
4805 const UnicodeSet *s = fPattern->fStaticSets[opValue];
4806 if (s->contains(c) == FALSE) {
4807 break;
4808 }
4809 }
4810 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4811 }
4812 break;
4815 case URX_SETREF:
4816 {
4817 if (fp->fInputIdx >= fActiveLimit) {
4818 fHitEnd = TRUE;
4819 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4820 break;
4821 }
4823 U_ASSERT(opValue > 0 && opValue < sets->size());
4825 // There is input left. Pick up one char and test it for set membership.
4826 UChar32 c;
4827 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4828 if (c<256) {
4829 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
4830 if (s8->contains(c)) {
4831 // The character is in the set. A Match.
4832 break;
4833 }
4834 } else {
4835 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
4836 if (s->contains(c)) {
4837 // The character is in the set. A Match.
4838 break;
4839 }
4840 }
4842 // the character wasn't in the set.
4843 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4844 }
4845 break;
4848 case URX_DOTANY:
4849 {
4850 // . matches anything, but stops at end-of-line.
4851 if (fp->fInputIdx >= fActiveLimit) {
4852 // At end of input. Match failed. Backtrack out.
4853 fHitEnd = TRUE;
4854 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4855 break;
4856 }
4858 // There is input left. Advance over one char, unless we've hit end-of-line
4859 UChar32 c;
4860 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4861 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
4862 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
4863 // End of line in normal mode. . does not match.
4864 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4865 break;
4866 }
4867 }
4868 break;
4871 case URX_DOTANY_ALL:
4872 {
4873 // . in dot-matches-all (including new lines) mode
4874 if (fp->fInputIdx >= fActiveLimit) {
4875 // At end of input. Match failed. Backtrack out.
4876 fHitEnd = TRUE;
4877 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4878 break;
4879 }
4881 // There is input left. Advance over one char, except if we are
4882 // at a cr/lf, advance over both of them.
4883 UChar32 c;
4884 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4885 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
4886 // In the case of a CR/LF, we need to advance over both.
4887 if (inputBuf[fp->fInputIdx] == 0x0a) {
4888 U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit);
4889 }
4890 }
4891 }
4892 break;
4895 case URX_DOTANY_UNIX:
4896 {
4897 // '.' operator, matches all, but stops at end-of-line.
4898 // UNIX_LINES mode, so 0x0a is the only recognized line ending.
4899 if (fp->fInputIdx >= fActiveLimit) {
4900 // At end of input. Match failed. Backtrack out.
4901 fHitEnd = TRUE;
4902 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4903 break;
4904 }
4906 // There is input left. Advance over one char, unless we've hit end-of-line
4907 UChar32 c;
4908 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4909 if (c == 0x0a) {
4910 // End of line in normal mode. '.' does not match the \n
4911 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4912 }
4913 }
4914 break;
4917 case URX_JMP:
4918 fp->fPatIdx = opValue;
4919 break;
4921 case URX_FAIL:
4922 isMatch = FALSE;
4923 goto breakFromLoop;
4925 case URX_JMP_SAV:
4926 U_ASSERT(opValue < fPattern->fCompiledPat->size());
4927 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
4928 fp->fPatIdx = opValue; // Then JMP.
4929 break;
4931 case URX_JMP_SAV_X:
4932 // This opcode is used with (x)+, when x can match a zero length string.
4933 // Same as JMP_SAV, except conditional on the match having made forward progress.
4934 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
4935 // data address of the input position at the start of the loop.
4936 {
4937 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
4938 int32_t stoOp = (int32_t)pat[opValue-1];
4939 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
4940 int32_t frameLoc = URX_VAL(stoOp);
4941 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
4942 int32_t prevInputIdx = (int32_t)fp->fExtra[frameLoc];
4943 U_ASSERT(prevInputIdx <= fp->fInputIdx);
4944 if (prevInputIdx < fp->fInputIdx) {
4945 // The match did make progress. Repeat the loop.
4946 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
4947 fp->fPatIdx = opValue;
4948 fp->fExtra[frameLoc] = fp->fInputIdx;
4949 }
4950 // If the input position did not advance, we do nothing here,
4951 // execution will fall out of the loop.
4952 }
4953 break;
4955 case URX_CTR_INIT:
4956 {
4957 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
4958 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
4960 // Pick up the three extra operands that CTR_INIT has, and
4961 // skip the pattern location counter past
4962 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
4963 fp->fPatIdx += 3;
4964 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
4965 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
4966 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
4967 U_ASSERT(minCount>=0);
4968 U_ASSERT(maxCount>=minCount || maxCount==-1);
4969 U_ASSERT(loopLoc>=fp->fPatIdx);
4971 if (minCount == 0) {
4972 fp = StateSave(fp, loopLoc+1, status);
4973 }
4974 if (maxCount == -1) {
4975 fp->fExtra[opValue+1] = fp->fInputIdx; // For loop breaking.
4976 } else if (maxCount == 0) {
4977 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4978 }
4979 }
4980 break;
4982 case URX_CTR_LOOP:
4983 {
4984 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
4985 int32_t initOp = (int32_t)pat[opValue];
4986 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
4987 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
4988 int32_t minCount = (int32_t)pat[opValue+2];
4989 int32_t maxCount = (int32_t)pat[opValue+3];
4990 (*pCounter)++;
4991 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
4992 U_ASSERT(*pCounter == maxCount);
4993 break;
4994 }
4995 if (*pCounter >= minCount) {
4996 if (maxCount == -1) {
4997 // Loop has no hard upper bound.
4998 // Check that it is progressing through the input, break if it is not.
4999 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1];
5000 if (fp->fInputIdx == *pLastInputIdx) {
5001 break;
5002 } else {
5003 *pLastInputIdx = fp->fInputIdx;
5004 }
5005 }
5006 fp = StateSave(fp, fp->fPatIdx, status);
5007 }
5008 fp->fPatIdx = opValue + 4; // Loop back.
5009 }
5010 break;
5012 case URX_CTR_INIT_NG:
5013 {
5014 // Initialize a non-greedy loop
5015 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
5016 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
5018 // Pick up the three extra operands that CTR_INIT_NG has, and
5019 // skip the pattern location counter past
5020 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5021 fp->fPatIdx += 3;
5022 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
5023 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
5024 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
5025 U_ASSERT(minCount>=0);
5026 U_ASSERT(maxCount>=minCount || maxCount==-1);
5027 U_ASSERT(loopLoc>fp->fPatIdx);
5028 if (maxCount == -1) {
5029 fp->fExtra[opValue+1] = fp->fInputIdx; // Save initial input index for loop breaking.
5030 }
5032 if (minCount == 0) {
5033 if (maxCount != 0) {
5034 fp = StateSave(fp, fp->fPatIdx, status);
5035 }
5036 fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block
5037 }
5038 }
5039 break;
5041 case URX_CTR_LOOP_NG:
5042 {
5043 // Non-greedy {min, max} loops
5044 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
5045 int32_t initOp = (int32_t)pat[opValue];
5046 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
5047 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
5048 int32_t minCount = (int32_t)pat[opValue+2];
5049 int32_t maxCount = (int32_t)pat[opValue+3];
5051 (*pCounter)++;
5052 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
5053 // The loop has matched the maximum permitted number of times.
5054 // Break out of here with no action. Matching will
5055 // continue with the following pattern.
5056 U_ASSERT(*pCounter == maxCount);
5057 break;
5058 }
5060 if (*pCounter < minCount) {
5061 // We haven't met the minimum number of matches yet.
5062 // Loop back for another one.
5063 fp->fPatIdx = opValue + 4; // Loop back.
5064 } else {
5065 // We do have the minimum number of matches.
5067 // If there is no upper bound on the loop iterations, check that the input index
5068 // is progressing, and stop the loop if it is not.
5069 if (maxCount == -1) {
5070 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1];
5071 if (fp->fInputIdx == *pLastInputIdx) {
5072 break;
5073 }
5074 *pLastInputIdx = fp->fInputIdx;
5075 }
5077 // Loop Continuation: we will fall into the pattern following the loop
5078 // (non-greedy, don't execute loop body first), but first do
5079 // a state save to the top of the loop, so that a match failure
5080 // in the following pattern will try another iteration of the loop.
5081 fp = StateSave(fp, opValue + 4, status);
5082 }
5083 }
5084 break;
5086 case URX_STO_SP:
5087 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
5088 fData[opValue] = fStack->size();
5089 break;
5091 case URX_LD_SP:
5092 {
5093 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
5094 int32_t newStackSize = (int32_t)fData[opValue];
5095 U_ASSERT(newStackSize <= fStack->size());
5096 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
5097 if (newFP == (int64_t *)fp) {
5098 break;
5099 }
5100 int32_t i;
5101 for (i=0; i<fFrameSize; i++) {
5102 newFP[i] = ((int64_t *)fp)[i];
5103 }
5104 fp = (REStackFrame *)newFP;
5105 fStack->setSize(newStackSize);
5106 }
5107 break;
5109 case URX_BACKREF:
5110 {
5111 U_ASSERT(opValue < fFrameSize);
5112 int64_t groupStartIdx = fp->fExtra[opValue];
5113 int64_t groupEndIdx = fp->fExtra[opValue+1];
5114 U_ASSERT(groupStartIdx <= groupEndIdx);
5115 int64_t inputIndex = fp->fInputIdx;
5116 if (groupStartIdx < 0) {
5117 // This capture group has not participated in the match thus far,
5118 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
5119 break;
5120 }
5121 UBool success = TRUE;
5122 for (int64_t groupIndex = groupStartIdx; groupIndex < groupEndIdx; ++groupIndex,++inputIndex) {
5123 if (inputIndex >= fActiveLimit) {
5124 success = FALSE;
5125 fHitEnd = TRUE;
5126 break;
5127 }
5128 if (inputBuf[groupIndex] != inputBuf[inputIndex]) {
5129 success = FALSE;
5130 break;
5131 }
5132 }
5133 if (success) {
5134 fp->fInputIdx = inputIndex;
5135 } else {
5136 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5137 }
5138 }
5139 break;
5141 case URX_BACKREF_I:
5142 {
5143 U_ASSERT(opValue < fFrameSize);
5144 int64_t groupStartIdx = fp->fExtra[opValue];
5145 int64_t groupEndIdx = fp->fExtra[opValue+1];
5146 U_ASSERT(groupStartIdx <= groupEndIdx);
5147 if (groupStartIdx < 0) {
5148 // This capture group has not participated in the match thus far,
5149 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
5150 break;
5151 }
5152 CaseFoldingUCharIterator captureGroupItr(inputBuf, groupStartIdx, groupEndIdx);
5153 CaseFoldingUCharIterator inputItr(inputBuf, fp->fInputIdx, fActiveLimit);
5155 // Note: if the capture group match was of an empty string the backref
5156 // match succeeds. Verified by testing: Perl matches succeed
5157 // in this case, so we do too.
5159 UBool success = TRUE;
5160 for (;;) {
5161 UChar32 captureGroupChar = captureGroupItr.next();
5162 if (captureGroupChar == U_SENTINEL) {
5163 success = TRUE;
5164 break;
5165 }
5166 UChar32 inputChar = inputItr.next();
5167 if (inputChar == U_SENTINEL) {
5168 success = FALSE;
5169 fHitEnd = TRUE;
5170 break;
5171 }
5172 if (inputChar != captureGroupChar) {
5173 success = FALSE;
5174 break;
5175 }
5176 }
5178 if (success && inputItr.inExpansion()) {
5179 // We otained a match by consuming part of a string obtained from
5180 // case-folding a single code point of the input text.
5181 // This does not count as an overall match.
5182 success = FALSE;
5183 }
5185 if (success) {
5186 fp->fInputIdx = inputItr.getIndex();
5187 } else {
5188 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5189 }
5190 }
5191 break;
5193 case URX_STO_INP_LOC:
5194 {
5195 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
5196 fp->fExtra[opValue] = fp->fInputIdx;
5197 }
5198 break;
5200 case URX_JMPX:
5201 {
5202 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5203 fp->fPatIdx += 1;
5204 int32_t dataLoc = URX_VAL(pat[instrOperandLoc]);
5205 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
5206 int32_t savedInputIdx = (int32_t)fp->fExtra[dataLoc];
5207 U_ASSERT(savedInputIdx <= fp->fInputIdx);
5208 if (savedInputIdx < fp->fInputIdx) {
5209 fp->fPatIdx = opValue; // JMP
5210 } else {
5211 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop.
5212 }
5213 }
5214 break;
5216 case URX_LA_START:
5217 {
5218 // Entering a lookahead block.
5219 // Save Stack Ptr, Input Pos.
5220 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5221 fData[opValue] = fStack->size();
5222 fData[opValue+1] = fp->fInputIdx;
5223 fActiveStart = fLookStart; // Set the match region change for
5224 fActiveLimit = fLookLimit; // transparent bounds.
5225 }
5226 break;
5228 case URX_LA_END:
5229 {
5230 // Leaving a look-ahead block.
5231 // restore Stack Ptr, Input Pos to positions they had on entry to block.
5232 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5233 int32_t stackSize = fStack->size();
5234 int32_t newStackSize = (int32_t)fData[opValue];
5235 U_ASSERT(stackSize >= newStackSize);
5236 if (stackSize > newStackSize) {
5237 // Copy the current top frame back to the new (cut back) top frame.
5238 // This makes the capture groups from within the look-ahead
5239 // expression available.
5240 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
5241 int32_t i;
5242 for (i=0; i<fFrameSize; i++) {
5243 newFP[i] = ((int64_t *)fp)[i];
5244 }
5245 fp = (REStackFrame *)newFP;
5246 fStack->setSize(newStackSize);
5247 }
5248 fp->fInputIdx = fData[opValue+1];
5250 // Restore the active region bounds in the input string; they may have
5251 // been changed because of transparent bounds on a Region.
5252 fActiveStart = fRegionStart;
5253 fActiveLimit = fRegionLimit;
5254 }
5255 break;
5257 case URX_ONECHAR_I:
5258 if (fp->fInputIdx < fActiveLimit) {
5259 UChar32 c;
5260 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5261 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
5262 break;
5263 }
5264 } else {
5265 fHitEnd = TRUE;
5266 }
5267 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5268 break;
5270 case URX_STRING_I:
5271 // Case-insensitive test input against a literal string.
5272 // Strings require two slots in the compiled pattern, one for the
5273 // offset to the string text, and one for the length.
5274 // The compiled string has already been case folded.
5275 {
5276 const UChar *patternString = litText + opValue;
5278 op = (int32_t)pat[fp->fPatIdx];
5279 fp->fPatIdx++;
5280 opType = URX_TYPE(op);
5281 opValue = URX_VAL(op);
5282 U_ASSERT(opType == URX_STRING_LEN);
5283 int32_t patternStringLen = opValue; // Length of the string from the pattern.
5285 UChar32 cText;
5286 UChar32 cPattern;
5287 UBool success = TRUE;
5288 int32_t patternStringIdx = 0;
5289 CaseFoldingUCharIterator inputIterator(inputBuf, fp->fInputIdx, fActiveLimit);
5290 while (patternStringIdx < patternStringLen) {
5291 U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
5292 cText = inputIterator.next();
5293 if (cText != cPattern) {
5294 success = FALSE;
5295 if (cText == U_SENTINEL) {
5296 fHitEnd = TRUE;
5297 }
5298 break;
5299 }
5300 }
5301 if (inputIterator.inExpansion()) {
5302 success = FALSE;
5303 }
5305 if (success) {
5306 fp->fInputIdx = inputIterator.getIndex();
5307 } else {
5308 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5309 }
5310 }
5311 break;
5313 case URX_LB_START:
5314 {
5315 // Entering a look-behind block.
5316 // Save Stack Ptr, Input Pos.
5317 // TODO: implement transparent bounds. Ticket #6067
5318 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5319 fData[opValue] = fStack->size();
5320 fData[opValue+1] = fp->fInputIdx;
5321 // Init the variable containing the start index for attempted matches.
5322 fData[opValue+2] = -1;
5323 // Save input string length, then reset to pin any matches to end at
5324 // the current position.
5325 fData[opValue+3] = fActiveLimit;
5326 fActiveLimit = fp->fInputIdx;
5327 }
5328 break;
5331 case URX_LB_CONT:
5332 {
5333 // Positive Look-Behind, at top of loop checking for matches of LB expression
5334 // at all possible input starting positions.
5336 // Fetch the min and max possible match lengths. They are the operands
5337 // of this op in the pattern.
5338 int32_t minML = (int32_t)pat[fp->fPatIdx++];
5339 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
5340 U_ASSERT(minML <= maxML);
5341 U_ASSERT(minML >= 0);
5343 // Fetch (from data) the last input index where a match was attempted.
5344 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5345 int64_t *lbStartIdx = &fData[opValue+2];
5346 if (*lbStartIdx < 0) {
5347 // First time through loop.
5348 *lbStartIdx = fp->fInputIdx - minML;
5349 } else {
5350 // 2nd through nth time through the loop.
5351 // Back up start position for match by one.
5352 if (*lbStartIdx == 0) {
5353 (*lbStartIdx)--;
5354 } else {
5355 U16_BACK_1(inputBuf, 0, *lbStartIdx);
5356 }
5357 }
5359 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
5360 // We have tried all potential match starting points without
5361 // getting a match. Backtrack out, and out of the
5362 // Look Behind altogether.
5363 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5364 int64_t restoreInputLen = fData[opValue+3];
5365 U_ASSERT(restoreInputLen >= fActiveLimit);
5366 U_ASSERT(restoreInputLen <= fInputLength);
5367 fActiveLimit = restoreInputLen;
5368 break;
5369 }
5371 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5372 // (successful match will fall off the end of the loop.)
5373 fp = StateSave(fp, fp->fPatIdx-3, status);
5374 fp->fInputIdx = *lbStartIdx;
5375 }
5376 break;
5378 case URX_LB_END:
5379 // End of a look-behind block, after a successful match.
5380 {
5381 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5382 if (fp->fInputIdx != fActiveLimit) {
5383 // The look-behind expression matched, but the match did not
5384 // extend all the way to the point that we are looking behind from.
5385 // FAIL out of here, which will take us back to the LB_CONT, which
5386 // will retry the match starting at another position or fail
5387 // the look-behind altogether, whichever is appropriate.
5388 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5389 break;
5390 }
5392 // Look-behind match is good. Restore the orignal input string length,
5393 // which had been truncated to pin the end of the lookbehind match to the
5394 // position being looked-behind.
5395 int64_t originalInputLen = fData[opValue+3];
5396 U_ASSERT(originalInputLen >= fActiveLimit);
5397 U_ASSERT(originalInputLen <= fInputLength);
5398 fActiveLimit = originalInputLen;
5399 }
5400 break;
5403 case URX_LBN_CONT:
5404 {
5405 // Negative Look-Behind, at top of loop checking for matches of LB expression
5406 // at all possible input starting positions.
5408 // Fetch the extra parameters of this op.
5409 int32_t minML = (int32_t)pat[fp->fPatIdx++];
5410 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
5411 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
5412 continueLoc = URX_VAL(continueLoc);
5413 U_ASSERT(minML <= maxML);
5414 U_ASSERT(minML >= 0);
5415 U_ASSERT(continueLoc > fp->fPatIdx);
5417 // Fetch (from data) the last input index where a match was attempted.
5418 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5419 int64_t *lbStartIdx = &fData[opValue+2];
5420 if (*lbStartIdx < 0) {
5421 // First time through loop.
5422 *lbStartIdx = fp->fInputIdx - minML;
5423 } else {
5424 // 2nd through nth time through the loop.
5425 // Back up start position for match by one.
5426 if (*lbStartIdx == 0) {
5427 (*lbStartIdx)--; // Because U16_BACK is unsafe starting at 0.
5428 } else {
5429 U16_BACK_1(inputBuf, 0, *lbStartIdx);
5430 }
5431 }
5433 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
5434 // We have tried all potential match starting points without
5435 // getting a match, which means that the negative lookbehind as
5436 // a whole has succeeded. Jump forward to the continue location
5437 int64_t restoreInputLen = fData[opValue+3];
5438 U_ASSERT(restoreInputLen >= fActiveLimit);
5439 U_ASSERT(restoreInputLen <= fInputLength);
5440 fActiveLimit = restoreInputLen;
5441 fp->fPatIdx = continueLoc;
5442 break;
5443 }
5445 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5446 // (successful match will cause a FAIL out of the loop altogether.)
5447 fp = StateSave(fp, fp->fPatIdx-4, status);
5448 fp->fInputIdx = *lbStartIdx;
5449 }
5450 break;
5452 case URX_LBN_END:
5453 // End of a negative look-behind block, after a successful match.
5454 {
5455 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5456 if (fp->fInputIdx != fActiveLimit) {
5457 // The look-behind expression matched, but the match did not
5458 // extend all the way to the point that we are looking behind from.
5459 // FAIL out of here, which will take us back to the LB_CONT, which
5460 // will retry the match starting at another position or succeed
5461 // the look-behind altogether, whichever is appropriate.
5462 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5463 break;
5464 }
5466 // Look-behind expression matched, which means look-behind test as
5467 // a whole Fails
5469 // Restore the orignal input string length, which had been truncated
5470 // inorder to pin the end of the lookbehind match
5471 // to the position being looked-behind.
5472 int64_t originalInputLen = fData[opValue+3];
5473 U_ASSERT(originalInputLen >= fActiveLimit);
5474 U_ASSERT(originalInputLen <= fInputLength);
5475 fActiveLimit = originalInputLen;
5477 // Restore original stack position, discarding any state saved
5478 // by the successful pattern match.
5479 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5480 int32_t newStackSize = (int32_t)fData[opValue];
5481 U_ASSERT(fStack->size() > newStackSize);
5482 fStack->setSize(newStackSize);
5484 // FAIL, which will take control back to someplace
5485 // prior to entering the look-behind test.
5486 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5487 }
5488 break;
5491 case URX_LOOP_SR_I:
5492 // Loop Initialization for the optimized implementation of
5493 // [some character set]*
5494 // This op scans through all matching input.
5495 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
5496 {
5497 U_ASSERT(opValue > 0 && opValue < sets->size());
5498 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
5499 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
5501 // Loop through input, until either the input is exhausted or
5502 // we reach a character that is not a member of the set.
5503 int32_t ix = (int32_t)fp->fInputIdx;
5504 for (;;) {
5505 if (ix >= fActiveLimit) {
5506 fHitEnd = TRUE;
5507 break;
5508 }
5509 UChar32 c;
5510 U16_NEXT(inputBuf, ix, fActiveLimit, c);
5511 if (c<256) {
5512 if (s8->contains(c) == FALSE) {
5513 U16_BACK_1(inputBuf, 0, ix);
5514 break;
5515 }
5516 } else {
5517 if (s->contains(c) == FALSE) {
5518 U16_BACK_1(inputBuf, 0, ix);
5519 break;
5520 }
5521 }
5522 }
5524 // If there were no matching characters, skip over the loop altogether.
5525 // The loop doesn't run at all, a * op always succeeds.
5526 if (ix == fp->fInputIdx) {
5527 fp->fPatIdx++; // skip the URX_LOOP_C op.
5528 break;
5529 }
5531 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5532 // must follow. It's operand is the stack location
5533 // that holds the starting input index for the match of this [set]*
5534 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
5535 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
5536 int32_t stackLoc = URX_VAL(loopcOp);
5537 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
5538 fp->fExtra[stackLoc] = fp->fInputIdx;
5539 fp->fInputIdx = ix;
5541 // Save State to the URX_LOOP_C op that follows this one,
5542 // so that match failures in the following code will return to there.
5543 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5544 fp = StateSave(fp, fp->fPatIdx, status);
5545 fp->fPatIdx++;
5546 }
5547 break;
5550 case URX_LOOP_DOT_I:
5551 // Loop Initialization for the optimized implementation of .*
5552 // This op scans through all remaining input.
5553 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
5554 {
5555 // Loop through input until the input is exhausted (we reach an end-of-line)
5556 // In DOTALL mode, we can just go straight to the end of the input.
5557 int32_t ix;
5558 if ((opValue & 1) == 1) {
5559 // Dot-matches-All mode. Jump straight to the end of the string.
5560 ix = (int32_t)fActiveLimit;
5561 fHitEnd = TRUE;
5562 } else {
5563 // NOT DOT ALL mode. Line endings do not match '.'
5564 // Scan forward until a line ending or end of input.
5565 ix = (int32_t)fp->fInputIdx;
5566 for (;;) {
5567 if (ix >= fActiveLimit) {
5568 fHitEnd = TRUE;
5569 break;
5570 }
5571 UChar32 c;
5572 U16_NEXT(inputBuf, ix, fActiveLimit, c); // c = inputBuf[ix++]
5573 if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s
5574 if ((c == 0x0a) || // 0x0a is newline in both modes.
5575 (((opValue & 2) == 0) && // IF not UNIX_LINES mode
5576 ((c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029))) {
5577 // char is a line ending. Put the input pos back to the
5578 // line ending char, and exit the scanning loop.
5579 U16_BACK_1(inputBuf, 0, ix);
5580 break;
5581 }
5582 }
5583 }
5584 }
5586 // If there were no matching characters, skip over the loop altogether.
5587 // The loop doesn't run at all, a * op always succeeds.
5588 if (ix == fp->fInputIdx) {
5589 fp->fPatIdx++; // skip the URX_LOOP_C op.
5590 break;
5591 }
5593 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5594 // must follow. It's operand is the stack location
5595 // that holds the starting input index for the match of this .*
5596 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
5597 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
5598 int32_t stackLoc = URX_VAL(loopcOp);
5599 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
5600 fp->fExtra[stackLoc] = fp->fInputIdx;
5601 fp->fInputIdx = ix;
5603 // Save State to the URX_LOOP_C op that follows this one,
5604 // so that match failures in the following code will return to there.
5605 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5606 fp = StateSave(fp, fp->fPatIdx, status);
5607 fp->fPatIdx++;
5608 }
5609 break;
5612 case URX_LOOP_C:
5613 {
5614 U_ASSERT(opValue>=0 && opValue<fFrameSize);
5615 backSearchIndex = (int32_t)fp->fExtra[opValue];
5616 U_ASSERT(backSearchIndex <= fp->fInputIdx);
5617 if (backSearchIndex == fp->fInputIdx) {
5618 // We've backed up the input idx to the point that the loop started.
5619 // The loop is done. Leave here without saving state.
5620 // Subsequent failures won't come back here.
5621 break;
5622 }
5623 // Set up for the next iteration of the loop, with input index
5624 // backed up by one from the last time through,
5625 // and a state save to this instruction in case the following code fails again.
5626 // (We're going backwards because this loop emulates stack unwinding, not
5627 // the initial scan forward.)
5628 U_ASSERT(fp->fInputIdx > 0);
5629 UChar32 prevC;
5630 U16_PREV(inputBuf, 0, fp->fInputIdx, prevC); // !!!: should this 0 be one of f*Limit?
5632 if (prevC == 0x0a &&
5633 fp->fInputIdx > backSearchIndex &&
5634 inputBuf[fp->fInputIdx-1] == 0x0d) {
5635 int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
5636 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
5637 // .*, stepping back over CRLF pair.
5638 U16_BACK_1(inputBuf, 0, fp->fInputIdx);
5639 }
5640 }
5643 fp = StateSave(fp, fp->fPatIdx-1, status);
5644 }
5645 break;
5649 default:
5650 // Trouble. The compiled pattern contains an entry with an
5651 // unrecognized type tag.
5652 U_ASSERT(FALSE);
5653 }
5655 if (U_FAILURE(status)) {
5656 isMatch = FALSE;
5657 break;
5658 }
5659 }
5661 breakFromLoop:
5662 fMatch = isMatch;
5663 if (isMatch) {
5664 fLastMatchEnd = fMatchEnd;
5665 fMatchStart = startIdx;
5666 fMatchEnd = fp->fInputIdx;
5667 if (fTraceDebug) {
5668 REGEX_RUN_DEBUG_PRINTF(("Match. start=%ld end=%ld\n\n", fMatchStart, fMatchEnd));
5669 }
5670 }
5671 else
5672 {
5673 if (fTraceDebug) {
5674 REGEX_RUN_DEBUG_PRINTF(("No match\n\n"));
5675 }
5676 }
5678 fFrame = fp; // The active stack frame when the engine stopped.
5679 // Contains the capture group results that we need to
5680 // access later.
5682 return;
5683 }
5686 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher)
5688 U_NAMESPACE_END
5690 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS