js/src/yarr/YarrParser.h

Sat, 03 Jan 2015 20:18:00 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Sat, 03 Jan 2015 20:18:00 +0100
branch
TOR_BUG_3246
changeset 7
129ffea94266
permissions
-rw-r--r--

Conditionally enable double key logic according to:
private browsing mode or privacy.thirdparty.isolate preference and
implement in GetCookieStringCommon and FindCookie where it counts...
With some reservations of how to convince FindCookie users to test
condition and pass a nullptr when disabling double key logic.

michael@0 1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
michael@0 2 * vim: set ts=8 sts=4 et sw=4 tw=99:
michael@0 3 *
michael@0 4 * Copyright (C) 2009 Apple Inc. All rights reserved.
michael@0 5 *
michael@0 6 * Redistribution and use in source and binary forms, with or without
michael@0 7 * modification, are permitted provided that the following conditions
michael@0 8 * are met:
michael@0 9 * 1. Redistributions of source code must retain the above copyright
michael@0 10 * notice, this list of conditions and the following disclaimer.
michael@0 11 * 2. Redistributions in binary form must reproduce the above copyright
michael@0 12 * notice, this list of conditions and the following disclaimer in the
michael@0 13 * documentation and/or other materials provided with the distribution.
michael@0 14 *
michael@0 15 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
michael@0 16 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
michael@0 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
michael@0 18 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
michael@0 19 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
michael@0 20 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
michael@0 21 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
michael@0 22 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
michael@0 23 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
michael@0 24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
michael@0 25 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
michael@0 26 */
michael@0 27
michael@0 28 #ifndef yarr_YarrParser_h
michael@0 29 #define yarr_YarrParser_h
michael@0 30
michael@0 31 #include "yarr/Yarr.h"
michael@0 32
michael@0 33 namespace JSC { namespace Yarr {
michael@0 34
michael@0 35 enum BuiltInCharacterClassID {
michael@0 36 DigitClassID,
michael@0 37 SpaceClassID,
michael@0 38 WordClassID,
michael@0 39 NewlineClassID
michael@0 40 };
michael@0 41
michael@0 42 // The Parser class should not be used directly - only via the Yarr::parse() method.
michael@0 43 template<class Delegate, typename CharType>
michael@0 44 class Parser {
michael@0 45 private:
michael@0 46 template<class FriendDelegate>
michael@0 47 friend ErrorCode parse(FriendDelegate&, const String& pattern, unsigned backReferenceLimit);
michael@0 48
michael@0 49 /*
michael@0 50 * CharacterClassParserDelegate:
michael@0 51 *
michael@0 52 * The class CharacterClassParserDelegate is used in the parsing of character
michael@0 53 * classes. This class handles detection of character ranges. This class
michael@0 54 * implements enough of the delegate interface such that it can be passed to
michael@0 55 * parseEscape() as an EscapeDelegate. This allows parseEscape() to be reused
michael@0 56 * to perform the parsing of escape characters in character sets.
michael@0 57 */
michael@0 58 class CharacterClassParserDelegate {
michael@0 59 public:
michael@0 60 CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err)
michael@0 61 : m_delegate(delegate)
michael@0 62 , m_err(err)
michael@0 63 , m_state(Empty)
michael@0 64 , m_character(0)
michael@0 65 {
michael@0 66 }
michael@0 67
michael@0 68 /*
michael@0 69 * begin():
michael@0 70 *
michael@0 71 * Called at beginning of construction.
michael@0 72 */
michael@0 73 void begin(bool invert)
michael@0 74 {
michael@0 75 m_delegate.atomCharacterClassBegin(invert);
michael@0 76 }
michael@0 77
michael@0 78 /*
michael@0 79 * atomPatternCharacter():
michael@0 80 *
michael@0 81 * This method is called either from parseCharacterClass() (for an unescaped
michael@0 82 * character in a character class), or from parseEscape(). In the former case
michael@0 83 * the value true will be passed for the argument 'hyphenIsRange', and in this
michael@0 84 * mode we will allow a hypen to be treated as indicating a range (i.e. /[a-z]/
michael@0 85 * is different to /[a\-z]/).
michael@0 86 */
michael@0 87 void atomPatternCharacter(UChar ch, bool hyphenIsRange = false)
michael@0 88 {
michael@0 89 switch (m_state) {
michael@0 90 case AfterCharacterClass:
michael@0 91 // Following a builtin character class we need look out for a hyphen.
michael@0 92 // We're looking for invalid ranges, such as /[\d-x]/ or /[\d-\d]/.
michael@0 93 // If we see a hyphen following a charater class then unlike usual
michael@0 94 // we'll report it to the delegate immediately, and put ourself into
michael@0 95 // a poisoned state. Any following calls to add another character or
michael@0 96 // character class will result in an error. (A hypen following a
michael@0 97 // character-class is itself valid, but only at the end of a regex).
michael@0 98 if (hyphenIsRange && ch == '-') {
michael@0 99 m_delegate.atomCharacterClassAtom('-');
michael@0 100 m_state = AfterCharacterClassHyphen;
michael@0 101 return;
michael@0 102 }
michael@0 103 // Otherwise just fall through - cached character so treat this as Empty.
michael@0 104
michael@0 105 case Empty:
michael@0 106 m_character = ch;
michael@0 107 m_state = CachedCharacter;
michael@0 108 return;
michael@0 109
michael@0 110 case CachedCharacter:
michael@0 111 if (hyphenIsRange && ch == '-')
michael@0 112 m_state = CachedCharacterHyphen;
michael@0 113 else {
michael@0 114 m_delegate.atomCharacterClassAtom(m_character);
michael@0 115 m_character = ch;
michael@0 116 }
michael@0 117 return;
michael@0 118
michael@0 119 case CachedCharacterHyphen:
michael@0 120 if (ch < m_character) {
michael@0 121 m_err = CharacterClassOutOfOrder;
michael@0 122 return;
michael@0 123 }
michael@0 124 m_delegate.atomCharacterClassRange(m_character, ch);
michael@0 125 m_state = Empty;
michael@0 126 return;
michael@0 127
michael@0 128 case AfterCharacterClassHyphen:
michael@0 129 m_delegate.atomCharacterClassAtom(ch);
michael@0 130 m_state = Empty;
michael@0 131 return;
michael@0 132 }
michael@0 133 }
michael@0 134
michael@0 135 /*
michael@0 136 * atomBuiltInCharacterClass():
michael@0 137 *
michael@0 138 * Adds a built-in character class, called by parseEscape().
michael@0 139 */
michael@0 140 void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
michael@0 141 {
michael@0 142 switch (m_state) {
michael@0 143 case CachedCharacter:
michael@0 144 // Flush the currently cached character, then fall through.
michael@0 145 m_delegate.atomCharacterClassAtom(m_character);
michael@0 146
michael@0 147 case Empty:
michael@0 148 case AfterCharacterClass:
michael@0 149 m_state = AfterCharacterClass;
michael@0 150 m_delegate.atomCharacterClassBuiltIn(classID, invert);
michael@0 151 return;
michael@0 152
michael@0 153 case CachedCharacterHyphen:
michael@0 154 // Error! We have a range that looks like [x-\d]. We require
michael@0 155 // the end of the range to be a single character.
michael@0 156 m_err = CharacterClassInvalidRange;
michael@0 157 return;
michael@0 158 case AfterCharacterClassHyphen:
michael@0 159 m_delegate.atomCharacterClassBuiltIn(classID, invert);
michael@0 160 m_state = Empty;
michael@0 161 return;
michael@0 162 }
michael@0 163 }
michael@0 164
michael@0 165 /*
michael@0 166 * end():
michael@0 167 *
michael@0 168 * Called at end of construction.
michael@0 169 */
michael@0 170 void end()
michael@0 171 {
michael@0 172 if (m_state == CachedCharacter)
michael@0 173 m_delegate.atomCharacterClassAtom(m_character);
michael@0 174 else if (m_state == CachedCharacterHyphen) {
michael@0 175 m_delegate.atomCharacterClassAtom(m_character);
michael@0 176 m_delegate.atomCharacterClassAtom('-');
michael@0 177 }
michael@0 178 m_delegate.atomCharacterClassEnd();
michael@0 179 }
michael@0 180
michael@0 181 // parseEscape() should never call these delegate methods when
michael@0 182 // invoked with inCharacterClass set.
michael@0 183 NO_RETURN_DUE_TO_ASSERT void assertionWordBoundary(bool) { ASSERT_NOT_REACHED(); }
michael@0 184 NO_RETURN_DUE_TO_ASSERT void atomBackReference(unsigned) { ASSERT_NOT_REACHED(); }
michael@0 185
michael@0 186 private:
michael@0 187 Delegate& m_delegate;
michael@0 188 ErrorCode& m_err;
michael@0 189 enum CharacterClassConstructionState {
michael@0 190 Empty,
michael@0 191 CachedCharacter,
michael@0 192 CachedCharacterHyphen,
michael@0 193 AfterCharacterClass,
michael@0 194 AfterCharacterClassHyphen
michael@0 195 } m_state;
michael@0 196 UChar m_character;
michael@0 197 };
michael@0 198
michael@0 199 Parser(Delegate& delegate, const String& pattern, unsigned backReferenceLimit)
michael@0 200 : m_delegate(delegate)
michael@0 201 , m_backReferenceLimit(backReferenceLimit)
michael@0 202 , m_err(NoError)
michael@0 203 , m_data(pattern.chars())
michael@0 204 , m_size(pattern.length())
michael@0 205 , m_index(0)
michael@0 206 , m_parenthesesNestingDepth(0)
michael@0 207 {
michael@0 208 }
michael@0 209
michael@0 210 /*
michael@0 211 * parseEscape():
michael@0 212 *
michael@0 213 * Helper for parseTokens() AND parseCharacterClass().
michael@0 214 * Unlike the other parser methods, this function does not report tokens
michael@0 215 * directly to the member delegate (m_delegate), instead tokens are
michael@0 216 * emitted to the delegate provided as an argument. In the case of atom
michael@0 217 * escapes, parseTokens() will call parseEscape() passing m_delegate as
michael@0 218 * an argument, and as such the escape will be reported to the delegate.
michael@0 219 *
michael@0 220 * However this method may also be used by parseCharacterClass(), in which
michael@0 221 * case a CharacterClassParserDelegate will be passed as the delegate that
michael@0 222 * tokens should be added to. A boolean flag is also provided to indicate
michael@0 223 * whether that an escape in a CharacterClass is being parsed (some parsing
michael@0 224 * rules change in this context).
michael@0 225 *
michael@0 226 * The boolean value returned by this method indicates whether the token
michael@0 227 * parsed was an atom (outside of a characted class \b and \B will be
michael@0 228 * interpreted as assertions).
michael@0 229 */
michael@0 230 template<bool inCharacterClass, class EscapeDelegate>
michael@0 231 bool parseEscape(EscapeDelegate& delegate)
michael@0 232 {
michael@0 233 ASSERT(!m_err);
michael@0 234 ASSERT(peek() == '\\');
michael@0 235 consume();
michael@0 236
michael@0 237 if (atEndOfPattern()) {
michael@0 238 m_err = EscapeUnterminated;
michael@0 239 return false;
michael@0 240 }
michael@0 241
michael@0 242 switch (peek()) {
michael@0 243 // Assertions
michael@0 244 case 'b':
michael@0 245 consume();
michael@0 246 if (inCharacterClass)
michael@0 247 delegate.atomPatternCharacter('\b');
michael@0 248 else {
michael@0 249 delegate.assertionWordBoundary(false);
michael@0 250 return false;
michael@0 251 }
michael@0 252 break;
michael@0 253 case 'B':
michael@0 254 consume();
michael@0 255 if (inCharacterClass)
michael@0 256 delegate.atomPatternCharacter('B');
michael@0 257 else {
michael@0 258 delegate.assertionWordBoundary(true);
michael@0 259 return false;
michael@0 260 }
michael@0 261 break;
michael@0 262
michael@0 263 // CharacterClassEscape
michael@0 264 case 'd':
michael@0 265 consume();
michael@0 266 delegate.atomBuiltInCharacterClass(DigitClassID, false);
michael@0 267 break;
michael@0 268 case 's':
michael@0 269 consume();
michael@0 270 delegate.atomBuiltInCharacterClass(SpaceClassID, false);
michael@0 271 break;
michael@0 272 case 'w':
michael@0 273 consume();
michael@0 274 delegate.atomBuiltInCharacterClass(WordClassID, false);
michael@0 275 break;
michael@0 276 case 'D':
michael@0 277 consume();
michael@0 278 delegate.atomBuiltInCharacterClass(DigitClassID, true);
michael@0 279 break;
michael@0 280 case 'S':
michael@0 281 consume();
michael@0 282 delegate.atomBuiltInCharacterClass(SpaceClassID, true);
michael@0 283 break;
michael@0 284 case 'W':
michael@0 285 consume();
michael@0 286 delegate.atomBuiltInCharacterClass(WordClassID, true);
michael@0 287 break;
michael@0 288
michael@0 289 // DecimalEscape
michael@0 290 case '1':
michael@0 291 case '2':
michael@0 292 case '3':
michael@0 293 case '4':
michael@0 294 case '5':
michael@0 295 case '6':
michael@0 296 case '7':
michael@0 297 case '8':
michael@0 298 case '9': {
michael@0 299 // To match Firefox, we parse an invalid backreference in the range [1-7] as an octal escape.
michael@0 300 // First, try to parse this as backreference.
michael@0 301 if (!inCharacterClass) {
michael@0 302 ParseState state = saveState();
michael@0 303
michael@0 304 unsigned backReference;
michael@0 305 if (!consumeNumber(backReference))
michael@0 306 break;
michael@0 307 if (backReference <= m_backReferenceLimit) {
michael@0 308 delegate.atomBackReference(backReference);
michael@0 309 break;
michael@0 310 }
michael@0 311
michael@0 312 restoreState(state);
michael@0 313 }
michael@0 314
michael@0 315 // Not a backreference, and not octal.
michael@0 316 if (peek() >= '8') {
michael@0 317 delegate.atomPatternCharacter('\\');
michael@0 318 break;
michael@0 319 }
michael@0 320
michael@0 321 // Fall-through to handle this as an octal escape.
michael@0 322 }
michael@0 323
michael@0 324 // Octal escape
michael@0 325 case '0':
michael@0 326 delegate.atomPatternCharacter(consumeOctal());
michael@0 327 break;
michael@0 328
michael@0 329 // ControlEscape
michael@0 330 case 'f':
michael@0 331 consume();
michael@0 332 delegate.atomPatternCharacter('\f');
michael@0 333 break;
michael@0 334 case 'n':
michael@0 335 consume();
michael@0 336 delegate.atomPatternCharacter('\n');
michael@0 337 break;
michael@0 338 case 'r':
michael@0 339 consume();
michael@0 340 delegate.atomPatternCharacter('\r');
michael@0 341 break;
michael@0 342 case 't':
michael@0 343 consume();
michael@0 344 delegate.atomPatternCharacter('\t');
michael@0 345 break;
michael@0 346 case 'v':
michael@0 347 consume();
michael@0 348 delegate.atomPatternCharacter('\v');
michael@0 349 break;
michael@0 350
michael@0 351 // ControlLetter
michael@0 352 case 'c': {
michael@0 353 ParseState state = saveState();
michael@0 354 consume();
michael@0 355 if (!atEndOfPattern()) {
michael@0 356 int control = consume();
michael@0 357
michael@0 358 // To match Firefox, inside a character class, we also accept numbers and '_' as control characters.
michael@0 359 if (inCharacterClass ? WTF::isASCIIAlphanumeric(control) || (control == '_') : WTF::isASCIIAlpha(control)) {
michael@0 360 delegate.atomPatternCharacter(control & 0x1f);
michael@0 361 break;
michael@0 362 }
michael@0 363 }
michael@0 364 restoreState(state);
michael@0 365 delegate.atomPatternCharacter('\\');
michael@0 366 break;
michael@0 367 }
michael@0 368
michael@0 369 // HexEscape
michael@0 370 case 'x': {
michael@0 371 consume();
michael@0 372 int x = tryConsumeHex(2);
michael@0 373 if (x == -1)
michael@0 374 delegate.atomPatternCharacter('x');
michael@0 375 else
michael@0 376 delegate.atomPatternCharacter(x);
michael@0 377 break;
michael@0 378 }
michael@0 379
michael@0 380 // UnicodeEscape
michael@0 381 case 'u': {
michael@0 382 consume();
michael@0 383 int u = tryConsumeHex(4);
michael@0 384 if (u == -1)
michael@0 385 delegate.atomPatternCharacter('u');
michael@0 386 else
michael@0 387 delegate.atomPatternCharacter(u);
michael@0 388 break;
michael@0 389 }
michael@0 390
michael@0 391 // IdentityEscape
michael@0 392 default:
michael@0 393 delegate.atomPatternCharacter(consume());
michael@0 394 }
michael@0 395
michael@0 396 return true;
michael@0 397 }
michael@0 398
michael@0 399 /*
michael@0 400 * parseAtomEscape(), parseCharacterClassEscape():
michael@0 401 *
michael@0 402 * These methods alias to parseEscape().
michael@0 403 */
michael@0 404 bool parseAtomEscape()
michael@0 405 {
michael@0 406 return parseEscape<false>(m_delegate);
michael@0 407 }
michael@0 408 void parseCharacterClassEscape(CharacterClassParserDelegate& delegate)
michael@0 409 {
michael@0 410 parseEscape<true>(delegate);
michael@0 411 }
michael@0 412
michael@0 413 /*
michael@0 414 * parseCharacterClass():
michael@0 415 *
michael@0 416 * Helper for parseTokens(); calls dirctly and indirectly (via parseCharacterClassEscape)
michael@0 417 * to an instance of CharacterClassParserDelegate, to describe the character class to the
michael@0 418 * delegate.
michael@0 419 */
michael@0 420 void parseCharacterClass()
michael@0 421 {
michael@0 422 ASSERT(!m_err);
michael@0 423 ASSERT(peek() == '[');
michael@0 424 consume();
michael@0 425
michael@0 426 CharacterClassParserDelegate characterClassConstructor(m_delegate, m_err);
michael@0 427
michael@0 428 characterClassConstructor.begin(tryConsume('^'));
michael@0 429
michael@0 430 while (!atEndOfPattern()) {
michael@0 431 switch (peek()) {
michael@0 432 case ']':
michael@0 433 consume();
michael@0 434 characterClassConstructor.end();
michael@0 435 return;
michael@0 436
michael@0 437 case '\\':
michael@0 438 parseCharacterClassEscape(characterClassConstructor);
michael@0 439 break;
michael@0 440
michael@0 441 default:
michael@0 442 characterClassConstructor.atomPatternCharacter(consume(), true);
michael@0 443 }
michael@0 444
michael@0 445 if (m_err)
michael@0 446 return;
michael@0 447 }
michael@0 448
michael@0 449 m_err = CharacterClassUnmatched;
michael@0 450 }
michael@0 451
michael@0 452 /*
michael@0 453 * parseParenthesesBegin():
michael@0 454 *
michael@0 455 * Helper for parseTokens(); checks for parentheses types other than regular capturing subpatterns.
michael@0 456 */
michael@0 457 void parseParenthesesBegin()
michael@0 458 {
michael@0 459 ASSERT(!m_err);
michael@0 460 ASSERT(peek() == '(');
michael@0 461 consume();
michael@0 462
michael@0 463 if (tryConsume('?')) {
michael@0 464 if (atEndOfPattern()) {
michael@0 465 m_err = ParenthesesTypeInvalid;
michael@0 466 return;
michael@0 467 }
michael@0 468
michael@0 469 switch (consume()) {
michael@0 470 case ':':
michael@0 471 m_delegate.atomParenthesesSubpatternBegin(false);
michael@0 472 break;
michael@0 473
michael@0 474 case '=':
michael@0 475 m_delegate.atomParentheticalAssertionBegin();
michael@0 476 break;
michael@0 477
michael@0 478 case '!':
michael@0 479 m_delegate.atomParentheticalAssertionBegin(true);
michael@0 480 break;
michael@0 481
michael@0 482 default:
michael@0 483 m_err = ParenthesesTypeInvalid;
michael@0 484 }
michael@0 485 } else
michael@0 486 m_delegate.atomParenthesesSubpatternBegin();
michael@0 487
michael@0 488 ++m_parenthesesNestingDepth;
michael@0 489 }
michael@0 490
michael@0 491 /*
michael@0 492 * parseParenthesesEnd():
michael@0 493 *
michael@0 494 * Helper for parseTokens(); checks for parse errors (due to unmatched parentheses).
michael@0 495 */
michael@0 496 void parseParenthesesEnd()
michael@0 497 {
michael@0 498 ASSERT(!m_err);
michael@0 499 ASSERT(peek() == ')');
michael@0 500 consume();
michael@0 501
michael@0 502 if (m_parenthesesNestingDepth > 0)
michael@0 503 m_delegate.atomParenthesesEnd();
michael@0 504 else
michael@0 505 m_err = ParenthesesUnmatched;
michael@0 506
michael@0 507 --m_parenthesesNestingDepth;
michael@0 508 }
michael@0 509
michael@0 510 /*
michael@0 511 * parseQuantifier():
michael@0 512 *
michael@0 513 * Helper for parseTokens(); checks for parse errors and non-greedy quantifiers.
michael@0 514 */
michael@0 515 void parseQuantifier(bool lastTokenWasAnAtom, unsigned min, unsigned max)
michael@0 516 {
michael@0 517 ASSERT(!m_err);
michael@0 518 ASSERT(min <= max);
michael@0 519
michael@0 520 if (min == UINT_MAX) {
michael@0 521 m_err = QuantifierTooLarge;
michael@0 522 return;
michael@0 523 }
michael@0 524
michael@0 525 if (lastTokenWasAnAtom)
michael@0 526 m_delegate.quantifyAtom(min, max, !tryConsume('?'));
michael@0 527 else
michael@0 528 m_err = QuantifierWithoutAtom;
michael@0 529 }
michael@0 530
michael@0 531 /*
michael@0 532 * parseTokens():
michael@0 533 *
michael@0 534 * This method loops over the input pattern reporting tokens to the delegate.
michael@0 535 * The method returns when a parse error is detected, or the end of the pattern
michael@0 536 * is reached. One piece of state is tracked around the loop, which is whether
michael@0 537 * the last token passed to the delegate was an atom (this is necessary to detect
michael@0 538 * a parse error when a quantifier provided without an atom to quantify).
michael@0 539 */
michael@0 540 void parseTokens()
michael@0 541 {
michael@0 542 bool lastTokenWasAnAtom = false;
michael@0 543
michael@0 544 while (!atEndOfPattern()) {
michael@0 545 switch (peek()) {
michael@0 546 case '|':
michael@0 547 consume();
michael@0 548 m_delegate.disjunction();
michael@0 549 lastTokenWasAnAtom = false;
michael@0 550 break;
michael@0 551
michael@0 552 case '(':
michael@0 553 parseParenthesesBegin();
michael@0 554 lastTokenWasAnAtom = false;
michael@0 555 break;
michael@0 556
michael@0 557 case ')':
michael@0 558 parseParenthesesEnd();
michael@0 559 lastTokenWasAnAtom = true;
michael@0 560 break;
michael@0 561
michael@0 562 case '^':
michael@0 563 consume();
michael@0 564 m_delegate.assertionBOL();
michael@0 565 lastTokenWasAnAtom = false;
michael@0 566 break;
michael@0 567
michael@0 568 case '$':
michael@0 569 consume();
michael@0 570 m_delegate.assertionEOL();
michael@0 571 lastTokenWasAnAtom = false;
michael@0 572 break;
michael@0 573
michael@0 574 case '.':
michael@0 575 consume();
michael@0 576 m_delegate.atomBuiltInCharacterClass(NewlineClassID, true);
michael@0 577 lastTokenWasAnAtom = true;
michael@0 578 break;
michael@0 579
michael@0 580 case '[':
michael@0 581 parseCharacterClass();
michael@0 582 lastTokenWasAnAtom = true;
michael@0 583 break;
michael@0 584
michael@0 585 case '\\':
michael@0 586 lastTokenWasAnAtom = parseAtomEscape();
michael@0 587 break;
michael@0 588
michael@0 589 case '*':
michael@0 590 consume();
michael@0 591 parseQuantifier(lastTokenWasAnAtom, 0, quantifyInfinite);
michael@0 592 lastTokenWasAnAtom = false;
michael@0 593 break;
michael@0 594
michael@0 595 case '+':
michael@0 596 consume();
michael@0 597 parseQuantifier(lastTokenWasAnAtom, 1, quantifyInfinite);
michael@0 598 lastTokenWasAnAtom = false;
michael@0 599 break;
michael@0 600
michael@0 601 case '?':
michael@0 602 consume();
michael@0 603 parseQuantifier(lastTokenWasAnAtom, 0, 1);
michael@0 604 lastTokenWasAnAtom = false;
michael@0 605 break;
michael@0 606
michael@0 607 case '{': {
michael@0 608 ParseState state = saveState();
michael@0 609
michael@0 610 consume();
michael@0 611 if (peekIsDigit()) {
michael@0 612 unsigned min;
michael@0 613 if (!consumeNumber(min))
michael@0 614 break;
michael@0 615
michael@0 616 unsigned max = min;
michael@0 617 if (tryConsume(',')) {
michael@0 618 if (peekIsDigit()) {
michael@0 619 if (!consumeNumber(max))
michael@0 620 break;
michael@0 621 } else {
michael@0 622 max = quantifyInfinite;
michael@0 623 }
michael@0 624 }
michael@0 625
michael@0 626 if (tryConsume('}')) {
michael@0 627 if (min <= max)
michael@0 628 parseQuantifier(lastTokenWasAnAtom, min, max);
michael@0 629 else
michael@0 630 m_err = QuantifierOutOfOrder;
michael@0 631 lastTokenWasAnAtom = false;
michael@0 632 break;
michael@0 633 }
michael@0 634 }
michael@0 635
michael@0 636 restoreState(state);
michael@0 637 } // if we did not find a complete quantifer, fall through to the default case.
michael@0 638
michael@0 639 default:
michael@0 640 m_delegate.atomPatternCharacter(consume());
michael@0 641 lastTokenWasAnAtom = true;
michael@0 642 }
michael@0 643
michael@0 644 if (m_err)
michael@0 645 return;
michael@0 646 }
michael@0 647
michael@0 648 if (m_parenthesesNestingDepth > 0)
michael@0 649 m_err = MissingParentheses;
michael@0 650 }
michael@0 651
michael@0 652 /*
michael@0 653 * parse():
michael@0 654 *
michael@0 655 * This method calls parseTokens() to parse over the input and converts any
michael@0 656 * error code to a const char* for a result.
michael@0 657 */
michael@0 658 ErrorCode parse()
michael@0 659 {
michael@0 660 if (m_size > MAX_PATTERN_SIZE)
michael@0 661 m_err = PatternTooLarge;
michael@0 662 else
michael@0 663 parseTokens();
michael@0 664 ASSERT(atEndOfPattern() || m_err);
michael@0 665
michael@0 666 return m_err;
michael@0 667 }
michael@0 668
michael@0 669 // Misc helper functions:
michael@0 670
michael@0 671 typedef unsigned ParseState;
michael@0 672
michael@0 673 ParseState saveState()
michael@0 674 {
michael@0 675 return m_index;
michael@0 676 }
michael@0 677
michael@0 678 void restoreState(ParseState state)
michael@0 679 {
michael@0 680 m_index = state;
michael@0 681 }
michael@0 682
michael@0 683 bool atEndOfPattern()
michael@0 684 {
michael@0 685 ASSERT(m_index <= m_size);
michael@0 686 return m_index == m_size;
michael@0 687 }
michael@0 688
michael@0 689 int peek()
michael@0 690 {
michael@0 691 ASSERT(m_index < m_size);
michael@0 692 return m_data[m_index];
michael@0 693 }
michael@0 694
michael@0 695 bool peekIsDigit()
michael@0 696 {
michael@0 697 return !atEndOfPattern() && WTF::isASCIIDigit(peek());
michael@0 698 }
michael@0 699
michael@0 700 unsigned peekDigit()
michael@0 701 {
michael@0 702 ASSERT(peekIsDigit());
michael@0 703 return peek() - '0';
michael@0 704 }
michael@0 705
michael@0 706 int consume()
michael@0 707 {
michael@0 708 ASSERT(m_index < m_size);
michael@0 709 return m_data[m_index++];
michael@0 710 }
michael@0 711
michael@0 712 unsigned consumeDigit()
michael@0 713 {
michael@0 714 ASSERT(peekIsDigit());
michael@0 715 return consume() - '0';
michael@0 716 }
michael@0 717
michael@0 718 bool consumeNumber(unsigned &accum)
michael@0 719 {
michael@0 720 accum = consumeDigit();
michael@0 721 while (peekIsDigit()) {
michael@0 722 unsigned newValue = accum * 10 + peekDigit();
michael@0 723 if (newValue < accum) { /* Overflow check. */
michael@0 724 m_err = QuantifierTooLarge;
michael@0 725 return false;
michael@0 726 }
michael@0 727 accum = newValue;
michael@0 728 consume();
michael@0 729 }
michael@0 730 return true;
michael@0 731 }
michael@0 732
michael@0 733 unsigned consumeOctal()
michael@0 734 {
michael@0 735 ASSERT(WTF::isASCIIOctalDigit(peek()));
michael@0 736
michael@0 737 unsigned n = consumeDigit();
michael@0 738 while (n < 32 && !atEndOfPattern() && WTF::isASCIIOctalDigit(peek()))
michael@0 739 n = n * 8 + consumeDigit();
michael@0 740 return n;
michael@0 741 }
michael@0 742
michael@0 743 bool tryConsume(UChar ch)
michael@0 744 {
michael@0 745 if (atEndOfPattern() || (m_data[m_index] != ch))
michael@0 746 return false;
michael@0 747 ++m_index;
michael@0 748 return true;
michael@0 749 }
michael@0 750
michael@0 751 int tryConsumeHex(int count)
michael@0 752 {
michael@0 753 ParseState state = saveState();
michael@0 754
michael@0 755 int n = 0;
michael@0 756 while (count--) {
michael@0 757 if (atEndOfPattern() || !WTF::isASCIIHexDigit(peek())) {
michael@0 758 restoreState(state);
michael@0 759 return -1;
michael@0 760 }
michael@0 761 n = (n << 4) | WTF::toASCIIHexValue(consume());
michael@0 762 }
michael@0 763 return n;
michael@0 764 }
michael@0 765
michael@0 766 Delegate& m_delegate;
michael@0 767 unsigned m_backReferenceLimit;
michael@0 768 ErrorCode m_err;
michael@0 769 const CharType* m_data;
michael@0 770 unsigned m_size;
michael@0 771 unsigned m_index;
michael@0 772 unsigned m_parenthesesNestingDepth;
michael@0 773
michael@0 774 // Derived by empirical testing of compile time in PCRE and WREC.
michael@0 775 static const unsigned MAX_PATTERN_SIZE = 1024 * 1024;
michael@0 776 };
michael@0 777
michael@0 778 /*
michael@0 779 * Yarr::parse():
michael@0 780 *
michael@0 781 * The parse method is passed a pattern to be parsed and a delegate upon which
michael@0 782 * callbacks will be made to record the parsed tokens forming the regex.
michael@0 783 * Yarr::parse() returns null on success, or a const C string providing an error
michael@0 784 * message where a parse error occurs.
michael@0 785 *
michael@0 786 * The Delegate must implement the following interface:
michael@0 787 *
michael@0 788 * void assertionBOL();
michael@0 789 * void assertionEOL();
michael@0 790 * void assertionWordBoundary(bool invert);
michael@0 791 *
michael@0 792 * void atomPatternCharacter(UChar ch);
michael@0 793 * void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert);
michael@0 794 * void atomCharacterClassBegin(bool invert)
michael@0 795 * void atomCharacterClassAtom(UChar ch)
michael@0 796 * void atomCharacterClassRange(UChar begin, UChar end)
michael@0 797 * void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert)
michael@0 798 * void atomCharacterClassEnd()
michael@0 799 * void atomParenthesesSubpatternBegin(bool capture = true);
michael@0 800 * void atomParentheticalAssertionBegin(bool invert = false);
michael@0 801 * void atomParenthesesEnd();
michael@0 802 * void atomBackReference(unsigned subpatternId);
michael@0 803 *
michael@0 804 * void quantifyAtom(unsigned min, unsigned max, bool greedy);
michael@0 805 *
michael@0 806 * void disjunction();
michael@0 807 *
michael@0 808 * The regular expression is described by a sequence of assertion*() and atom*()
michael@0 809 * callbacks to the delegate, describing the terms in the regular expression.
michael@0 810 * Following an atom a quantifyAtom() call may occur to indicate that the previous
michael@0 811 * atom should be quantified. In the case of atoms described across multiple
michael@0 812 * calls (parentheses and character classes) the call to quantifyAtom() will come
michael@0 813 * after the call to the atom*End() method, never after atom*Begin().
michael@0 814 *
michael@0 815 * Character classes may either be described by a single call to
michael@0 816 * atomBuiltInCharacterClass(), or by a sequence of atomCharacterClass*() calls.
michael@0 817 * In the latter case, ...Begin() will be called, followed by a sequence of
michael@0 818 * calls to ...Atom(), ...Range(), and ...BuiltIn(), followed by a call to ...End().
michael@0 819 *
michael@0 820 * Sequences of atoms and assertions are broken into alternatives via calls to
michael@0 821 * disjunction(). Assertions, atoms, and disjunctions emitted between calls to
michael@0 822 * atomParenthesesBegin() and atomParenthesesEnd() form the body of a subpattern.
michael@0 823 * atomParenthesesBegin() is passed a subpatternId. In the case of a regular
michael@0 824 * capturing subpattern, this will be the subpatternId associated with these
michael@0 825 * parentheses, and will also by definition be the lowest subpatternId of these
michael@0 826 * parentheses and of any nested paretheses. The atomParenthesesEnd() method
michael@0 827 * is passed the subpatternId of the last capturing subexpression nested within
michael@0 828 * these paretheses. In the case of a capturing subpattern with no nested
michael@0 829 * capturing subpatterns, the same subpatternId will be passed to the begin and
michael@0 830 * end functions. In the case of non-capturing subpatterns the subpatternId
michael@0 831 * passed to the begin method is also the first possible subpatternId that might
michael@0 832 * be nested within these paretheses. If a set of non-capturing parentheses does
michael@0 833 * not contain any capturing subpatterns, then the subpatternId passed to begin
michael@0 834 * will be greater than the subpatternId passed to end.
michael@0 835 */
michael@0 836
michael@0 837 template<class Delegate>
michael@0 838 ErrorCode parse(Delegate& delegate, const String& pattern, unsigned backReferenceLimit = quantifyInfinite)
michael@0 839 {
michael@0 840 #ifdef YARR_8BIT_CHAR_SUPPORT
michael@0 841 if (pattern.is8Bit())
michael@0 842 return Parser<Delegate, LChar>(delegate, pattern, backReferenceLimit).parse();
michael@0 843 #endif
michael@0 844 return Parser<Delegate, UChar>(delegate, pattern, backReferenceLimit).parse();
michael@0 845 }
michael@0 846
michael@0 847 } } // namespace JSC::Yarr
michael@0 848
michael@0 849 #endif /* yarr_YarrParser_h */

mercurial