1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/js/src/yarr/YarrParser.h Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,849 @@ 1.4 +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*- 1.5 + * vim: set ts=8 sts=4 et sw=4 tw=99: 1.6 + * 1.7 + * Copyright (C) 2009 Apple Inc. All rights reserved. 1.8 + * 1.9 + * Redistribution and use in source and binary forms, with or without 1.10 + * modification, are permitted provided that the following conditions 1.11 + * are met: 1.12 + * 1. Redistributions of source code must retain the above copyright 1.13 + * notice, this list of conditions and the following disclaimer. 1.14 + * 2. Redistributions in binary form must reproduce the above copyright 1.15 + * notice, this list of conditions and the following disclaimer in the 1.16 + * documentation and/or other materials provided with the distribution. 1.17 + * 1.18 + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY 1.19 + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 1.20 + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 1.21 + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR 1.22 + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 1.23 + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 1.24 + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 1.25 + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 1.26 + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 1.27 + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 1.28 + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 1.29 + */ 1.30 + 1.31 +#ifndef yarr_YarrParser_h 1.32 +#define yarr_YarrParser_h 1.33 + 1.34 +#include "yarr/Yarr.h" 1.35 + 1.36 +namespace JSC { namespace Yarr { 1.37 + 1.38 +enum BuiltInCharacterClassID { 1.39 + DigitClassID, 1.40 + SpaceClassID, 1.41 + WordClassID, 1.42 + NewlineClassID 1.43 +}; 1.44 + 1.45 +// The Parser class should not be used directly - only via the Yarr::parse() method. 1.46 +template<class Delegate, typename CharType> 1.47 +class Parser { 1.48 +private: 1.49 + template<class FriendDelegate> 1.50 + friend ErrorCode parse(FriendDelegate&, const String& pattern, unsigned backReferenceLimit); 1.51 + 1.52 + /* 1.53 + * CharacterClassParserDelegate: 1.54 + * 1.55 + * The class CharacterClassParserDelegate is used in the parsing of character 1.56 + * classes. This class handles detection of character ranges. This class 1.57 + * implements enough of the delegate interface such that it can be passed to 1.58 + * parseEscape() as an EscapeDelegate. This allows parseEscape() to be reused 1.59 + * to perform the parsing of escape characters in character sets. 1.60 + */ 1.61 + class CharacterClassParserDelegate { 1.62 + public: 1.63 + CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err) 1.64 + : m_delegate(delegate) 1.65 + , m_err(err) 1.66 + , m_state(Empty) 1.67 + , m_character(0) 1.68 + { 1.69 + } 1.70 + 1.71 + /* 1.72 + * begin(): 1.73 + * 1.74 + * Called at beginning of construction. 1.75 + */ 1.76 + void begin(bool invert) 1.77 + { 1.78 + m_delegate.atomCharacterClassBegin(invert); 1.79 + } 1.80 + 1.81 + /* 1.82 + * atomPatternCharacter(): 1.83 + * 1.84 + * This method is called either from parseCharacterClass() (for an unescaped 1.85 + * character in a character class), or from parseEscape(). In the former case 1.86 + * the value true will be passed for the argument 'hyphenIsRange', and in this 1.87 + * mode we will allow a hypen to be treated as indicating a range (i.e. /[a-z]/ 1.88 + * is different to /[a\-z]/). 1.89 + */ 1.90 + void atomPatternCharacter(UChar ch, bool hyphenIsRange = false) 1.91 + { 1.92 + switch (m_state) { 1.93 + case AfterCharacterClass: 1.94 + // Following a builtin character class we need look out for a hyphen. 1.95 + // We're looking for invalid ranges, such as /[\d-x]/ or /[\d-\d]/. 1.96 + // If we see a hyphen following a charater class then unlike usual 1.97 + // we'll report it to the delegate immediately, and put ourself into 1.98 + // a poisoned state. Any following calls to add another character or 1.99 + // character class will result in an error. (A hypen following a 1.100 + // character-class is itself valid, but only at the end of a regex). 1.101 + if (hyphenIsRange && ch == '-') { 1.102 + m_delegate.atomCharacterClassAtom('-'); 1.103 + m_state = AfterCharacterClassHyphen; 1.104 + return; 1.105 + } 1.106 + // Otherwise just fall through - cached character so treat this as Empty. 1.107 + 1.108 + case Empty: 1.109 + m_character = ch; 1.110 + m_state = CachedCharacter; 1.111 + return; 1.112 + 1.113 + case CachedCharacter: 1.114 + if (hyphenIsRange && ch == '-') 1.115 + m_state = CachedCharacterHyphen; 1.116 + else { 1.117 + m_delegate.atomCharacterClassAtom(m_character); 1.118 + m_character = ch; 1.119 + } 1.120 + return; 1.121 + 1.122 + case CachedCharacterHyphen: 1.123 + if (ch < m_character) { 1.124 + m_err = CharacterClassOutOfOrder; 1.125 + return; 1.126 + } 1.127 + m_delegate.atomCharacterClassRange(m_character, ch); 1.128 + m_state = Empty; 1.129 + return; 1.130 + 1.131 + case AfterCharacterClassHyphen: 1.132 + m_delegate.atomCharacterClassAtom(ch); 1.133 + m_state = Empty; 1.134 + return; 1.135 + } 1.136 + } 1.137 + 1.138 + /* 1.139 + * atomBuiltInCharacterClass(): 1.140 + * 1.141 + * Adds a built-in character class, called by parseEscape(). 1.142 + */ 1.143 + void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert) 1.144 + { 1.145 + switch (m_state) { 1.146 + case CachedCharacter: 1.147 + // Flush the currently cached character, then fall through. 1.148 + m_delegate.atomCharacterClassAtom(m_character); 1.149 + 1.150 + case Empty: 1.151 + case AfterCharacterClass: 1.152 + m_state = AfterCharacterClass; 1.153 + m_delegate.atomCharacterClassBuiltIn(classID, invert); 1.154 + return; 1.155 + 1.156 + case CachedCharacterHyphen: 1.157 + // Error! We have a range that looks like [x-\d]. We require 1.158 + // the end of the range to be a single character. 1.159 + m_err = CharacterClassInvalidRange; 1.160 + return; 1.161 + case AfterCharacterClassHyphen: 1.162 + m_delegate.atomCharacterClassBuiltIn(classID, invert); 1.163 + m_state = Empty; 1.164 + return; 1.165 + } 1.166 + } 1.167 + 1.168 + /* 1.169 + * end(): 1.170 + * 1.171 + * Called at end of construction. 1.172 + */ 1.173 + void end() 1.174 + { 1.175 + if (m_state == CachedCharacter) 1.176 + m_delegate.atomCharacterClassAtom(m_character); 1.177 + else if (m_state == CachedCharacterHyphen) { 1.178 + m_delegate.atomCharacterClassAtom(m_character); 1.179 + m_delegate.atomCharacterClassAtom('-'); 1.180 + } 1.181 + m_delegate.atomCharacterClassEnd(); 1.182 + } 1.183 + 1.184 + // parseEscape() should never call these delegate methods when 1.185 + // invoked with inCharacterClass set. 1.186 + NO_RETURN_DUE_TO_ASSERT void assertionWordBoundary(bool) { ASSERT_NOT_REACHED(); } 1.187 + NO_RETURN_DUE_TO_ASSERT void atomBackReference(unsigned) { ASSERT_NOT_REACHED(); } 1.188 + 1.189 + private: 1.190 + Delegate& m_delegate; 1.191 + ErrorCode& m_err; 1.192 + enum CharacterClassConstructionState { 1.193 + Empty, 1.194 + CachedCharacter, 1.195 + CachedCharacterHyphen, 1.196 + AfterCharacterClass, 1.197 + AfterCharacterClassHyphen 1.198 + } m_state; 1.199 + UChar m_character; 1.200 + }; 1.201 + 1.202 + Parser(Delegate& delegate, const String& pattern, unsigned backReferenceLimit) 1.203 + : m_delegate(delegate) 1.204 + , m_backReferenceLimit(backReferenceLimit) 1.205 + , m_err(NoError) 1.206 + , m_data(pattern.chars()) 1.207 + , m_size(pattern.length()) 1.208 + , m_index(0) 1.209 + , m_parenthesesNestingDepth(0) 1.210 + { 1.211 + } 1.212 + 1.213 + /* 1.214 + * parseEscape(): 1.215 + * 1.216 + * Helper for parseTokens() AND parseCharacterClass(). 1.217 + * Unlike the other parser methods, this function does not report tokens 1.218 + * directly to the member delegate (m_delegate), instead tokens are 1.219 + * emitted to the delegate provided as an argument. In the case of atom 1.220 + * escapes, parseTokens() will call parseEscape() passing m_delegate as 1.221 + * an argument, and as such the escape will be reported to the delegate. 1.222 + * 1.223 + * However this method may also be used by parseCharacterClass(), in which 1.224 + * case a CharacterClassParserDelegate will be passed as the delegate that 1.225 + * tokens should be added to. A boolean flag is also provided to indicate 1.226 + * whether that an escape in a CharacterClass is being parsed (some parsing 1.227 + * rules change in this context). 1.228 + * 1.229 + * The boolean value returned by this method indicates whether the token 1.230 + * parsed was an atom (outside of a characted class \b and \B will be 1.231 + * interpreted as assertions). 1.232 + */ 1.233 + template<bool inCharacterClass, class EscapeDelegate> 1.234 + bool parseEscape(EscapeDelegate& delegate) 1.235 + { 1.236 + ASSERT(!m_err); 1.237 + ASSERT(peek() == '\\'); 1.238 + consume(); 1.239 + 1.240 + if (atEndOfPattern()) { 1.241 + m_err = EscapeUnterminated; 1.242 + return false; 1.243 + } 1.244 + 1.245 + switch (peek()) { 1.246 + // Assertions 1.247 + case 'b': 1.248 + consume(); 1.249 + if (inCharacterClass) 1.250 + delegate.atomPatternCharacter('\b'); 1.251 + else { 1.252 + delegate.assertionWordBoundary(false); 1.253 + return false; 1.254 + } 1.255 + break; 1.256 + case 'B': 1.257 + consume(); 1.258 + if (inCharacterClass) 1.259 + delegate.atomPatternCharacter('B'); 1.260 + else { 1.261 + delegate.assertionWordBoundary(true); 1.262 + return false; 1.263 + } 1.264 + break; 1.265 + 1.266 + // CharacterClassEscape 1.267 + case 'd': 1.268 + consume(); 1.269 + delegate.atomBuiltInCharacterClass(DigitClassID, false); 1.270 + break; 1.271 + case 's': 1.272 + consume(); 1.273 + delegate.atomBuiltInCharacterClass(SpaceClassID, false); 1.274 + break; 1.275 + case 'w': 1.276 + consume(); 1.277 + delegate.atomBuiltInCharacterClass(WordClassID, false); 1.278 + break; 1.279 + case 'D': 1.280 + consume(); 1.281 + delegate.atomBuiltInCharacterClass(DigitClassID, true); 1.282 + break; 1.283 + case 'S': 1.284 + consume(); 1.285 + delegate.atomBuiltInCharacterClass(SpaceClassID, true); 1.286 + break; 1.287 + case 'W': 1.288 + consume(); 1.289 + delegate.atomBuiltInCharacterClass(WordClassID, true); 1.290 + break; 1.291 + 1.292 + // DecimalEscape 1.293 + case '1': 1.294 + case '2': 1.295 + case '3': 1.296 + case '4': 1.297 + case '5': 1.298 + case '6': 1.299 + case '7': 1.300 + case '8': 1.301 + case '9': { 1.302 + // To match Firefox, we parse an invalid backreference in the range [1-7] as an octal escape. 1.303 + // First, try to parse this as backreference. 1.304 + if (!inCharacterClass) { 1.305 + ParseState state = saveState(); 1.306 + 1.307 + unsigned backReference; 1.308 + if (!consumeNumber(backReference)) 1.309 + break; 1.310 + if (backReference <= m_backReferenceLimit) { 1.311 + delegate.atomBackReference(backReference); 1.312 + break; 1.313 + } 1.314 + 1.315 + restoreState(state); 1.316 + } 1.317 + 1.318 + // Not a backreference, and not octal. 1.319 + if (peek() >= '8') { 1.320 + delegate.atomPatternCharacter('\\'); 1.321 + break; 1.322 + } 1.323 + 1.324 + // Fall-through to handle this as an octal escape. 1.325 + } 1.326 + 1.327 + // Octal escape 1.328 + case '0': 1.329 + delegate.atomPatternCharacter(consumeOctal()); 1.330 + break; 1.331 + 1.332 + // ControlEscape 1.333 + case 'f': 1.334 + consume(); 1.335 + delegate.atomPatternCharacter('\f'); 1.336 + break; 1.337 + case 'n': 1.338 + consume(); 1.339 + delegate.atomPatternCharacter('\n'); 1.340 + break; 1.341 + case 'r': 1.342 + consume(); 1.343 + delegate.atomPatternCharacter('\r'); 1.344 + break; 1.345 + case 't': 1.346 + consume(); 1.347 + delegate.atomPatternCharacter('\t'); 1.348 + break; 1.349 + case 'v': 1.350 + consume(); 1.351 + delegate.atomPatternCharacter('\v'); 1.352 + break; 1.353 + 1.354 + // ControlLetter 1.355 + case 'c': { 1.356 + ParseState state = saveState(); 1.357 + consume(); 1.358 + if (!atEndOfPattern()) { 1.359 + int control = consume(); 1.360 + 1.361 + // To match Firefox, inside a character class, we also accept numbers and '_' as control characters. 1.362 + if (inCharacterClass ? WTF::isASCIIAlphanumeric(control) || (control == '_') : WTF::isASCIIAlpha(control)) { 1.363 + delegate.atomPatternCharacter(control & 0x1f); 1.364 + break; 1.365 + } 1.366 + } 1.367 + restoreState(state); 1.368 + delegate.atomPatternCharacter('\\'); 1.369 + break; 1.370 + } 1.371 + 1.372 + // HexEscape 1.373 + case 'x': { 1.374 + consume(); 1.375 + int x = tryConsumeHex(2); 1.376 + if (x == -1) 1.377 + delegate.atomPatternCharacter('x'); 1.378 + else 1.379 + delegate.atomPatternCharacter(x); 1.380 + break; 1.381 + } 1.382 + 1.383 + // UnicodeEscape 1.384 + case 'u': { 1.385 + consume(); 1.386 + int u = tryConsumeHex(4); 1.387 + if (u == -1) 1.388 + delegate.atomPatternCharacter('u'); 1.389 + else 1.390 + delegate.atomPatternCharacter(u); 1.391 + break; 1.392 + } 1.393 + 1.394 + // IdentityEscape 1.395 + default: 1.396 + delegate.atomPatternCharacter(consume()); 1.397 + } 1.398 + 1.399 + return true; 1.400 + } 1.401 + 1.402 + /* 1.403 + * parseAtomEscape(), parseCharacterClassEscape(): 1.404 + * 1.405 + * These methods alias to parseEscape(). 1.406 + */ 1.407 + bool parseAtomEscape() 1.408 + { 1.409 + return parseEscape<false>(m_delegate); 1.410 + } 1.411 + void parseCharacterClassEscape(CharacterClassParserDelegate& delegate) 1.412 + { 1.413 + parseEscape<true>(delegate); 1.414 + } 1.415 + 1.416 + /* 1.417 + * parseCharacterClass(): 1.418 + * 1.419 + * Helper for parseTokens(); calls dirctly and indirectly (via parseCharacterClassEscape) 1.420 + * to an instance of CharacterClassParserDelegate, to describe the character class to the 1.421 + * delegate. 1.422 + */ 1.423 + void parseCharacterClass() 1.424 + { 1.425 + ASSERT(!m_err); 1.426 + ASSERT(peek() == '['); 1.427 + consume(); 1.428 + 1.429 + CharacterClassParserDelegate characterClassConstructor(m_delegate, m_err); 1.430 + 1.431 + characterClassConstructor.begin(tryConsume('^')); 1.432 + 1.433 + while (!atEndOfPattern()) { 1.434 + switch (peek()) { 1.435 + case ']': 1.436 + consume(); 1.437 + characterClassConstructor.end(); 1.438 + return; 1.439 + 1.440 + case '\\': 1.441 + parseCharacterClassEscape(characterClassConstructor); 1.442 + break; 1.443 + 1.444 + default: 1.445 + characterClassConstructor.atomPatternCharacter(consume(), true); 1.446 + } 1.447 + 1.448 + if (m_err) 1.449 + return; 1.450 + } 1.451 + 1.452 + m_err = CharacterClassUnmatched; 1.453 + } 1.454 + 1.455 + /* 1.456 + * parseParenthesesBegin(): 1.457 + * 1.458 + * Helper for parseTokens(); checks for parentheses types other than regular capturing subpatterns. 1.459 + */ 1.460 + void parseParenthesesBegin() 1.461 + { 1.462 + ASSERT(!m_err); 1.463 + ASSERT(peek() == '('); 1.464 + consume(); 1.465 + 1.466 + if (tryConsume('?')) { 1.467 + if (atEndOfPattern()) { 1.468 + m_err = ParenthesesTypeInvalid; 1.469 + return; 1.470 + } 1.471 + 1.472 + switch (consume()) { 1.473 + case ':': 1.474 + m_delegate.atomParenthesesSubpatternBegin(false); 1.475 + break; 1.476 + 1.477 + case '=': 1.478 + m_delegate.atomParentheticalAssertionBegin(); 1.479 + break; 1.480 + 1.481 + case '!': 1.482 + m_delegate.atomParentheticalAssertionBegin(true); 1.483 + break; 1.484 + 1.485 + default: 1.486 + m_err = ParenthesesTypeInvalid; 1.487 + } 1.488 + } else 1.489 + m_delegate.atomParenthesesSubpatternBegin(); 1.490 + 1.491 + ++m_parenthesesNestingDepth; 1.492 + } 1.493 + 1.494 + /* 1.495 + * parseParenthesesEnd(): 1.496 + * 1.497 + * Helper for parseTokens(); checks for parse errors (due to unmatched parentheses). 1.498 + */ 1.499 + void parseParenthesesEnd() 1.500 + { 1.501 + ASSERT(!m_err); 1.502 + ASSERT(peek() == ')'); 1.503 + consume(); 1.504 + 1.505 + if (m_parenthesesNestingDepth > 0) 1.506 + m_delegate.atomParenthesesEnd(); 1.507 + else 1.508 + m_err = ParenthesesUnmatched; 1.509 + 1.510 + --m_parenthesesNestingDepth; 1.511 + } 1.512 + 1.513 + /* 1.514 + * parseQuantifier(): 1.515 + * 1.516 + * Helper for parseTokens(); checks for parse errors and non-greedy quantifiers. 1.517 + */ 1.518 + void parseQuantifier(bool lastTokenWasAnAtom, unsigned min, unsigned max) 1.519 + { 1.520 + ASSERT(!m_err); 1.521 + ASSERT(min <= max); 1.522 + 1.523 + if (min == UINT_MAX) { 1.524 + m_err = QuantifierTooLarge; 1.525 + return; 1.526 + } 1.527 + 1.528 + if (lastTokenWasAnAtom) 1.529 + m_delegate.quantifyAtom(min, max, !tryConsume('?')); 1.530 + else 1.531 + m_err = QuantifierWithoutAtom; 1.532 + } 1.533 + 1.534 + /* 1.535 + * parseTokens(): 1.536 + * 1.537 + * This method loops over the input pattern reporting tokens to the delegate. 1.538 + * The method returns when a parse error is detected, or the end of the pattern 1.539 + * is reached. One piece of state is tracked around the loop, which is whether 1.540 + * the last token passed to the delegate was an atom (this is necessary to detect 1.541 + * a parse error when a quantifier provided without an atom to quantify). 1.542 + */ 1.543 + void parseTokens() 1.544 + { 1.545 + bool lastTokenWasAnAtom = false; 1.546 + 1.547 + while (!atEndOfPattern()) { 1.548 + switch (peek()) { 1.549 + case '|': 1.550 + consume(); 1.551 + m_delegate.disjunction(); 1.552 + lastTokenWasAnAtom = false; 1.553 + break; 1.554 + 1.555 + case '(': 1.556 + parseParenthesesBegin(); 1.557 + lastTokenWasAnAtom = false; 1.558 + break; 1.559 + 1.560 + case ')': 1.561 + parseParenthesesEnd(); 1.562 + lastTokenWasAnAtom = true; 1.563 + break; 1.564 + 1.565 + case '^': 1.566 + consume(); 1.567 + m_delegate.assertionBOL(); 1.568 + lastTokenWasAnAtom = false; 1.569 + break; 1.570 + 1.571 + case '$': 1.572 + consume(); 1.573 + m_delegate.assertionEOL(); 1.574 + lastTokenWasAnAtom = false; 1.575 + break; 1.576 + 1.577 + case '.': 1.578 + consume(); 1.579 + m_delegate.atomBuiltInCharacterClass(NewlineClassID, true); 1.580 + lastTokenWasAnAtom = true; 1.581 + break; 1.582 + 1.583 + case '[': 1.584 + parseCharacterClass(); 1.585 + lastTokenWasAnAtom = true; 1.586 + break; 1.587 + 1.588 + case '\\': 1.589 + lastTokenWasAnAtom = parseAtomEscape(); 1.590 + break; 1.591 + 1.592 + case '*': 1.593 + consume(); 1.594 + parseQuantifier(lastTokenWasAnAtom, 0, quantifyInfinite); 1.595 + lastTokenWasAnAtom = false; 1.596 + break; 1.597 + 1.598 + case '+': 1.599 + consume(); 1.600 + parseQuantifier(lastTokenWasAnAtom, 1, quantifyInfinite); 1.601 + lastTokenWasAnAtom = false; 1.602 + break; 1.603 + 1.604 + case '?': 1.605 + consume(); 1.606 + parseQuantifier(lastTokenWasAnAtom, 0, 1); 1.607 + lastTokenWasAnAtom = false; 1.608 + break; 1.609 + 1.610 + case '{': { 1.611 + ParseState state = saveState(); 1.612 + 1.613 + consume(); 1.614 + if (peekIsDigit()) { 1.615 + unsigned min; 1.616 + if (!consumeNumber(min)) 1.617 + break; 1.618 + 1.619 + unsigned max = min; 1.620 + if (tryConsume(',')) { 1.621 + if (peekIsDigit()) { 1.622 + if (!consumeNumber(max)) 1.623 + break; 1.624 + } else { 1.625 + max = quantifyInfinite; 1.626 + } 1.627 + } 1.628 + 1.629 + if (tryConsume('}')) { 1.630 + if (min <= max) 1.631 + parseQuantifier(lastTokenWasAnAtom, min, max); 1.632 + else 1.633 + m_err = QuantifierOutOfOrder; 1.634 + lastTokenWasAnAtom = false; 1.635 + break; 1.636 + } 1.637 + } 1.638 + 1.639 + restoreState(state); 1.640 + } // if we did not find a complete quantifer, fall through to the default case. 1.641 + 1.642 + default: 1.643 + m_delegate.atomPatternCharacter(consume()); 1.644 + lastTokenWasAnAtom = true; 1.645 + } 1.646 + 1.647 + if (m_err) 1.648 + return; 1.649 + } 1.650 + 1.651 + if (m_parenthesesNestingDepth > 0) 1.652 + m_err = MissingParentheses; 1.653 + } 1.654 + 1.655 + /* 1.656 + * parse(): 1.657 + * 1.658 + * This method calls parseTokens() to parse over the input and converts any 1.659 + * error code to a const char* for a result. 1.660 + */ 1.661 + ErrorCode parse() 1.662 + { 1.663 + if (m_size > MAX_PATTERN_SIZE) 1.664 + m_err = PatternTooLarge; 1.665 + else 1.666 + parseTokens(); 1.667 + ASSERT(atEndOfPattern() || m_err); 1.668 + 1.669 + return m_err; 1.670 + } 1.671 + 1.672 + // Misc helper functions: 1.673 + 1.674 + typedef unsigned ParseState; 1.675 + 1.676 + ParseState saveState() 1.677 + { 1.678 + return m_index; 1.679 + } 1.680 + 1.681 + void restoreState(ParseState state) 1.682 + { 1.683 + m_index = state; 1.684 + } 1.685 + 1.686 + bool atEndOfPattern() 1.687 + { 1.688 + ASSERT(m_index <= m_size); 1.689 + return m_index == m_size; 1.690 + } 1.691 + 1.692 + int peek() 1.693 + { 1.694 + ASSERT(m_index < m_size); 1.695 + return m_data[m_index]; 1.696 + } 1.697 + 1.698 + bool peekIsDigit() 1.699 + { 1.700 + return !atEndOfPattern() && WTF::isASCIIDigit(peek()); 1.701 + } 1.702 + 1.703 + unsigned peekDigit() 1.704 + { 1.705 + ASSERT(peekIsDigit()); 1.706 + return peek() - '0'; 1.707 + } 1.708 + 1.709 + int consume() 1.710 + { 1.711 + ASSERT(m_index < m_size); 1.712 + return m_data[m_index++]; 1.713 + } 1.714 + 1.715 + unsigned consumeDigit() 1.716 + { 1.717 + ASSERT(peekIsDigit()); 1.718 + return consume() - '0'; 1.719 + } 1.720 + 1.721 + bool consumeNumber(unsigned &accum) 1.722 + { 1.723 + accum = consumeDigit(); 1.724 + while (peekIsDigit()) { 1.725 + unsigned newValue = accum * 10 + peekDigit(); 1.726 + if (newValue < accum) { /* Overflow check. */ 1.727 + m_err = QuantifierTooLarge; 1.728 + return false; 1.729 + } 1.730 + accum = newValue; 1.731 + consume(); 1.732 + } 1.733 + return true; 1.734 + } 1.735 + 1.736 + unsigned consumeOctal() 1.737 + { 1.738 + ASSERT(WTF::isASCIIOctalDigit(peek())); 1.739 + 1.740 + unsigned n = consumeDigit(); 1.741 + while (n < 32 && !atEndOfPattern() && WTF::isASCIIOctalDigit(peek())) 1.742 + n = n * 8 + consumeDigit(); 1.743 + return n; 1.744 + } 1.745 + 1.746 + bool tryConsume(UChar ch) 1.747 + { 1.748 + if (atEndOfPattern() || (m_data[m_index] != ch)) 1.749 + return false; 1.750 + ++m_index; 1.751 + return true; 1.752 + } 1.753 + 1.754 + int tryConsumeHex(int count) 1.755 + { 1.756 + ParseState state = saveState(); 1.757 + 1.758 + int n = 0; 1.759 + while (count--) { 1.760 + if (atEndOfPattern() || !WTF::isASCIIHexDigit(peek())) { 1.761 + restoreState(state); 1.762 + return -1; 1.763 + } 1.764 + n = (n << 4) | WTF::toASCIIHexValue(consume()); 1.765 + } 1.766 + return n; 1.767 + } 1.768 + 1.769 + Delegate& m_delegate; 1.770 + unsigned m_backReferenceLimit; 1.771 + ErrorCode m_err; 1.772 + const CharType* m_data; 1.773 + unsigned m_size; 1.774 + unsigned m_index; 1.775 + unsigned m_parenthesesNestingDepth; 1.776 + 1.777 + // Derived by empirical testing of compile time in PCRE and WREC. 1.778 + static const unsigned MAX_PATTERN_SIZE = 1024 * 1024; 1.779 +}; 1.780 + 1.781 +/* 1.782 + * Yarr::parse(): 1.783 + * 1.784 + * The parse method is passed a pattern to be parsed and a delegate upon which 1.785 + * callbacks will be made to record the parsed tokens forming the regex. 1.786 + * Yarr::parse() returns null on success, or a const C string providing an error 1.787 + * message where a parse error occurs. 1.788 + * 1.789 + * The Delegate must implement the following interface: 1.790 + * 1.791 + * void assertionBOL(); 1.792 + * void assertionEOL(); 1.793 + * void assertionWordBoundary(bool invert); 1.794 + * 1.795 + * void atomPatternCharacter(UChar ch); 1.796 + * void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert); 1.797 + * void atomCharacterClassBegin(bool invert) 1.798 + * void atomCharacterClassAtom(UChar ch) 1.799 + * void atomCharacterClassRange(UChar begin, UChar end) 1.800 + * void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert) 1.801 + * void atomCharacterClassEnd() 1.802 + * void atomParenthesesSubpatternBegin(bool capture = true); 1.803 + * void atomParentheticalAssertionBegin(bool invert = false); 1.804 + * void atomParenthesesEnd(); 1.805 + * void atomBackReference(unsigned subpatternId); 1.806 + * 1.807 + * void quantifyAtom(unsigned min, unsigned max, bool greedy); 1.808 + * 1.809 + * void disjunction(); 1.810 + * 1.811 + * The regular expression is described by a sequence of assertion*() and atom*() 1.812 + * callbacks to the delegate, describing the terms in the regular expression. 1.813 + * Following an atom a quantifyAtom() call may occur to indicate that the previous 1.814 + * atom should be quantified. In the case of atoms described across multiple 1.815 + * calls (parentheses and character classes) the call to quantifyAtom() will come 1.816 + * after the call to the atom*End() method, never after atom*Begin(). 1.817 + * 1.818 + * Character classes may either be described by a single call to 1.819 + * atomBuiltInCharacterClass(), or by a sequence of atomCharacterClass*() calls. 1.820 + * In the latter case, ...Begin() will be called, followed by a sequence of 1.821 + * calls to ...Atom(), ...Range(), and ...BuiltIn(), followed by a call to ...End(). 1.822 + * 1.823 + * Sequences of atoms and assertions are broken into alternatives via calls to 1.824 + * disjunction(). Assertions, atoms, and disjunctions emitted between calls to 1.825 + * atomParenthesesBegin() and atomParenthesesEnd() form the body of a subpattern. 1.826 + * atomParenthesesBegin() is passed a subpatternId. In the case of a regular 1.827 + * capturing subpattern, this will be the subpatternId associated with these 1.828 + * parentheses, and will also by definition be the lowest subpatternId of these 1.829 + * parentheses and of any nested paretheses. The atomParenthesesEnd() method 1.830 + * is passed the subpatternId of the last capturing subexpression nested within 1.831 + * these paretheses. In the case of a capturing subpattern with no nested 1.832 + * capturing subpatterns, the same subpatternId will be passed to the begin and 1.833 + * end functions. In the case of non-capturing subpatterns the subpatternId 1.834 + * passed to the begin method is also the first possible subpatternId that might 1.835 + * be nested within these paretheses. If a set of non-capturing parentheses does 1.836 + * not contain any capturing subpatterns, then the subpatternId passed to begin 1.837 + * will be greater than the subpatternId passed to end. 1.838 + */ 1.839 + 1.840 +template<class Delegate> 1.841 +ErrorCode parse(Delegate& delegate, const String& pattern, unsigned backReferenceLimit = quantifyInfinite) 1.842 +{ 1.843 +#ifdef YARR_8BIT_CHAR_SUPPORT 1.844 + if (pattern.is8Bit()) 1.845 + return Parser<Delegate, LChar>(delegate, pattern, backReferenceLimit).parse(); 1.846 +#endif 1.847 + return Parser<Delegate, UChar>(delegate, pattern, backReferenceLimit).parse(); 1.848 +} 1.849 + 1.850 +} } // namespace JSC::Yarr 1.851 + 1.852 +#endif /* yarr_YarrParser_h */